xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision 02219a082eb38cf2d3edc97fbfe55fa395a4dc99)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D, int Q_1D>
20 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21   __syncthreads();
22   data.slice[data.t_id_x] = *U;
23   __syncthreads();
24   *V = 0.0;
25   if (data.t_id_x < Q_1D) {
26     for (CeedInt i = 0; i < P_1D; i++) {
27       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
28     }
29   }
30 }
31 
32 //------------------------------------------------------------------------------
33 // 1D transpose tensor contraction x
34 //------------------------------------------------------------------------------
35 template <int NUM_COMP, int P_1D, int Q_1D>
36 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37   __syncthreads();
38   data.slice[data.t_id_x] = *U;
39   __syncthreads();
40   *V = 0.0;
41   if (data.t_id_x < P_1D) {
42     for (CeedInt i = 0; i < Q_1D; i++) {
43       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
44     }
45   }
46 }
47 
48 //------------------------------------------------------------------------------
49 // 1D interpolate to quadrature points
50 //------------------------------------------------------------------------------
51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
52 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
53   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
55   }
56 }
57 
58 //------------------------------------------------------------------------------
59 // 1D interpolate transpose
60 //------------------------------------------------------------------------------
61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
62 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
63                                          CeedScalar *__restrict__ r_V) {
64   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
66   }
67 }
68 
69 //------------------------------------------------------------------------------
70 // 1D derivatives at quadrature points
71 //------------------------------------------------------------------------------
72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
73 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
74                               CeedScalar *__restrict__ r_V) {
75   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
77   }
78 }
79 
80 //------------------------------------------------------------------------------
81 // 1D derivatives transpose
82 //------------------------------------------------------------------------------
83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
84 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
85                                        CeedScalar *__restrict__ r_V) {
86   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
88   }
89 }
90 
91 //------------------------------------------------------------------------------
92 // 1D quadrature weights
93 //------------------------------------------------------------------------------
94 template <int P_1D, int Q_1D>
95 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
96   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
97 }
98 
99 //------------------------------------------------------------------------------
100 // 2D
101 //------------------------------------------------------------------------------
102 
103 //------------------------------------------------------------------------------
104 // 2D tensor contraction x
105 //------------------------------------------------------------------------------
106 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
107 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
108   __syncthreads();
109   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
110   __syncthreads();
111   *V = 0.0;
112   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
113     for (CeedInt i = 0; i < P_1D; i++) {
114       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
115     }
116   }
117 }
118 
119 //------------------------------------------------------------------------------
120 // 2D tensor contract y
121 //------------------------------------------------------------------------------
122 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
123 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
124   __syncthreads();
125   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
126   __syncthreads();
127   *V = 0.0;
128   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
129     for (CeedInt i = 0; i < P_1D; i++) {
130       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
131     }
132   }
133 }
134 
135 //------------------------------------------------------------------------------
136 // 2D transpose tensor contract y
137 //------------------------------------------------------------------------------
138 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
139 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
140   __syncthreads();
141   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
142   __syncthreads();
143   *V = 0.0;
144   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
145     for (CeedInt i = 0; i < Q_1D; i++) {
146       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
147     }
148   }
149 }
150 
151 //------------------------------------------------------------------------------
152 // 2D transpose tensor contract x
153 //------------------------------------------------------------------------------
154 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
155 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
156   __syncthreads();
157   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
158   __syncthreads();
159   *V = 0.0;
160   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
161     for (CeedInt i = 0; i < Q_1D; i++) {
162       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
163     }
164   }
165 }
166 
167 //------------------------------------------------------------------------------
168 // 2D transpose tensor contract and add x
169 //------------------------------------------------------------------------------
170 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
171 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
172   __syncthreads();
173   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
174   __syncthreads();
175   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
176     for (CeedInt i = 0; i < Q_1D; i++) {
177       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
178     }
179   }
180 }
181 
182 //------------------------------------------------------------------------------
183 // 2D interpolate to quadrature points
184 //------------------------------------------------------------------------------
185 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
186 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
187                                       CeedScalar *__restrict__ r_V) {
188   CeedScalar r_t[1];
189   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
190     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
191     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
192   }
193 }
194 
195 //------------------------------------------------------------------------------
196 // 2D interpolate transpose
197 //------------------------------------------------------------------------------
198 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
199 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
200                                                CeedScalar *__restrict__ r_V) {
201   CeedScalar r_t[1];
202   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
203     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
204     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
205   }
206 }
207 
208 //------------------------------------------------------------------------------
209 // 2D derivatives at quadrature points
210 //------------------------------------------------------------------------------
211 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
212 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
213                                     CeedScalar *__restrict__ r_V) {
214   CeedScalar r_t[1];
215   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
216     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
217     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
218     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
219     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
220   }
221 }
222 
223 //------------------------------------------------------------------------------
224 // 2D derivatives transpose
225 //------------------------------------------------------------------------------
226 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
227 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
228                                              CeedScalar *__restrict__ r_V) {
229   CeedScalar r_t[1];
230   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
231     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
232     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
233     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
234     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
235   }
236 }
237 
238 //------------------------------------------------------------------------------
239 // 2D derivatives at quadrature points, nodes and quadrature points collocated
240 //------------------------------------------------------------------------------
241 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
242 inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
243                                                    CeedScalar *__restrict__ r_V) {
244   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
245     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
246     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
247   }
248 }
249 
250 //------------------------------------------------------------------------------
251 // 2D derivatives transpose, nodes and quadrature points collocated
252 //------------------------------------------------------------------------------
253 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
254 inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
255                                                             CeedScalar *__restrict__ r_V) {
256   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
257     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
258     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
259   }
260 }
261 
262 //------------------------------------------------------------------------------
263 // 2D quadrature weights
264 //------------------------------------------------------------------------------
265 template <int P_1D, int Q_1D>
266 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
267   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
268 }
269 
270 //------------------------------------------------------------------------------
271 // 3D
272 //------------------------------------------------------------------------------
273 
274 //------------------------------------------------------------------------------
275 // 3D tensor contract x
276 //------------------------------------------------------------------------------
277 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
278 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
279   CeedScalar r_B[P_1D];
280   for (CeedInt i = 0; i < P_1D; i++) {
281     r_B[i] = B[i + data.t_id_x * P_1D];
282   }
283 
284   for (CeedInt k = 0; k < P_1D; k++) {
285     __syncthreads();
286     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
287     __syncthreads();
288     V[k] = 0.0;
289     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
290       for (CeedInt i = 0; i < P_1D; i++) {
291         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
292       }
293     }
294   }
295 }
296 
297 //------------------------------------------------------------------------------
298 // 3D tensor contract y
299 //------------------------------------------------------------------------------
300 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
301 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
302   CeedScalar r_B[P_1D];
303   for (CeedInt i = 0; i < P_1D; i++) {
304     r_B[i] = B[i + data.t_id_y * P_1D];
305   }
306 
307   for (CeedInt k = 0; k < P_1D; k++) {
308     __syncthreads();
309     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
310     __syncthreads();
311     V[k] = 0.0;
312     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
313       for (CeedInt i = 0; i < P_1D; i++) {
314         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
315       }
316     }
317   }
318 }
319 
320 //------------------------------------------------------------------------------
321 // 3D tensor contract z
322 //------------------------------------------------------------------------------
323 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
324 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
325   for (CeedInt k = 0; k < Q_1D; k++) {
326     V[k] = 0.0;
327     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
328       for (CeedInt i = 0; i < P_1D; i++) {
329         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
330       }
331     }
332   }
333 }
334 
335 //------------------------------------------------------------------------------
336 // 3D transpose tensor contract z
337 //------------------------------------------------------------------------------
338 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
339 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
340   for (CeedInt k = 0; k < P_1D; k++) {
341     V[k] = 0.0;
342     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
343       for (CeedInt i = 0; i < Q_1D; i++) {
344         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
345       }
346     }
347   }
348 }
349 
350 //------------------------------------------------------------------------------
351 // 3D transpose tensor contract y
352 //------------------------------------------------------------------------------
353 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
354 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
355   CeedScalar r_B[Q_1D];
356   for (CeedInt i = 0; i < Q_1D; i++) {
357     r_B[i] = B[data.t_id_y + i * P_1D];
358   }
359 
360   for (CeedInt k = 0; k < P_1D; k++) {
361     __syncthreads();
362     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
363     __syncthreads();
364     V[k] = 0.0;
365     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
366       for (CeedInt i = 0; i < Q_1D; i++) {
367         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
368       }
369     }
370   }
371 }
372 
373 //------------------------------------------------------------------------------
374 // 3D transpose tensor contract y
375 //------------------------------------------------------------------------------
376 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
377 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
378   CeedScalar r_B[Q_1D];
379   for (CeedInt i = 0; i < Q_1D; i++) {
380     r_B[i] = B[data.t_id_y + i * P_1D];
381   }
382 
383   for (CeedInt k = 0; k < P_1D; k++) {
384     __syncthreads();
385     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
386     __syncthreads();
387     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
388       for (CeedInt i = 0; i < Q_1D; i++) {
389         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
390       }
391     }
392   }
393 }
394 
395 //------------------------------------------------------------------------------
396 // 3D transpose tensor contract x
397 //------------------------------------------------------------------------------
398 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
399 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
400   CeedScalar r_B[Q_1D];
401   for (CeedInt i = 0; i < Q_1D; i++) {
402     r_B[i] = B[data.t_id_x + i * P_1D];
403   }
404 
405   for (CeedInt k = 0; k < P_1D; k++) {
406     __syncthreads();
407     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
408     __syncthreads();
409     V[k] = 0.0;
410     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
411       for (CeedInt i = 0; i < Q_1D; i++) {
412         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
413       }
414     }
415   }
416 }
417 
418 //------------------------------------------------------------------------------
419 // 3D transpose tensor contract add x
420 //------------------------------------------------------------------------------
421 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
422 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
423   CeedScalar r_B[Q_1D];
424   for (CeedInt i = 0; i < Q_1D; i++) {
425     r_B[i] = B[data.t_id_x + i * P_1D];
426   }
427 
428   for (CeedInt k = 0; k < P_1D; k++) {
429     __syncthreads();
430     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
431     __syncthreads();
432     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
433       for (CeedInt i = 0; i < Q_1D; i++) {
434         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
435       }
436     }
437   }
438 }
439 
440 //------------------------------------------------------------------------------
441 // 3D interpolate to quadrature points
442 //------------------------------------------------------------------------------
443 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
444 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
445                                       CeedScalar *__restrict__ r_V) {
446   CeedScalar r_t1[T_1D];
447   CeedScalar r_t2[T_1D];
448   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
449     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
450     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
451     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
452   }
453 }
454 
455 //------------------------------------------------------------------------------
456 // 3D interpolate transpose
457 //------------------------------------------------------------------------------
458 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
459 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
460                                                CeedScalar *__restrict__ r_V) {
461   CeedScalar r_t1[T_1D];
462   CeedScalar r_t2[T_1D];
463   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
464     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
465     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
466     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
467   }
468 }
469 
470 //------------------------------------------------------------------------------
471 // 3D derivatives at quadrature points
472 //------------------------------------------------------------------------------
473 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
474 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
475                                     CeedScalar *__restrict__ r_V) {
476   CeedScalar r_t1[T_1D];
477   CeedScalar r_t2[T_1D];
478   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
479     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
480     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
481     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
482     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
483     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
484     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
485     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
486     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
487     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
488   }
489 }
490 
491 //------------------------------------------------------------------------------
492 // 3D derivatives transpose
493 //------------------------------------------------------------------------------
494 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
495 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
496                                              CeedScalar *__restrict__ r_V) {
497   CeedScalar r_t1[T_1D];
498   CeedScalar r_t2[T_1D];
499   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
500     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
501     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
502     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
503     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
504     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
505     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
506     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
507     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
508     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
509   }
510 }
511 
512 //------------------------------------------------------------------------------
513 // 3D derivatives at quadrature points
514 //------------------------------------------------------------------------------
515 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
516 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
517                                               CeedScalar *__restrict__ r_V) {
518   CeedScalar r_t1[T_1D];
519   CeedScalar r_t2[T_1D];
520   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
521     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
522     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
523     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
524     ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
525     ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
526     ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
527   }
528 }
529 
530 //------------------------------------------------------------------------------
531 // 3D derivatives transpose
532 //------------------------------------------------------------------------------
533 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
534 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
535                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
536   CeedScalar r_t1[T_1D];
537   CeedScalar r_t2[T_1D];
538   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
539     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
540     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
541     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
542     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
543     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
544     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
545   }
546 }
547 
548 //------------------------------------------------------------------------------
549 // 3D derivatives at quadrature points, nodes and quadrature points collocated
550 //------------------------------------------------------------------------------
551 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
552 inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
553                                                    CeedScalar *__restrict__ r_V) {
554   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
555     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
556     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
557     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
558   }
559 }
560 
561 //------------------------------------------------------------------------------
562 // 3D derivatives transpose, nodes and quadrature points collocated
563 //------------------------------------------------------------------------------
564 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
565 inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
566                                                             CeedScalar *__restrict__ r_V) {
567   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
568     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
569     ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
570     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
571   }
572 }
573 
574 //------------------------------------------------------------------------------
575 // 3D quadrature weights
576 //------------------------------------------------------------------------------
577 template <int P_1D, int Q_1D>
578 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
579   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
580   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
581   for (CeedInt q = 0; q < Q_1D; q++) {
582     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
583   }
584 }
585