xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision d6c19ee8504c74d8f30ec67127f069a58291b3ac)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D, int Q_1D>
20 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21   __syncthreads();
22   data.slice[data.t_id_x] = *U;
23   __syncthreads();
24   *V = 0.0;
25   if (data.t_id_x < Q_1D) {
26     for (CeedInt i = 0; i < P_1D; i++) {
27       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
28     }
29   }
30 }
31 
32 //------------------------------------------------------------------------------
33 // 1D transpose tensor contraction x
34 //------------------------------------------------------------------------------
35 template <int NUM_COMP, int P_1D, int Q_1D>
36 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37   __syncthreads();
38   data.slice[data.t_id_x] = *U;
39   __syncthreads();
40   *V = 0.0;
41   if (data.t_id_x < P_1D) {
42     for (CeedInt i = 0; i < Q_1D; i++) {
43       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
44     }
45   }
46 }
47 
48 //------------------------------------------------------------------------------
49 // 1D interpolate to quadrature points
50 //------------------------------------------------------------------------------
51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
52 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
53   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
55   }
56 }
57 
58 //------------------------------------------------------------------------------
59 // 1D interpolate transpose
60 //------------------------------------------------------------------------------
61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
62 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
63                                          CeedScalar *__restrict__ r_V) {
64   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
66   }
67 }
68 
69 //------------------------------------------------------------------------------
70 // 1D derivatives at quadrature points
71 //------------------------------------------------------------------------------
72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
73 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
74                               CeedScalar *__restrict__ r_V) {
75   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
77   }
78 }
79 
80 //------------------------------------------------------------------------------
81 // 1D derivatives transpose
82 //------------------------------------------------------------------------------
83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
84 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
85                                        CeedScalar *__restrict__ r_V) {
86   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
88   }
89 }
90 
91 //------------------------------------------------------------------------------
92 // 1D quadrature weights
93 //------------------------------------------------------------------------------
94 template <int P_1D, int Q_1D>
95 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
96   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
97 }
98 
99 //------------------------------------------------------------------------------
100 // 2D
101 //------------------------------------------------------------------------------
102 
103 //------------------------------------------------------------------------------
104 // 2D tensor contraction x
105 //------------------------------------------------------------------------------
106 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
107 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
108   __syncthreads();
109   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
110   __syncthreads();
111   *V = 0.0;
112   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
113     for (CeedInt i = 0; i < P_1D; i++) {
114       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
115     }
116   }
117 }
118 
119 //------------------------------------------------------------------------------
120 // 2D tensor contract y
121 //------------------------------------------------------------------------------
122 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
123 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
124   __syncthreads();
125   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
126   __syncthreads();
127   *V = 0.0;
128   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
129     for (CeedInt i = 0; i < P_1D; i++) {
130       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
131     }
132   }
133 }
134 
135 //------------------------------------------------------------------------------
136 // 2D transpose tensor contract y
137 //------------------------------------------------------------------------------
138 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
139 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
140   __syncthreads();
141   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
142   __syncthreads();
143   *V = 0.0;
144   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
145     for (CeedInt i = 0; i < Q_1D; i++) {
146       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
147     }
148   }
149 }
150 
151 //------------------------------------------------------------------------------
152 // 2D transpose tensor contract x
153 //------------------------------------------------------------------------------
154 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
155 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
156   __syncthreads();
157   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
158   __syncthreads();
159   *V = 0.0;
160   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
161     for (CeedInt i = 0; i < Q_1D; i++) {
162       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
163     }
164   }
165 }
166 
167 //------------------------------------------------------------------------------
168 // 2D transpose tensor contract and add x
169 //------------------------------------------------------------------------------
170 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
171 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
172   __syncthreads();
173   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
174   __syncthreads();
175   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
176     for (CeedInt i = 0; i < Q_1D; i++) {
177       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
178     }
179   }
180 }
181 
182 //------------------------------------------------------------------------------
183 // 2D interpolate to quadrature points
184 //------------------------------------------------------------------------------
185 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
186 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
187                                       CeedScalar *__restrict__ r_V) {
188   CeedScalar r_t[1];
189   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
190     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
191     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
192   }
193 }
194 
195 //------------------------------------------------------------------------------
196 // 2D interpolate transpose
197 //------------------------------------------------------------------------------
198 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
199 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
200                                                CeedScalar *__restrict__ r_V) {
201   CeedScalar r_t[1];
202   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
203     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
204     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
205   }
206 }
207 
208 //------------------------------------------------------------------------------
209 // 2D derivatives at quadrature points
210 //------------------------------------------------------------------------------
211 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
212 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
213                                     CeedScalar *__restrict__ r_V) {
214   CeedScalar r_t[1];
215   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
216     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
217     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
218     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
219     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
220   }
221 }
222 
223 //------------------------------------------------------------------------------
224 // 2D derivatives transpose
225 //------------------------------------------------------------------------------
226 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
227 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
228                                              CeedScalar *__restrict__ r_V) {
229   CeedScalar r_t[1];
230   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
231     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
232     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
233     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
234     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
235   }
236 }
237 
238 //------------------------------------------------------------------------------
239 // 2D quadrature weights
240 //------------------------------------------------------------------------------
241 template <int P_1D, int Q_1D>
242 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
243   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
244 }
245 
246 //------------------------------------------------------------------------------
247 // 3D
248 //------------------------------------------------------------------------------
249 
250 //------------------------------------------------------------------------------
251 // 3D tensor contract x
252 //------------------------------------------------------------------------------
253 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
254 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
255   CeedScalar r_B[P_1D];
256   for (CeedInt i = 0; i < P_1D; i++) {
257     r_B[i] = B[i + data.t_id_x * P_1D];
258   }
259 
260   for (CeedInt k = 0; k < P_1D; k++) {
261     __syncthreads();
262     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
263     __syncthreads();
264     V[k] = 0.0;
265     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
266       for (CeedInt i = 0; i < P_1D; i++) {
267         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
268       }
269     }
270   }
271 }
272 
273 //------------------------------------------------------------------------------
274 // 3D tensor contract y
275 //------------------------------------------------------------------------------
276 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
277 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
278   CeedScalar r_B[P_1D];
279   for (CeedInt i = 0; i < P_1D; i++) {
280     r_B[i] = B[i + data.t_id_y * P_1D];
281   }
282 
283   for (CeedInt k = 0; k < P_1D; k++) {
284     __syncthreads();
285     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
286     __syncthreads();
287     V[k] = 0.0;
288     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
289       for (CeedInt i = 0; i < P_1D; i++) {
290         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
291       }
292     }
293   }
294 }
295 
296 //------------------------------------------------------------------------------
297 // 3D tensor contract z
298 //------------------------------------------------------------------------------
299 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
300 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
301   for (CeedInt k = 0; k < Q_1D; k++) {
302     V[k] = 0.0;
303     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
304       for (CeedInt i = 0; i < P_1D; i++) {
305         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
306       }
307     }
308   }
309 }
310 
311 //------------------------------------------------------------------------------
312 // 3D transpose tensor contract z
313 //------------------------------------------------------------------------------
314 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
315 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
316   for (CeedInt k = 0; k < P_1D; k++) {
317     V[k] = 0.0;
318     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
319       for (CeedInt i = 0; i < Q_1D; i++) {
320         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
321       }
322     }
323   }
324 }
325 
326 //------------------------------------------------------------------------------
327 // 3D transpose tensor contract y
328 //------------------------------------------------------------------------------
329 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
330 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
331   CeedScalar r_B[Q_1D];
332   for (CeedInt i = 0; i < Q_1D; i++) {
333     r_B[i] = B[data.t_id_y + i * P_1D];
334   }
335 
336   for (CeedInt k = 0; k < P_1D; k++) {
337     __syncthreads();
338     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
339     __syncthreads();
340     V[k] = 0.0;
341     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
342       for (CeedInt i = 0; i < Q_1D; i++) {
343         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
344       }
345     }
346   }
347 }
348 
349 //------------------------------------------------------------------------------
350 // 3D transpose tensor contract y
351 //------------------------------------------------------------------------------
352 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
353 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
354   CeedScalar r_B[Q_1D];
355   for (CeedInt i = 0; i < Q_1D; i++) {
356     r_B[i] = B[data.t_id_y + i * P_1D];
357   }
358 
359   for (CeedInt k = 0; k < P_1D; k++) {
360     __syncthreads();
361     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
362     __syncthreads();
363     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
364       for (CeedInt i = 0; i < Q_1D; i++) {
365         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
366       }
367     }
368   }
369 }
370 
371 //------------------------------------------------------------------------------
372 // 3D transpose tensor contract x
373 //------------------------------------------------------------------------------
374 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
375 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
376   CeedScalar r_B[Q_1D];
377   for (CeedInt i = 0; i < Q_1D; i++) {
378     r_B[i] = B[data.t_id_x + i * P_1D];
379   }
380 
381   for (CeedInt k = 0; k < P_1D; k++) {
382     __syncthreads();
383     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
384     __syncthreads();
385     V[k] = 0.0;
386     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
387       for (CeedInt i = 0; i < Q_1D; i++) {
388         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
389       }
390     }
391   }
392 }
393 
394 //------------------------------------------------------------------------------
395 // 3D transpose tensor contract add x
396 //------------------------------------------------------------------------------
397 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
398 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
399   CeedScalar r_B[Q_1D];
400   for (CeedInt i = 0; i < Q_1D; i++) {
401     r_B[i] = B[data.t_id_x + i * P_1D];
402   }
403 
404   for (CeedInt k = 0; k < P_1D; k++) {
405     __syncthreads();
406     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
407     __syncthreads();
408     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
409       for (CeedInt i = 0; i < Q_1D; i++) {
410         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
411       }
412     }
413   }
414 }
415 
416 //------------------------------------------------------------------------------
417 // 3D interpolate to quadrature points
418 //------------------------------------------------------------------------------
419 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
420 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
421                                       CeedScalar *__restrict__ r_V) {
422   CeedScalar r_t1[T_1D];
423   CeedScalar r_t2[T_1D];
424   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
425     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
426     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
427     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
428   }
429 }
430 
431 //------------------------------------------------------------------------------
432 // 3D interpolate transpose
433 //------------------------------------------------------------------------------
434 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
435 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
436                                                CeedScalar *__restrict__ r_V) {
437   CeedScalar r_t1[T_1D];
438   CeedScalar r_t2[T_1D];
439   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
440     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
441     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
442     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
443   }
444 }
445 
446 //------------------------------------------------------------------------------
447 // 3D derivatives at quadrature points
448 //------------------------------------------------------------------------------
449 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
450 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
451                                     CeedScalar *__restrict__ r_V) {
452   CeedScalar r_t1[T_1D];
453   CeedScalar r_t2[T_1D];
454   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
455     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
456     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
457     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
458     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
459     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
460     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
461     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
462     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
463     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
464   }
465 }
466 
467 //------------------------------------------------------------------------------
468 // 3D derivatives transpose
469 //------------------------------------------------------------------------------
470 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
471 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
472                                              CeedScalar *__restrict__ r_V) {
473   CeedScalar r_t1[T_1D];
474   CeedScalar r_t2[T_1D];
475   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
476     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
477     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
478     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
479     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
480     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
481     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
482     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
483     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
484     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
485   }
486 }
487 
488 //------------------------------------------------------------------------------
489 // 3D derivatives at quadrature points
490 //------------------------------------------------------------------------------
491 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
492 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
493                                               CeedScalar *__restrict__ r_V) {
494   CeedScalar r_t1[T_1D];
495   CeedScalar r_t2[T_1D];
496   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
497     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
498     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
499     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
500     ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
501     ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
502     ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
503   }
504 }
505 
506 //------------------------------------------------------------------------------
507 // 3D derivatives transpose
508 //------------------------------------------------------------------------------
509 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
510 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
511                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
512   CeedScalar r_t1[T_1D];
513   CeedScalar r_t2[T_1D];
514   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
515     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
516     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
517     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
518     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
519     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
520     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
521   }
522 }
523 
524 //------------------------------------------------------------------------------
525 // 3D quadrature weights
526 //------------------------------------------------------------------------------
527 template <int P_1D, int Q_1D>
528 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
529   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
530   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
531   for (CeedInt q = 0; q < Q_1D; q++) {
532     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
533   }
534 }
535