xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision 343e3094792a64f9c2da70ef2256f98e7dc173cf)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D, int Q_1D>
20 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21   data.slice[data.t_id_x] = *U;
22   __syncthreads();
23   *V = 0.0;
24   if (data.t_id_x < Q_1D) {
25     for (CeedInt i = 0; i < P_1D; i++) {
26       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
27     }
28   }
29   __syncthreads();
30 }
31 
32 //------------------------------------------------------------------------------
33 // 1D transpose tensor contraction x
34 //------------------------------------------------------------------------------
35 template <int NUM_COMP, int P_1D, int Q_1D>
36 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37   data.slice[data.t_id_x] = *U;
38   __syncthreads();
39   *V = 0.0;
40   if (data.t_id_x < P_1D) {
41     for (CeedInt i = 0; i < Q_1D; i++) {
42       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
43     }
44   }
45   __syncthreads();
46 }
47 
48 //------------------------------------------------------------------------------
49 // 1D interpolate to quadrature points
50 //------------------------------------------------------------------------------
51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
52 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
53   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
55   }
56 }
57 
58 //------------------------------------------------------------------------------
59 // 1D interpolate transpose
60 //------------------------------------------------------------------------------
61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
62 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
63                                          CeedScalar *__restrict__ r_V) {
64   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
66   }
67 }
68 
69 //------------------------------------------------------------------------------
70 // 1D derivatives at quadrature points
71 //------------------------------------------------------------------------------
72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
73 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
74                               CeedScalar *__restrict__ r_V) {
75   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
77   }
78 }
79 
80 //------------------------------------------------------------------------------
81 // 1D derivatives transpose
82 //------------------------------------------------------------------------------
83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
84 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
85                                        CeedScalar *__restrict__ r_V) {
86   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
88   }
89 }
90 
91 //------------------------------------------------------------------------------
92 // 1D quadrature weights
93 //------------------------------------------------------------------------------
94 template <int P_1D, int Q_1D>
95 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
96   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
97 }
98 
99 //------------------------------------------------------------------------------
100 // 2D
101 //------------------------------------------------------------------------------
102 
103 //------------------------------------------------------------------------------
104 // 2D tensor contraction x
105 //------------------------------------------------------------------------------
106 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
107 inline __device__ void ContractX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
108                                    CeedScalar *V) {
109   data.slice[t_id_x + t_id_y * T_1D] = *U;
110   __syncthreads();
111   *V = 0.0;
112   if (t_id_x < Q_1D && t_id_y < P_1D) {
113     for (CeedInt i = 0; i < P_1D; i++) {
114       *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
115     }
116   }
117   __syncthreads();
118 }
119 
120 //------------------------------------------------------------------------------
121 // 2D tensor contract y
122 //------------------------------------------------------------------------------
123 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
124 inline __device__ void ContractY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
125                                    CeedScalar *V) {
126   data.slice[t_id_x + t_id_y * T_1D] = *U;
127   __syncthreads();
128   *V = 0.0;
129   if (t_id_x < Q_1D && t_id_y < Q_1D) {
130     for (CeedInt i = 0; i < P_1D; i++) {
131       *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
132     }
133   }
134   __syncthreads();
135 }
136 
137 //------------------------------------------------------------------------------
138 // 2D transpose tensor contract y
139 //------------------------------------------------------------------------------
140 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
141 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
142                                             CeedScalar *V) {
143   data.slice[t_id_x + t_id_y * T_1D] = *U;
144   __syncthreads();
145   *V = 0.0;
146   if (t_id_x < Q_1D && t_id_y < P_1D) {
147     for (CeedInt i = 0; i < Q_1D; i++) {
148       *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
149     }
150   }
151   __syncthreads();
152 }
153 
154 //------------------------------------------------------------------------------
155 // 2D transpose tensor contract x
156 //------------------------------------------------------------------------------
157 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
158 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
159                                             CeedScalar *V) {
160   data.slice[t_id_x + t_id_y * T_1D] = *U;
161   __syncthreads();
162   *V = 0.0;
163   if (t_id_x < P_1D && t_id_y < P_1D) {
164     for (CeedInt i = 0; i < Q_1D; i++) {
165       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
166     }
167   }
168   __syncthreads();
169 }
170 
171 //------------------------------------------------------------------------------
172 // 2D transpose tensor contract and add x
173 //------------------------------------------------------------------------------
174 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
175 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
176                                                CeedScalar *V) {
177   data.slice[t_id_x + t_id_y * T_1D] = *U;
178   __syncthreads();
179   if (t_id_x < P_1D && t_id_y < P_1D) {
180     for (CeedInt i = 0; i < Q_1D; i++) {
181       *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
182     }
183   }
184   __syncthreads();
185 }
186 
187 //------------------------------------------------------------------------------
188 // 2D interpolate to quadrature points
189 //------------------------------------------------------------------------------
190 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
191 inline __device__ void InterpTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
192                                            const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
193   CeedScalar r_t[1];
194   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
195     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
196     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
197   }
198 }
199 
200 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
201 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
202                                       CeedScalar *__restrict__ r_V) {
203   InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
204 }
205 
206 //------------------------------------------------------------------------------
207 // 2D interpolate transpose
208 //------------------------------------------------------------------------------
209 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
210 inline __device__ void InterpTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
211                                                     const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
212   CeedScalar r_t[1];
213   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
214     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
215     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
216   }
217 }
218 
219 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
220 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
221                                                CeedScalar *__restrict__ r_V) {
222   InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
223 }
224 
225 //------------------------------------------------------------------------------
226 // 2D derivatives at quadrature points
227 //------------------------------------------------------------------------------
228 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
229 inline __device__ void GradTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
230                                          const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
231   CeedScalar r_t[1];
232   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
233     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
234     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
235     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
236     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
237   }
238 }
239 
240 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
241 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
242                                     CeedScalar *__restrict__ r_V) {
243   GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
244 }
245 
246 //------------------------------------------------------------------------------
247 // 2D derivatives transpose
248 //------------------------------------------------------------------------------
249 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
250 inline __device__ void GradTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
251                                                   const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
252   CeedScalar r_t[1];
253   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
254     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
255     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
256     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
257     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
258   }
259 }
260 
261 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
262 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
263                                              CeedScalar *__restrict__ r_V) {
264   GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
265 }
266 
267 //------------------------------------------------------------------------------
268 // 2D quadrature weights
269 //------------------------------------------------------------------------------
270 template <int Q_1D>
271 inline __device__ void WeightTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
272                                            CeedScalar *w) {
273   *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
274 }
275 
276 template <int P_1D, int Q_1D>
277 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
278   WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
279 }
280 
281 //------------------------------------------------------------------------------
282 // 3D
283 //------------------------------------------------------------------------------
284 
285 //------------------------------------------------------------------------------
286 // 3D tensor contract x
287 //------------------------------------------------------------------------------
288 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
289 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
290   CeedScalar r_B[P_1D];
291   for (CeedInt i = 0; i < P_1D; i++) {
292     r_B[i] = B[i + data.t_id_x * P_1D];
293   }
294 
295   for (CeedInt k = 0; k < P_1D; k++) {
296     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
297     __syncthreads();
298     V[k] = 0.0;
299     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
300       for (CeedInt i = 0; i < P_1D; i++) {
301         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
302       }
303     }
304     __syncthreads();
305   }
306 }
307 
308 //------------------------------------------------------------------------------
309 // 3D tensor contract y
310 //------------------------------------------------------------------------------
311 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
312 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
313   CeedScalar r_B[P_1D];
314   for (CeedInt i = 0; i < P_1D; i++) {
315     r_B[i] = B[i + data.t_id_y * P_1D];
316   }
317 
318   for (CeedInt k = 0; k < P_1D; k++) {
319     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
320     __syncthreads();
321     V[k] = 0.0;
322     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
323       for (CeedInt i = 0; i < P_1D; i++) {
324         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
325       }
326     }
327     __syncthreads();
328   }
329 }
330 
331 //------------------------------------------------------------------------------
332 // 3D tensor contract z
333 //------------------------------------------------------------------------------
334 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
335 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
336   for (CeedInt k = 0; k < Q_1D; k++) {
337     V[k] = 0.0;
338     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
339       for (CeedInt i = 0; i < P_1D; i++) {
340         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
341       }
342     }
343   }
344 }
345 
346 //------------------------------------------------------------------------------
347 // 3D transpose tensor contract z
348 //------------------------------------------------------------------------------
349 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
350 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
351   for (CeedInt k = 0; k < P_1D; k++) {
352     V[k] = 0.0;
353     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
354       for (CeedInt i = 0; i < Q_1D; i++) {
355         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
356       }
357     }
358   }
359 }
360 
361 //------------------------------------------------------------------------------
362 // 3D transpose tensor contract y
363 //------------------------------------------------------------------------------
364 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
365 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
366   CeedScalar r_B[Q_1D];
367   for (CeedInt i = 0; i < Q_1D; i++) {
368     r_B[i] = B[data.t_id_y + i * P_1D];
369   }
370 
371   for (CeedInt k = 0; k < P_1D; k++) {
372     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
373     __syncthreads();
374     V[k] = 0.0;
375     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
376       for (CeedInt i = 0; i < Q_1D; i++) {
377         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
378       }
379     }
380     __syncthreads();
381   }
382 }
383 
384 //------------------------------------------------------------------------------
385 // 3D transpose tensor contract y
386 //------------------------------------------------------------------------------
387 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
388 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
389   CeedScalar r_B[Q_1D];
390   for (CeedInt i = 0; i < Q_1D; i++) {
391     r_B[i] = B[data.t_id_y + i * P_1D];
392   }
393 
394   for (CeedInt k = 0; k < P_1D; k++) {
395     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
396     __syncthreads();
397     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
398       for (CeedInt i = 0; i < Q_1D; i++) {
399         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
400       }
401     }
402     __syncthreads();
403   }
404 }
405 
406 //------------------------------------------------------------------------------
407 // 3D transpose tensor contract x
408 //------------------------------------------------------------------------------
409 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
410 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
411   CeedScalar r_B[Q_1D];
412   for (CeedInt i = 0; i < Q_1D; i++) {
413     r_B[i] = B[data.t_id_x + i * P_1D];
414   }
415 
416   for (CeedInt k = 0; k < P_1D; k++) {
417     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
418     __syncthreads();
419     V[k] = 0.0;
420     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
421       for (CeedInt i = 0; i < Q_1D; i++) {
422         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
423       }
424     }
425     __syncthreads();
426   }
427 }
428 
429 //------------------------------------------------------------------------------
430 // 3D transpose tensor contract add x
431 //------------------------------------------------------------------------------
432 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
433 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
434   CeedScalar r_B[Q_1D];
435   for (CeedInt i = 0; i < Q_1D; i++) {
436     r_B[i] = B[data.t_id_x + i * P_1D];
437   }
438 
439   for (CeedInt k = 0; k < P_1D; k++) {
440     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
441     __syncthreads();
442     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
443       for (CeedInt i = 0; i < Q_1D; i++) {
444         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
445       }
446     }
447     __syncthreads();
448   }
449 }
450 
451 //------------------------------------------------------------------------------
452 // 3D interpolate to quadrature points
453 //------------------------------------------------------------------------------
454 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
455 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
456                                       CeedScalar *__restrict__ r_V) {
457   CeedScalar r_t1[T_1D];
458   CeedScalar r_t2[T_1D];
459   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
460     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
461     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
462     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
463   }
464 }
465 
466 //------------------------------------------------------------------------------
467 // 3D interpolate transpose
468 //------------------------------------------------------------------------------
469 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
470 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
471                                                CeedScalar *__restrict__ r_V) {
472   CeedScalar r_t1[T_1D];
473   CeedScalar r_t2[T_1D];
474   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
475     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
476     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
477     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
478   }
479 }
480 
481 //------------------------------------------------------------------------------
482 // 3D derivatives at quadrature points
483 //------------------------------------------------------------------------------
484 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
485 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
486                                     CeedScalar *__restrict__ r_V) {
487   CeedScalar r_t1[T_1D];
488   CeedScalar r_t2[T_1D];
489   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
490     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
491     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
492     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
493     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
494     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
495     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
496     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
497     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
498     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
499   }
500 }
501 
502 //------------------------------------------------------------------------------
503 // 3D derivatives transpose
504 //------------------------------------------------------------------------------
505 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
506 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
507                                              CeedScalar *__restrict__ r_V) {
508   CeedScalar r_t1[T_1D];
509   CeedScalar r_t2[T_1D];
510   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
511     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
512     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
513     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
514     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
515     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
516     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
517     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
518     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
519     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
520   }
521 }
522 
523 //------------------------------------------------------------------------------
524 // 3D derivatives at quadrature points
525 //------------------------------------------------------------------------------
526 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
527 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
528                                               CeedScalar *__restrict__ r_V) {
529   CeedScalar r_t1[T_1D];
530   CeedScalar r_t2[T_1D];
531   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
532     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
533     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
534     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
535     ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
536     ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
537     ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
538   }
539 }
540 
541 //------------------------------------------------------------------------------
542 // 3D derivatives transpose
543 //------------------------------------------------------------------------------
544 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
545 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
546                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
547   CeedScalar r_t1[T_1D];
548   CeedScalar r_t2[T_1D];
549   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
550     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
551     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
552     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
553     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
554     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
555     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
556   }
557 }
558 
559 //------------------------------------------------------------------------------
560 // 3D quadrature weights
561 //------------------------------------------------------------------------------
562 template <int P_1D, int Q_1D>
563 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
564   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
565   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
566   for (CeedInt q = 0; q < Q_1D; q++) {
567     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
568   }
569 }
570