xref: /libCEED/rust/libceed-sys/c-src/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision 412e56839ae1393ae29619b6fd39ea10cdd89adf)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's nodal value in shared memory so every thread can read the full x-row
  data.slice[data.t_id_x] = *U;
  __syncthreads();
  // Accumulate in a register; threads outside the quadrature range produce zero
  CeedScalar sum = 0.0;
  if (data.t_id_x < Q_1D) {
    for (CeedInt p = 0; p < P_1D; p++) {
      sum += B[p + data.t_id_x * P_1D] * data.slice[p];  // Contract x direction
    }
  }
  *V = sum;
  __syncthreads();
}
31 
32 //------------------------------------------------------------------------------
33 // 1D transpose tensor contraction x
34 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's quadrature value in shared memory so every thread can read all Q_1D values
  data.slice[data.t_id_x] = *U;
  __syncthreads();
  // Accumulate in a register; threads outside the nodal range produce zero
  CeedScalar sum = 0.0;
  if (data.t_id_x < P_1D) {
    for (CeedInt q = 0; q < Q_1D; q++) {
      sum += B[data.t_id_x + q * P_1D] * data.slice[q];  // Contract x direction
    }
  }
  *V = sum;
  __syncthreads();
}
47 
48 //------------------------------------------------------------------------------
49 // 1D interpolate to quadrature points
50 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Apply the 1D interpolation matrix B independently to each field component
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + c, c_B, r_V + c);
  }
}
57 
58 //------------------------------------------------------------------------------
59 // 1D interpolate transpose
60 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  // Apply the transpose of the 1D interpolation matrix B to each field component
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + c, c_B, r_V + c);
  }
}
68 
69 //------------------------------------------------------------------------------
70 // 1D derivatives at quadrature points
71 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  // In 1D only the derivative matrix G is applied; c_B is kept for interface parity with higher dimensions
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + c, c_G, r_V + c);
  }
}
79 
80 //------------------------------------------------------------------------------
81 // 1D derivatives transpose
82 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  // Transpose of Grad1d: only G^T is applied; c_B is kept for interface parity with higher dimensions
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + c, c_G, r_V + c);
  }
}
90 
91 //------------------------------------------------------------------------------
92 // 1D quadrature weights
93 //------------------------------------------------------------------------------
template <int P_1D, int Q_1D>
inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Each thread owns one quadrature point; threads beyond the range get a zero weight
  if (data.t_id_x < Q_1D) {
    *w = q_weight_1d[data.t_id_x];
  } else {
    *w = 0.0;
  }
}
98 
99 //------------------------------------------------------------------------------
100 // 2D
101 //------------------------------------------------------------------------------
102 
103 //------------------------------------------------------------------------------
104 // 2D tensor contraction x
105 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  // Stage U in shared memory so each thread can read its full x-row
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt p = 0; p < P_1D; p++) {
      sum += B[p + t_id_x * P_1D] * data.slice[p + t_id_y * T_1D];  // Contract x direction
    }
  }
  *V = sum;
  __syncthreads();
}
119 
120 //------------------------------------------------------------------------------
121 // 2D tensor contract y
122 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  // Stage U in shared memory so each thread can read its full y-column
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    for (CeedInt p = 0; p < P_1D; p++) {
      sum += B[p + t_id_y * P_1D] * data.slice[t_id_x + p * T_1D];  // Contract y direction
    }
  }
  *V = sum;
  __syncthreads();
}
136 
137 //------------------------------------------------------------------------------
138 // 2D transpose tensor contract y
139 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  // Stage U in shared memory so each thread can read its full y-column of quadrature values
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt q = 0; q < Q_1D; q++) {
      sum += B[t_id_y + q * P_1D] * data.slice[t_id_x + q * T_1D];  // Contract y direction
    }
  }
  *V = sum;
  __syncthreads();
}
153 
154 //------------------------------------------------------------------------------
155 // 2D transpose tensor contract x
156 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  // Stage U in shared memory so each thread can read its full x-row of quadrature values
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt q = 0; q < Q_1D; q++) {
      sum += B[t_id_x + q * P_1D] * data.slice[q + t_id_y * T_1D];  // Contract x direction
    }
  }
  *V = sum;
  __syncthreads();
}
170 
171 //------------------------------------------------------------------------------
172 // 2D transpose tensor contract and add x
173 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                               CeedScalar *V) {
  // Same contraction as ContractTransposeX2d, but accumulates into V instead of overwriting it
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D) {
    CeedScalar sum = 0.0;
    for (CeedInt q = 0; q < Q_1D; q++) {
      sum += B[t_id_x + q * P_1D] * data.slice[q + t_id_y * T_1D];  // Contract x direction
    }
    *V += sum;
  }
  __syncthreads();
}
186 
187 //------------------------------------------------------------------------------
188 // 2D interpolate to quadrature points
189 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                           const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Scratch register holding the intermediate after the x contraction
  CeedScalar r_tmp[1];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Apply B along x, then along y, one component at a time
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U + c, c_B, r_tmp);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_tmp, c_B, r_V + c);
  }
}
199 
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                      CeedScalar *__restrict__ r_V) {
  // Threads are laid out as a 2D block, so the thread ids map directly to tensor indices
  const int t_x = data.t_id_x;
  const int t_y = data.t_id_y;

  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_x, t_y, r_U, c_B, r_V);
}
205 
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Decompose the flattened 1D thread index into a 2D (x, y) pair with rows of
  // max(P_1D, Q_1D) threads so both the nodal (P) and quadrature (Q) grids are covered.
  // Bug fix: this must be the MAXIMUM of P_1D and Q_1D — using the minimum would never
  // produce x indices in [min, max), leaving part of the contraction grid unvisited
  // whenever P_1D != Q_1D.
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
}
213 
214 //------------------------------------------------------------------------------
215 // 2D interpolate transpose
216 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                                    const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Scratch register holding the intermediate after the y contraction
  CeedScalar r_tmp[1];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Apply B^T along y, then along x, one component at a time
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U + c, c_B, r_tmp);
    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_tmp, c_B, r_V + c);
  }
}
226 
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Threads are laid out as a 2D block, so the thread ids map directly to tensor indices
  const int t_x = data.t_id_x;
  const int t_y = data.t_id_y;

  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_x, t_y, r_U, c_B, r_V);
}
232 
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  // Decompose the flattened 1D thread index into a 2D (x, y) pair with rows of
  // max(P_1D, Q_1D) threads so both the nodal (P) and quadrature (Q) grids are covered.
  // Bug fix: this must be the MAXIMUM of P_1D and Q_1D — the minimum would leave x
  // indices in [min, max) unvisited whenever P_1D != Q_1D.
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
}
240 
241 //------------------------------------------------------------------------------
242 // 2D derivatives at quadrature points
243 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                         const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Scratch register holding the intermediate after the x contraction
  CeedScalar r_tmp[1];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // d/dx: differentiate along x (G), interpolate along y (B)
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U + c, c_G, r_tmp);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_tmp, c_B, r_V + c);
    // d/dy: interpolate along x (B), differentiate along y (G)
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U + c, c_B, r_tmp);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_tmp, c_G, r_V + c + NUM_COMP);
  }
}
255 
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                    CeedScalar *__restrict__ r_V) {
  // Threads are laid out as a 2D block, so the thread ids map directly to tensor indices
  const int t_x = data.t_id_x;
  const int t_y = data.t_id_y;

  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_x, t_y, r_U, c_B, c_G, r_V);
}
261 
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Decompose the flattened 1D thread index into a 2D (x, y) pair with rows of
  // max(P_1D, Q_1D) threads so both the nodal (P) and quadrature (Q) grids are covered.
  // Bug fix: this must be the MAXIMUM of P_1D and Q_1D — the minimum would leave x
  // indices in [min, max) unvisited whenever P_1D != Q_1D.
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
}
269 
270 //------------------------------------------------------------------------------
271 // 2D derivatives transpose
272 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                                  const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Scratch register holding the intermediate after the y contraction
  CeedScalar r_tmp[1];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // x-derivative input: B^T along y, then G^T along x (initializes r_V)
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U + c, c_B, r_tmp);
    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_tmp, c_G, r_V + c);
    // y-derivative input: G^T along y, then B^T along x (accumulates into r_V)
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U + c + NUM_COMP, c_G, r_tmp);
    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_tmp, c_B, r_V + c);
  }
}
284 
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Threads are laid out as a 2D block, so the thread ids map directly to tensor indices
  const int t_x = data.t_id_x;
  const int t_y = data.t_id_y;

  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_x, t_y, r_U, c_B, c_G, r_V);
}
290 
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Decompose the flattened 1D thread index into a 2D (x, y) pair with rows of
  // max(P_1D, Q_1D) threads so both the nodal (P) and quadrature (Q) grids are covered.
  // Bug fix: this must be the MAXIMUM of P_1D and Q_1D — the minimum would leave x
  // indices in [min, max) unvisited whenever P_1D != Q_1D.
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
}
298 
299 //------------------------------------------------------------------------------
300 // 2D quadrature weights
301 //------------------------------------------------------------------------------
template <int Q_1D>
inline __device__ void WeightTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
                                           CeedScalar *w) {
  // Tensor-product weight: product of the two 1D weights; zero off the quadrature grid
  const bool in_bounds = t_id_x < Q_1D && t_id_y < Q_1D;

  *w = in_bounds ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
}
307 
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Threads are laid out as a 2D block, so the thread ids map directly to quadrature indices
  const int t_x = data.t_id_x;
  const int t_y = data.t_id_y;

  WeightTensor2d_Core<Q_1D>(data, t_x, t_y, q_weight_1d, w);
}
312 
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Decompose the flattened 1D thread index into a 2D (x, y) pair with rows of
  // max(P_1D, Q_1D) threads, matching the layout used by the other flattened kernels.
  // Bug fix: this must be the MAXIMUM of P_1D and Q_1D — the minimum would map the
  // thread indices onto the wrong (x, y) pairs whenever P_1D != Q_1D.
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, q_weight_1d, w);
}
319 
320 //------------------------------------------------------------------------------
321 // 3D
322 //------------------------------------------------------------------------------
323 
324 //------------------------------------------------------------------------------
325 // 3D tensor contract x
326 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's row of B in registers; it is reused for every z-layer
  CeedScalar r_B[P_1D];
  for (CeedInt p = 0; p < P_1D; p++) {
    r_B[p] = B[p + data.t_id_x * P_1D];
  }

  // Process one z-layer at a time: stage U[k] in shared memory, then contract x
  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    CeedScalar sum = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt p = 0; p < P_1D; p++) {
        sum += r_B[p] * data.slice[p + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    V[k] = sum;
    __syncthreads();
  }
}
346 
347 //------------------------------------------------------------------------------
348 // 3D tensor contract y
349 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's row of B in registers; it is reused for every z-layer
  CeedScalar r_B[P_1D];
  for (CeedInt p = 0; p < P_1D; p++) {
    r_B[p] = B[p + data.t_id_y * P_1D];
  }

  // Process one z-layer at a time: stage U[k] in shared memory, then contract y
  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    CeedScalar sum = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt p = 0; p < P_1D; p++) {
        sum += r_B[p] * data.slice[data.t_id_x + p * T_1D];  // Contract y direction
      }
    }
    V[k] = sum;
    __syncthreads();
  }
}
369 
370 //------------------------------------------------------------------------------
371 // 3D tensor contract z
372 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // All P_1D z-values already live in this thread's registers, so no shared-memory staging is needed
  const bool in_bounds = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt k = 0; k < Q_1D; k++) {
    CeedScalar sum = 0.0;
    if (in_bounds) {
      for (CeedInt p = 0; p < P_1D; p++) {
        sum += B[p + k * P_1D] * U[p];  // Contract z direction
      }
    }
    V[k] = sum;
  }
}
384 
385 //------------------------------------------------------------------------------
386 // 3D transpose tensor contract z
387 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // All Q_1D z-values already live in this thread's registers, so no shared-memory staging is needed
  const bool in_bounds = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt k = 0; k < P_1D; k++) {
    CeedScalar sum = 0.0;
    if (in_bounds) {
      for (CeedInt q = 0; q < Q_1D; q++) {
        sum += B[k + q * P_1D] * U[q];  // Contract z direction
      }
    }
    V[k] = sum;
  }
}
399 
400 //------------------------------------------------------------------------------
401 // 3D transpose tensor contract y
402 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's column of B (i.e. row of B^T) in registers; reused for every z-layer
  CeedScalar r_B[Q_1D];
  for (CeedInt q = 0; q < Q_1D; q++) {
    r_B[q] = B[data.t_id_y + q * P_1D];
  }

  // Process one z-layer at a time: stage U[k] in shared memory, then contract y
  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    CeedScalar sum = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt q = 0; q < Q_1D; q++) {
        sum += r_B[q] * data.slice[data.t_id_x + q * T_1D];  // Contract y direction
      }
    }
    V[k] = sum;
    __syncthreads();
  }
}
422 
423 //------------------------------------------------------------------------------
424 // 3D transpose tensor contract y
425 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's column of B (i.e. row of B^T) in registers; reused for every z-layer
  CeedScalar r_B[Q_1D];
  for (CeedInt q = 0; q < Q_1D; q++) {
    r_B[q] = B[data.t_id_y + q * P_1D];
  }

  // Same contraction as ContractTransposeY3d, but accumulates into V instead of overwriting it
  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      CeedScalar sum = 0.0;
      for (CeedInt q = 0; q < Q_1D; q++) {
        sum += r_B[q] * data.slice[data.t_id_x + q * T_1D];  // Contract y direction
      }
      V[k] += sum;
    }
    __syncthreads();
  }
}
444 
445 //------------------------------------------------------------------------------
446 // 3D transpose tensor contract x
447 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's column of B (i.e. row of B^T) in registers; reused for every z-layer
  CeedScalar r_B[Q_1D];
  for (CeedInt q = 0; q < Q_1D; q++) {
    r_B[q] = B[data.t_id_x + q * P_1D];
  }

  // Process one z-layer at a time: stage U[k] in shared memory, then contract x
  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    CeedScalar sum = 0.0;
    if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
      for (CeedInt q = 0; q < Q_1D; q++) {
        sum += r_B[q] * data.slice[q + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    V[k] = sum;
    __syncthreads();
  }
}
467 
468 //------------------------------------------------------------------------------
469 // 3D transpose tensor contract add x
470 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's column of B (i.e. row of B^T) in registers; reused for every z-layer
  CeedScalar r_B[Q_1D];
  for (CeedInt q = 0; q < Q_1D; q++) {
    r_B[q] = B[data.t_id_x + q * P_1D];
  }

  // Same contraction as ContractTransposeX3d, but accumulates into V instead of overwriting it
  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
      CeedScalar sum = 0.0;
      for (CeedInt q = 0; q < Q_1D; q++) {
        sum += r_B[q] * data.slice[q + data.t_id_y * T_1D];  // Contract x direction
      }
      V[k] += sum;
    }
    __syncthreads();
  }
}
489 
490 //------------------------------------------------------------------------------
491 // 3D interpolate to quadrature points
492 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                      CeedScalar *__restrict__ r_V) {
  // Scratch registers for the intermediates between directional contractions
  CeedScalar r_a[T_1D];
  CeedScalar r_b[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Apply B along x, then y, then z
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * P_1D], c_B, r_a);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_a, c_B, r_b);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_b, c_B, &r_V[c * Q_1D]);
  }
}
504 
505 //------------------------------------------------------------------------------
506 // 3D interpolate transpose
507 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Scratch registers for the intermediates between directional contractions
  CeedScalar r_a[T_1D];
  CeedScalar r_b[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Apply B^T along z, then y, then x
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D], c_B, r_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_a, c_B, r_b);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_b, c_B, &r_V[c * P_1D]);
  }
}
519 
520 //------------------------------------------------------------------------------
521 // 3D derivatives at quadrature points
522 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                    CeedScalar *__restrict__ r_V) {
  // Scratch registers for the intermediates between directional contractions
  CeedScalar r_a[T_1D];
  CeedScalar r_b[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // For each derivative direction, apply G along that direction and B along the other two
    for (CeedInt dim = 0; dim < 3; dim++) {
      ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * P_1D], dim == 0 ? c_G : c_B, r_a);
      ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_a, dim == 1 ? c_G : c_B, r_b);
      ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_b, dim == 2 ? c_G : c_B, &r_V[c * Q_1D + dim * NUM_COMP * Q_1D]);
    }
  }
}
540 
541 //------------------------------------------------------------------------------
542 // 3D derivatives transpose
543 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Scratch registers for the intermediates between directional contractions
  CeedScalar r_a[T_1D];
  CeedScalar r_b[T_1D];
  // Stride between the x-, y-, and z-derivative slabs of r_U
  const CeedInt dim_stride = NUM_COMP * Q_1D;

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // x-derivative slab: G^T along x, B^T along y and z (initializes r_V)
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 0 * dim_stride], c_B, r_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_a, c_B, r_b);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_b, c_G, &r_V[c * P_1D]);
    // y-derivative slab: G^T along y (accumulates into r_V)
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 1 * dim_stride], c_B, r_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_a, c_G, r_b);
    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_b, c_B, &r_V[c * P_1D]);
    // z-derivative slab: G^T along z (accumulates into r_V)
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 2 * dim_stride], c_G, r_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_a, c_B, r_b);
    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_b, c_B, &r_V[c * P_1D]);
  }
}
561 
562 //------------------------------------------------------------------------------
563 // 3D derivatives at quadrature points
564 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
  // Scratch registers for the intermediates between directional contractions
  CeedScalar r_a[T_1D];
  CeedScalar r_b[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // First interpolate to the quadrature grid with B in all three directions
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * P_1D], c_B, r_a);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_a, c_B, r_b);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_b, c_B, r_a);
    // Then apply the collocated (Q_1D x Q_1D) derivative operator along each direction
    ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_a, c_G, &r_V[c * Q_1D + 0 * NUM_COMP * Q_1D]);
    ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_a, c_G, &r_V[c * Q_1D + 1 * NUM_COMP * Q_1D]);
    ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_a, c_G, &r_V[c * Q_1D + 2 * NUM_COMP * Q_1D]);
  }
}
579 
580 //------------------------------------------------------------------------------
581 // 3D derivatives transpose
582 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Scratch registers for the intermediates between directional contractions
  CeedScalar r_a[T_1D];
  CeedScalar r_b[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Accumulate the transpose collocated derivative of each slab on the quadrature grid
    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_b);
    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_b);
    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_b);
    // Then interpolate back to the nodal grid with B^T in all three directions
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_b, c_B, r_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_a, c_B, r_b);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_b, c_B, &r_V[c * P_1D]);
  }
}
597 
598 //------------------------------------------------------------------------------
599 // 3D quadrature weights
600 //------------------------------------------------------------------------------
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Each thread owns a full column of Q_1D z-points at its (x, y) position
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedScalar w_xy = q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y];

    for (CeedInt q = 0; q < Q_1D; q++) {
      w[q] = w_xy * q_weight_1d[q];
    }
  } else {
    // Threads off the quadrature grid contribute zero weights
    for (CeedInt q = 0; q < Q_1D; q++) {
      w[q] = 0.0;
    }
  }
}
609