xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision ed094490f53e580908aa80e9fe815a6fd76d7526)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D, int Q_1D>
20 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21   __syncthreads();
22   data.slice[data.t_id_x] = *U;
23   __syncthreads();
24   *V = 0.0;
25   if (data.t_id_x < Q_1D) {
26     for (CeedInt i = 0; i < P_1D; i++) {
27       *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
28     }
29   }
30 }
31 
32 //------------------------------------------------------------------------------
33 // 1D transpose tensor contraction x
34 //------------------------------------------------------------------------------
35 template <int NUM_COMP, int P_1D, int Q_1D>
36 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37   __syncthreads();
38   data.slice[data.t_id_x] = *U;
39   __syncthreads();
40   *V = 0.0;
41   if (data.t_id_x < P_1D) {
42     for (CeedInt i = 0; i < Q_1D; i++) {
43       *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
44     }
45   }
46 }
47 
48 //------------------------------------------------------------------------------
49 // 1D interpolate to quadrature points
50 //------------------------------------------------------------------------------
51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
52 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
53   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
55   }
56 }
57 
58 //------------------------------------------------------------------------------
59 // 1D interpolate transpose
60 //------------------------------------------------------------------------------
61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
62 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
63                                          CeedScalar *__restrict__ r_V) {
64   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
66   }
67 }
68 
69 //------------------------------------------------------------------------------
70 // 1D interpolate to quadrature points, nodes and quadrature points collocated
71 //------------------------------------------------------------------------------
72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
73 inline __device__ void InterpCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
74                                                CeedScalar *__restrict__ r_V) {
75   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76     r_V[comp] = r_U[comp];
77   }
78 }
79 
80 //------------------------------------------------------------------------------
81 // 1D interpolate transpose, nodes and quadrature points collocated
82 //------------------------------------------------------------------------------
83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
84 inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
85                                                         CeedScalar *__restrict__ r_V) {
86   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87     r_V[comp] = r_U[comp];
88   }
89 }
90 
91 //------------------------------------------------------------------------------
92 // 1D derivatives at quadrature points
93 //------------------------------------------------------------------------------
94 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
95 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
96                               CeedScalar *__restrict__ r_V) {
97   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
98     ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
99   }
100 }
101 
102 //------------------------------------------------------------------------------
103 // 1D derivatives transpose
104 //------------------------------------------------------------------------------
105 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
106 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
107                                        CeedScalar *__restrict__ r_V) {
108   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
109     ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
110   }
111 }
112 
113 //------------------------------------------------------------------------------
114 // 1D quadrature weights
115 //------------------------------------------------------------------------------
116 template <int P_1D, int Q_1D>
117 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
118   *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
119 }
120 
121 //------------------------------------------------------------------------------
122 // 2D
123 //------------------------------------------------------------------------------
124 
125 //------------------------------------------------------------------------------
126 // 2D tensor contraction x
127 //------------------------------------------------------------------------------
128 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
129 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
130   __syncthreads();
131   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
132   __syncthreads();
133   *V = 0.0;
134   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
135     for (CeedInt i = 0; i < P_1D; i++) {
136       *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
137     }
138   }
139 }
140 
141 //------------------------------------------------------------------------------
142 // 2D tensor contract y
143 //------------------------------------------------------------------------------
144 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
145 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
146   __syncthreads();
147   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
148   __syncthreads();
149   *V = 0.0;
150   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
151     for (CeedInt i = 0; i < P_1D; i++) {
152       *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
153     }
154   }
155 }
156 
157 //------------------------------------------------------------------------------
158 // 2D transpose tensor contract y
159 //------------------------------------------------------------------------------
160 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
161 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
162   __syncthreads();
163   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
164   __syncthreads();
165   *V = 0.0;
166   if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
167     for (CeedInt i = 0; i < Q_1D; i++) {
168       *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
169     }
170   }
171 }
172 
173 //------------------------------------------------------------------------------
174 // 2D transpose tensor contract x
175 //------------------------------------------------------------------------------
176 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
177 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
178   __syncthreads();
179   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
180   __syncthreads();
181   *V = 0.0;
182   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
183     for (CeedInt i = 0; i < Q_1D; i++) {
184       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
185     }
186   }
187 }
188 
189 //------------------------------------------------------------------------------
190 // 2D transpose tensor contract and add x
191 //------------------------------------------------------------------------------
192 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
193 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
194   __syncthreads();
195   data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
196   __syncthreads();
197   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
198     for (CeedInt i = 0; i < Q_1D; i++) {
199       *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
200     }
201   }
202 }
203 
204 //------------------------------------------------------------------------------
205 // 2D interpolate to quadrature points
206 //------------------------------------------------------------------------------
207 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
208 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
209                                       CeedScalar *__restrict__ r_V) {
210   CeedScalar r_t[1];
211   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
212     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
213     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
214   }
215 }
216 
217 //------------------------------------------------------------------------------
218 // 2D interpolate transpose
219 //------------------------------------------------------------------------------
220 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
221 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
222                                                CeedScalar *__restrict__ r_V) {
223   CeedScalar r_t[1];
224   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
225     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
226     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
227   }
228 }
229 
230 //------------------------------------------------------------------------------
231 // 2D interpolate to quadrature points, nodes and quadrature points collocated
232 //------------------------------------------------------------------------------
233 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
234 inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
235                                                      CeedScalar *__restrict__ r_V) {
236   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
237     r_V[comp] = r_U[comp];
238   }
239 }
240 
241 //------------------------------------------------------------------------------
242 // 2D interpolate transpose, nodes and quadrature points collocated
243 //------------------------------------------------------------------------------
244 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
245 inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
246                                                               CeedScalar *__restrict__ r_V) {
247   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
248     r_V[comp] = r_U[comp];
249   }
250 }
251 
252 //------------------------------------------------------------------------------
253 // 2D derivatives at quadrature points
254 //------------------------------------------------------------------------------
255 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
256 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
257                                     CeedScalar *__restrict__ r_V) {
258   CeedScalar r_t[1];
259   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
260     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
261     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
262     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
263     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
264   }
265 }
266 
267 //------------------------------------------------------------------------------
268 // 2D derivatives transpose
269 //------------------------------------------------------------------------------
270 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
271 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
272                                              CeedScalar *__restrict__ r_V) {
273   CeedScalar r_t[1];
274   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
275     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
276     ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
277     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
278     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
279   }
280 }
281 
282 //------------------------------------------------------------------------------
283 // 2D derivatives at quadrature points, nodes and quadrature points collocated
284 //------------------------------------------------------------------------------
285 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
286 inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
287                                                    const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
288   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
289     ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
290     ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
291   }
292 }
293 
294 //------------------------------------------------------------------------------
295 // 2D derivatives transpose, nodes and quadrature points collocated
296 //------------------------------------------------------------------------------
297 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
298 inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
299                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
300   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
301     ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
302     ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
303   }
304 }
305 
306 //------------------------------------------------------------------------------
307 // 2D quadrature weights
308 //------------------------------------------------------------------------------
309 template <int P_1D, int Q_1D>
310 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
311   *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
312 }
313 
314 //------------------------------------------------------------------------------
315 // 3D
316 //------------------------------------------------------------------------------
317 
318 //------------------------------------------------------------------------------
319 // 3D tensor contract x
320 //------------------------------------------------------------------------------
321 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
322 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
323   CeedScalar r_B[P_1D];
324   for (CeedInt i = 0; i < P_1D; i++) {
325     r_B[i] = B[i + data.t_id_x * P_1D];
326   }
327 
328   for (CeedInt k = 0; k < P_1D; k++) {
329     __syncthreads();
330     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
331     __syncthreads();
332     V[k] = 0.0;
333     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
334       for (CeedInt i = 0; i < P_1D; i++) {
335         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
336       }
337     }
338   }
339 }
340 
341 //------------------------------------------------------------------------------
342 // 3D tensor contract y
343 //------------------------------------------------------------------------------
344 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
345 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
346   CeedScalar r_B[P_1D];
347   for (CeedInt i = 0; i < P_1D; i++) {
348     r_B[i] = B[i + data.t_id_y * P_1D];
349   }
350 
351   for (CeedInt k = 0; k < P_1D; k++) {
352     __syncthreads();
353     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
354     __syncthreads();
355     V[k] = 0.0;
356     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
357       for (CeedInt i = 0; i < P_1D; i++) {
358         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
359       }
360     }
361   }
362 }
363 
364 //------------------------------------------------------------------------------
365 // 3D tensor contract z
366 //------------------------------------------------------------------------------
367 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
368 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
369   for (CeedInt k = 0; k < Q_1D; k++) {
370     V[k] = 0.0;
371     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
372       for (CeedInt i = 0; i < P_1D; i++) {
373         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
374       }
375     }
376   }
377 }
378 
379 //------------------------------------------------------------------------------
380 // 3D transpose tensor contract z
381 //------------------------------------------------------------------------------
382 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
383 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
384   for (CeedInt k = 0; k < P_1D; k++) {
385     V[k] = 0.0;
386     if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
387       for (CeedInt i = 0; i < Q_1D; i++) {
388         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
389       }
390     }
391   }
392 }
393 
394 //------------------------------------------------------------------------------
395 // 3D transpose tensor contract y
396 //------------------------------------------------------------------------------
397 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
398 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
399   CeedScalar r_B[Q_1D];
400   for (CeedInt i = 0; i < Q_1D; i++) {
401     r_B[i] = B[data.t_id_y + i * P_1D];
402   }
403 
404   for (CeedInt k = 0; k < P_1D; k++) {
405     __syncthreads();
406     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
407     __syncthreads();
408     V[k] = 0.0;
409     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
410       for (CeedInt i = 0; i < Q_1D; i++) {
411         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
412       }
413     }
414   }
415 }
416 
417 //------------------------------------------------------------------------------
418 // 3D transpose tensor contract y
419 //------------------------------------------------------------------------------
420 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
421 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
422   CeedScalar r_B[Q_1D];
423   for (CeedInt i = 0; i < Q_1D; i++) {
424     r_B[i] = B[data.t_id_y + i * P_1D];
425   }
426 
427   for (CeedInt k = 0; k < P_1D; k++) {
428     __syncthreads();
429     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
430     __syncthreads();
431     if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
432       for (CeedInt i = 0; i < Q_1D; i++) {
433         V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
434       }
435     }
436   }
437 }
438 
439 //------------------------------------------------------------------------------
440 // 3D transpose tensor contract x
441 //------------------------------------------------------------------------------
442 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
443 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
444   CeedScalar r_B[Q_1D];
445   for (CeedInt i = 0; i < Q_1D; i++) {
446     r_B[i] = B[data.t_id_x + i * P_1D];
447   }
448 
449   for (CeedInt k = 0; k < P_1D; k++) {
450     __syncthreads();
451     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
452     __syncthreads();
453     V[k] = 0.0;
454     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
455       for (CeedInt i = 0; i < Q_1D; i++) {
456         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
457       }
458     }
459   }
460 }
461 
462 //------------------------------------------------------------------------------
463 // 3D transpose tensor contract add x
464 //------------------------------------------------------------------------------
465 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
466 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
467   CeedScalar r_B[Q_1D];
468   for (CeedInt i = 0; i < Q_1D; i++) {
469     r_B[i] = B[data.t_id_x + i * P_1D];
470   }
471 
472   for (CeedInt k = 0; k < P_1D; k++) {
473     __syncthreads();
474     data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
475     __syncthreads();
476     if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
477       for (CeedInt i = 0; i < Q_1D; i++) {
478         V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
479       }
480     }
481   }
482 }
483 
484 //------------------------------------------------------------------------------
485 // 3D interpolate to quadrature points
486 //------------------------------------------------------------------------------
487 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
488 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
489                                       CeedScalar *__restrict__ r_V) {
490   CeedScalar r_t1[T_1D];
491   CeedScalar r_t2[T_1D];
492   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
493     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
494     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
495     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
496   }
497 }
498 
499 //------------------------------------------------------------------------------
500 // 3D interpolate transpose
501 //------------------------------------------------------------------------------
502 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
503 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
504                                                CeedScalar *__restrict__ r_V) {
505   CeedScalar r_t1[T_1D];
506   CeedScalar r_t2[T_1D];
507   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
508     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
509     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
510     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
511   }
512 }
513 
514 //------------------------------------------------------------------------------
515 // 3D interpolate to quadrature points, nodes and quadrature points collocated
516 //------------------------------------------------------------------------------
517 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
518 inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
519                                                      CeedScalar *__restrict__ r_V) {
520   for (CeedInt i = 0; i < Q_1D; i++) {
521     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
522       r_V[i + comp * Q_1D] = r_U[i + comp * P_1D];
523     }
524   }
525 }
526 
527 //------------------------------------------------------------------------------
528 // 3D interpolate transpose, nodes and quadrature points collocated
529 //------------------------------------------------------------------------------
530 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
531 inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
532                                                               CeedScalar *__restrict__ r_V) {
533   for (CeedInt i = 0; i < Q_1D; i++) {
534     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
535       r_V[i + comp * P_1D] = r_U[i + comp * Q_1D];
536     }
537   }
538 }
539 
540 //------------------------------------------------------------------------------
541 // 3D derivatives at quadrature points
542 //------------------------------------------------------------------------------
543 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
544 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
545                                     CeedScalar *__restrict__ r_V) {
546   CeedScalar r_t1[T_1D];
547   CeedScalar r_t2[T_1D];
548   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
549     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
550     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
551     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
552     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
553     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
554     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
555     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
556     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
557     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
558   }
559 }
560 
561 //------------------------------------------------------------------------------
562 // 3D derivatives transpose
563 //------------------------------------------------------------------------------
564 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
565 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
566                                              CeedScalar *__restrict__ r_V) {
567   CeedScalar r_t1[T_1D];
568   CeedScalar r_t2[T_1D];
569   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
570     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
571     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
572     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
573     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
574     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
575     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
576     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
577     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
578     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
579   }
580 }
581 
582 //------------------------------------------------------------------------------
583 // 3D derivatives at quadrature points
584 //------------------------------------------------------------------------------
585 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
586 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
587                                               CeedScalar *__restrict__ r_V) {
588   CeedScalar r_t1[T_1D];
589   CeedScalar r_t2[T_1D];
590   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
591     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
592     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
593     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
594     ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
595     ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
596     ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
597   }
598 }
599 
600 //------------------------------------------------------------------------------
601 // 3D derivatives transpose
602 //------------------------------------------------------------------------------
603 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
604 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
605                                                        const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
606   CeedScalar r_t1[T_1D];
607   CeedScalar r_t2[T_1D];
608   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
609     ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
610     ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
611     ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
612     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
613     ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
614     ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
615   }
616 }
617 
618 //------------------------------------------------------------------------------
619 // 3D derivatives at quadrature points, nodes and quadrature points collocated
620 //------------------------------------------------------------------------------
621 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
622 inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
623                                                    const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
624   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
625     ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
626     ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
627     ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
628   }
629 }
630 
631 //------------------------------------------------------------------------------
632 // 3D derivatives transpose, nodes and quadrature points collocated
633 //------------------------------------------------------------------------------
634 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
635 inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
636                                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
637   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
638     ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
639     ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
640     ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
641   }
642 }
643 
644 //------------------------------------------------------------------------------
645 // 3D quadrature weights
646 //------------------------------------------------------------------------------
647 template <int P_1D, int Q_1D>
648 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
649   const bool       quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
650   const CeedScalar pw   = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
651   for (CeedInt q = 0; q < Q_1D; q++) {
652     w[q] = quad ? pw * q_weight_1d[q] : 0.0;
653   }
654 }
655