xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h (revision c433aabc65bbaa1ba98688d834b18c213e9c6270)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
// 1D tensor contraction in x: V(q) = sum_p B[p + q*P_1D] * U(p).
// Each thread holds one value of U and produces one value of V; data.slice is
// the block-shared staging buffer for the exchange. Synchronizes the block
// before and after the read phase so the slice can be safely reused.
template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's nodal value for block-wide access
  data.slice[data.t_id_x] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (data.t_id_x < Q_1D) {
    for (CeedInt p = 0; p < P_1D; p++) sum += B[p + data.t_id_x * P_1D] * data.slice[p];  // Contract x direction
  }
  *V = sum;
  __syncthreads();
}
31 
32 //------------------------------------------------------------------------------
33 // 1D transpose tensor contraction x
34 //------------------------------------------------------------------------------
// 1D transpose tensor contraction in x: V(p) = sum_q B[p + q*P_1D] * U(q).
// Mirror of ContractX1d with B applied transposed (quadrature -> nodes).
template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's quadrature value for block-wide access
  data.slice[data.t_id_x] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (data.t_id_x < P_1D) {
    for (CeedInt q = 0; q < Q_1D; q++) sum += B[data.t_id_x + q * P_1D] * data.slice[q];  // Contract x direction
  }
  *V = sum;
  __syncthreads();
}
47 
48 //------------------------------------------------------------------------------
49 // 1D interpolate to quadrature points
50 //------------------------------------------------------------------------------
// 1D interpolation, nodes -> quadrature points, one contraction per component.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + c, c_B, r_V + c);
  }
}
57 
58 //------------------------------------------------------------------------------
59 // 1D interpolate transpose
60 //------------------------------------------------------------------------------
// 1D transpose interpolation, quadrature points -> nodes, per component.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + c, c_B, r_V + c);
  }
}
68 
69 //------------------------------------------------------------------------------
70 // 1D derivatives at quadrature points
71 //------------------------------------------------------------------------------
// 1D derivative at quadrature points. In 1D this is a single contraction with
// the gradient matrix c_G; c_B is accepted only to keep the Grad* signature
// uniform with the higher-dimensional variants.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + c, c_G, r_V + c);
  }
}
79 
80 //------------------------------------------------------------------------------
81 // 1D derivatives transpose
82 //------------------------------------------------------------------------------
// 1D transpose derivative. Single transpose contraction with c_G; c_B is
// unused here but kept for interface parity with the 2D/3D variants.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + c, c_G, r_V + c);
  }
}
90 
91 //------------------------------------------------------------------------------
92 // 1D quadrature weights
93 //------------------------------------------------------------------------------
// 1D quadrature weight: threads inside the quadrature range load their weight,
// all others receive zero.
template <int P_1D, int Q_1D>
inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  if (data.t_id_x < Q_1D) {
    *w = q_weight_1d[data.t_id_x];
  } else {
    *w = 0.0;
  }
}
98 
99 //------------------------------------------------------------------------------
100 // 2D
101 //------------------------------------------------------------------------------
102 
103 //------------------------------------------------------------------------------
104 // 2D tensor contraction x
105 //------------------------------------------------------------------------------
// 2D tensor contraction in x: V(qx, y) = sum_p B[p + qx*P_1D] * U(p, y).
// One point per thread; data.slice (T_1D stride in y) stages the exchange.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  // Stage this thread's value at its (x, y) slot
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt p = 0; p < P_1D; p++) sum += B[p + t_id_x * P_1D] * data.slice[p + t_id_y * T_1D];  // Contract x direction
  }
  *V = sum;
  __syncthreads();
}
119 
120 //------------------------------------------------------------------------------
121 // 2D tensor contract y
122 //------------------------------------------------------------------------------
// 2D tensor contraction in y: V(x, qy) = sum_p B[p + qy*P_1D] * U(x, p).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  // Stage this thread's value at its (x, y) slot
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    for (CeedInt p = 0; p < P_1D; p++) sum += B[p + t_id_y * P_1D] * data.slice[t_id_x + p * T_1D];  // Contract y direction
  }
  *V = sum;
  __syncthreads();
}
136 
137 //------------------------------------------------------------------------------
138 // 2D transpose tensor contract y
139 //------------------------------------------------------------------------------
// 2D transpose tensor contraction in y: V(x, p) = sum_q B[p + q*P_1D] * U(x, q).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  // Stage this thread's quadrature value at its (x, y) slot
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt q = 0; q < Q_1D; q++) sum += B[t_id_y + q * P_1D] * data.slice[t_id_x + q * T_1D];  // Contract y direction
  }
  *V = sum;
  __syncthreads();
}
153 
154 //------------------------------------------------------------------------------
155 // 2D transpose tensor contract x
156 //------------------------------------------------------------------------------
// 2D transpose tensor contraction in x: V(p, y) = sum_q B[p + q*P_1D] * U(q, y).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  // Stage this thread's quadrature value at its (x, y) slot
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt q = 0; q < Q_1D; q++) sum += B[t_id_x + q * P_1D] * data.slice[q + t_id_y * T_1D];  // Contract x direction
  }
  *V = sum;
  __syncthreads();
}
170 
171 //------------------------------------------------------------------------------
172 // 2D transpose tensor contract and add x
173 //------------------------------------------------------------------------------
// 2D transpose tensor contraction in x, accumulating: V(p, y) += sum_q B[p + q*P_1D] * U(q, y).
// Unlike ContractTransposeX2d, V is NOT zeroed — results sum onto its prior
// contents, term by term (preserves accumulation order).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                               CeedScalar *V) {
  // Stage this thread's quadrature value at its (x, y) slot
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt q = 0; q < Q_1D; q++) {
      *V += B[t_id_x + q * P_1D] * data.slice[q + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}
186 
187 //------------------------------------------------------------------------------
188 // 2D pack/unpack quadrature values
189 //------------------------------------------------------------------------------
// Repack per-component values from the 2D (t_id_x, t_id_y) layout into the
// flattened layout owned by thread index data.t_id_x (x-fastest over a
// Q_1D x Q_1D grid). Threads with data.t_id_x >= Q_1D*Q_1D receive 0.0.
// Note: the Q_1D template parameter is the per-dimension point count; callers
// instantiate it with either P_1D or Q_1D depending on direction.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Stage this component at its 2D position in the shared slice
    if (t_id_x < Q_1D && t_id_y * T_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
    __syncthreads();
    // Re-read via the flattened thread index (row = t_id_x / Q_1D, col = t_id_x % Q_1D)
    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] : 0.0;
    __syncthreads();
  }
}
199 
// Inverse of QPack2D: move per-component values from the flattened layout
// (owned by thread index data.t_id_x, x-fastest) back into the 2D
// (t_id_x, t_id_y) layout. Threads outside the Q_1D x Q_1D grid receive 0.0.
// As with QPack2D, the Q_1D template parameter may be instantiated with P_1D.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Stage this component at its flattened position in the shared slice
    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] = U[comp];
    __syncthreads();
    // Re-read using the caller-provided 2D coordinates
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
    __syncthreads();
  }
}
209 
210 //------------------------------------------------------------------------------
211 // 2D interpolate to quadrature points
212 //------------------------------------------------------------------------------
// 2D interpolation core: per component, contract x then y with c_B using a
// one-element register scratch between the two passes.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                           const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U + c, c_B, scratch);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, scratch, c_B, r_V + c);
  }
}
222 
// 2D interpolation with the block's native 2D thread coordinates.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                      CeedScalar *__restrict__ r_V) {
  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
}
228 
// 2D interpolation for a flattened (1D) thread layout: unpack nodal values
// into the 2D layout, run the core, and repack quadrature values.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Recover 2D coordinates from the flattened thread index
  const int t_id_x = data.t_id_x % T_1D;
  const int t_id_y = data.t_id_x / T_1D;

  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U, c_B, r_V);
  QPack2D<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
236 
237 //------------------------------------------------------------------------------
238 // 2D interpolate transpose
239 //------------------------------------------------------------------------------
// 2D transpose interpolation core: per component, transpose-contract y then x
// with c_B, staging through a one-element register scratch.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                                    const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U + c, c_B, scratch);
    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, scratch, c_B, r_V + c);
  }
}
249 
// 2D transpose interpolation with the block's native 2D thread coordinates.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
}
255 
// 2D transpose interpolation for a flattened (1D) thread layout: unpack
// quadrature values, run the core, and repack nodal values.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  // Recover 2D coordinates from the flattened thread index
  const int t_id_x = data.t_id_x % T_1D;
  const int t_id_y = data.t_id_x / T_1D;

  QUnpack2D<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U, c_B, r_V);
  QPack2D<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
263 
264 //------------------------------------------------------------------------------
265 // 2D derivatives at quadrature points
266 //------------------------------------------------------------------------------
// 2D gradient core. For each component, writes two output fields:
// r_V[comp]            = d/dx (differentiate x, interpolate y)
// r_V[comp + NUM_COMP] = d/dy (interpolate x, differentiate y)
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                         const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // d/dx
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U + c, c_G, scratch);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, scratch, c_B, r_V + c);
    // d/dy
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U + c, c_B, scratch);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, scratch, c_G, r_V + c + NUM_COMP);
  }
}
278 
// 2D gradient with the block's native 2D thread coordinates.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                    CeedScalar *__restrict__ r_V) {
  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
}
284 
// 2D gradient for a flattened (1D) thread layout: unpack nodal values, run the
// core, repack both derivative fields (hence NUM_COMP * 2 in the pack).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Recover 2D coordinates from the flattened thread index
  const int t_id_x = data.t_id_x % T_1D;
  const int t_id_y = data.t_id_x / T_1D;

  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U, c_B, c_G, r_V);
  QPack2D<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
292 
293 //------------------------------------------------------------------------------
294 // 2D derivatives transpose
295 //------------------------------------------------------------------------------
// 2D transpose gradient core. Sums the adjoints of both derivative fields into
// r_V[comp]: the x-derivative contribution first, then the y-derivative
// contribution via the accumulating x-contraction.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                                  const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Adjoint of d/dx
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U + c, c_B, scratch);
    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, scratch, c_G, r_V + c);
    // Adjoint of d/dy, accumulated on top
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U + c + NUM_COMP, c_G, scratch);
    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, scratch, c_B, r_V + c);
  }
}
307 
// 2D transpose gradient with the block's native 2D thread coordinates.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
}
313 
// 2D transpose gradient for a flattened (1D) thread layout: unpack both
// derivative fields (NUM_COMP * 2), run the core, repack the nodal result.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover 2D coordinates from the flattened thread index
  const int t_id_x = data.t_id_x % T_1D;
  const int t_id_y = data.t_id_x / T_1D;

  QUnpack2D<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U, c_B, c_G, r_V);
  QPack2D<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}
321 
322 //------------------------------------------------------------------------------
323 // 2D quadrature weights
324 //------------------------------------------------------------------------------
// 2D quadrature weight core: tensor product of the 1D weights; threads outside
// the quadrature grid receive zero.
template <int Q_1D>
inline __device__ void WeightTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
                                           CeedScalar *w) {
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    *w = q_weight_1d[t_id_x] * q_weight_1d[t_id_y];
  } else {
    *w = 0.0;
  }
}
330 
// 2D quadrature weights with the block's native 2D thread coordinates.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
}
335 
// 2D quadrature weights for a flattened thread layout (coordinates derived
// from the 1D thread index, Q_1D-strided).
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % Q_1D, data.t_id_x / Q_1D, q_weight_1d, w);
}
340 
341 //------------------------------------------------------------------------------
342 // 3D
343 //------------------------------------------------------------------------------
344 
345 //------------------------------------------------------------------------------
346 // 3D tensor contract x
347 //------------------------------------------------------------------------------
// 3D tensor contraction in x, processed one z-layer (k) at a time.
// U and V hold one z-line of values per thread in registers; this thread's row
// of B (indexed B[i + q * P_1D]) is cached in registers before the layer loop.
// Each layer is staged through data.slice with a barrier before the reads and
// another before the slice is overwritten by the next layer.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P_1D];
  // Cache the basis row for this thread's x index
  for (CeedInt i = 0; i < P_1D; i++) {
    r_B[i] = B[i + data.t_id_x * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    // Stage layer k so the whole x-line is visible across threads
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();  // slice is reused by the next layer
  }
}
367 
368 //------------------------------------------------------------------------------
369 // 3D tensor contract y
370 //------------------------------------------------------------------------------
// 3D tensor contraction in y, processed one z-layer (k) at a time.
// Same staging pattern as ContractX3d; the cached basis row here is selected
// by this thread's y index and the shared-memory reads walk the y direction.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P_1D];
  // Cache the basis row for this thread's y index
  for (CeedInt i = 0; i < P_1D; i++) {
    r_B[i] = B[i + data.t_id_y * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    // Stage layer k so the whole y-line is visible across threads
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();  // slice is reused by the next layer
  }
}
390 
391 //------------------------------------------------------------------------------
392 // 3D tensor contract z
393 //------------------------------------------------------------------------------
// 3D tensor contraction in z: V(q) = sum_p B[p + q*P_1D] * U(p).
// Register-only: each thread owns the full z-line of its (x, y) column, so no
// shared-memory staging or synchronization is required.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt q = 0; q < Q_1D; q++) {
    CeedScalar sum = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt p = 0; p < P_1D; p++) sum += B[p + q * P_1D] * U[p];  // Contract z direction
    }
    V[q] = sum;
  }
}
405 
406 //------------------------------------------------------------------------------
407 // 3D transpose tensor contract z
408 //------------------------------------------------------------------------------
// 3D transpose tensor contraction in z: V(p) = sum_q B[p + q*P_1D] * U(q).
// Register-only mirror of ContractZ3d with B applied transposed.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt p = 0; p < P_1D; p++) {
    CeedScalar sum = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt q = 0; q < Q_1D; q++) sum += B[p + q * P_1D] * U[q];  // Contract z direction
    }
    V[p] = sum;
  }
}
420 
421 //------------------------------------------------------------------------------
422 // 3D transpose tensor contract y
423 //------------------------------------------------------------------------------
// 3D transpose tensor contraction in y, one z-layer (k) at a time.
// The transposed basis column for this thread's y index (B[p + q * P_1D] read
// as B[t_id_y + i * P_1D]) is cached in registers before the layer loop.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  // Cache the transposed basis entries for this thread's y index
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_y + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    // Stage layer k so the whole y-line is visible across threads
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();  // slice is reused by the next layer
  }
}
443 
444 //------------------------------------------------------------------------------
// 3D transpose tensor contract and add y
446 //------------------------------------------------------------------------------
// 3D transpose tensor contraction in y, accumulating variant: identical to
// ContractTransposeY3d except V[k] is NOT zeroed, so the contraction sums onto
// V's existing contents.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  // Cache the transposed basis entries for this thread's y index
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_y + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    // Stage layer k so the whole y-line is visible across threads
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();  // slice is reused by the next layer
  }
}
465 
466 //------------------------------------------------------------------------------
467 // 3D transpose tensor contract x
468 //------------------------------------------------------------------------------
// 3D transpose tensor contraction in x, one z-layer (k) at a time.
// The transposed basis column for this thread's x index is cached in registers
// before the layer loop; the shared-memory reads walk the x direction.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  // Cache the transposed basis entries for this thread's x index
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_x + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    // Stage layer k so the whole x-line is visible across threads
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();  // slice is reused by the next layer
  }
}
488 
489 //------------------------------------------------------------------------------
490 // 3D transpose tensor contract add x
491 //------------------------------------------------------------------------------
// 3D transpose tensor contraction in x, accumulating variant: identical to
// ContractTransposeX3d except V[k] is NOT zeroed, so the contraction sums onto
// V's existing contents.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  // Cache the transposed basis entries for this thread's x index
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_x + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    // Stage layer k so the whole x-line is visible across threads
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();  // slice is reused by the next layer
  }
}
510 
511 //------------------------------------------------------------------------------
512 // 3D interpolate to quadrature points
513 //------------------------------------------------------------------------------
// 3D interpolation: per component, contract x, y, then z with c_B, staging the
// partially contracted layers in two register scratch arrays.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                      CeedScalar *__restrict__ r_V) {
  CeedScalar scratch_a[T_1D];
  CeedScalar scratch_b[T_1D];
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_U + c * P_1D, c_B, scratch_a);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_a, c_B, scratch_b);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_b, c_B, r_V + c * Q_1D);
  }
}
525 
526 //------------------------------------------------------------------------------
527 // 3D interpolate transpose
528 //------------------------------------------------------------------------------
// 3D transpose interpolation: per component, transpose-contract z, y, then x
// with c_B, staging intermediates in two register scratch arrays.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  CeedScalar scratch_a[T_1D];
  CeedScalar scratch_b[T_1D];
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_U + c * Q_1D, c_B, scratch_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_a, c_B, scratch_b);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_b, c_B, r_V + c * P_1D);
  }
}
540 
541 //------------------------------------------------------------------------------
542 // 3D derivatives at quadrature points
543 //------------------------------------------------------------------------------
// 3D gradient. For each component, writes three output fields at offsets of
// NUM_COMP * Q_1D: d/dx, d/dy, d/dz — differentiating in one direction (c_G)
// and interpolating (c_B) in the other two.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                    CeedScalar *__restrict__ r_V) {
  CeedScalar scratch_a[T_1D];
  CeedScalar scratch_b[T_1D];
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // d/dx
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_U + c * P_1D, c_G, scratch_a);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_a, c_B, scratch_b);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_b, c_B, r_V + c * Q_1D);
    // d/dy
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_U + c * P_1D, c_B, scratch_a);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_a, c_G, scratch_b);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_b, c_B, r_V + c * Q_1D + NUM_COMP * Q_1D);
    // d/dz
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_U + c * P_1D, c_B, scratch_a);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_a, c_B, scratch_b);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_b, c_G, r_V + c * Q_1D + 2 * NUM_COMP * Q_1D);
  }
}
561 
562 //------------------------------------------------------------------------------
563 // 3D derivatives transpose
564 //------------------------------------------------------------------------------
// 3D transpose gradient. Sums the adjoints of all three derivative fields into
// r_V[c * P_1D]: d/dx first, then d/dy and d/dz via accumulating x-contractions.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  CeedScalar scratch_a[T_1D];
  CeedScalar scratch_b[T_1D];
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Adjoint of d/dx
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_U + c * Q_1D, c_B, scratch_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_a, c_B, scratch_b);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_b, c_G, r_V + c * P_1D);
    // Adjoint of d/dy, accumulated
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_U + c * Q_1D + NUM_COMP * Q_1D, c_B, scratch_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_a, c_G, scratch_b);
    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_b, c_B, r_V + c * P_1D);
    // Adjoint of d/dz, accumulated
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_U + c * Q_1D + 2 * NUM_COMP * Q_1D, c_G, scratch_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_a, c_B, scratch_b);
    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, scratch_b, c_B, r_V + c * P_1D);
  }
}
582 
583 //------------------------------------------------------------------------------
// 3D derivatives at quadrature points (collocated derivative matrix)
585 //------------------------------------------------------------------------------
// 3D collocated gradient: first interpolate the component to the quadrature
// grid (x, y, z with c_B), then apply the collocated derivative matrix c_G
// independently in each direction to produce the three derivative fields.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_interp[T_1D];
  CeedScalar r_scratch[T_1D];
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Interpolate to the quadrature grid (result ends up in r_interp)
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_U + c * P_1D, c_B, r_interp);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_interp, c_B, r_scratch);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_scratch, c_B, r_interp);
    // Collocated derivatives in each direction (note Q_1D x Q_1D contractions)
    ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_interp, c_G, r_V + c * Q_1D);
    ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_interp, c_G, r_V + c * Q_1D + NUM_COMP * Q_1D);
    ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_interp, c_G, r_V + c * Q_1D + 2 * NUM_COMP * Q_1D);
  }
}
600 
601 //------------------------------------------------------------------------------
// 3D derivatives transpose (collocated derivative matrix)
603 //------------------------------------------------------------------------------
// 3D collocated transpose gradient: sum the adjoint collocated derivatives of
// the three fields on the quadrature grid, then transpose-interpolate the
// accumulated result back to the nodes with c_B.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp[T_1D];
  CeedScalar r_sum[T_1D];
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Accumulate adjoint derivative contributions on the quadrature grid
    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_U + c * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_sum);
    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_U + c * Q_1D + NUM_COMP * Q_1D, c_G, r_sum);
    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_U + c * Q_1D, c_G, r_sum);
    // Transpose interpolation back to the nodal grid
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_sum, c_B, r_tmp);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp, c_B, r_sum);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_sum, c_B, r_V + c * P_1D);
  }
}
618 
619 //------------------------------------------------------------------------------
620 // 3D quadrature weights
621 //------------------------------------------------------------------------------
// 3D quadrature weights: each thread writes the full z-line of tensor-product
// weights for its (x, y) column; threads outside the quadrature grid get zeros.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedScalar wt_xy = q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y];
    for (CeedInt q = 0; q < Q_1D; q++) w[q] = wt_xy * q_weight_1d[q];
  } else {
    for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0;
  }
}
630