xref: /libCEED/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h (revision ee4ca9cbfe2be39196684117442f3ce8d00b6506)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for SYCL shared memory tensor product basis templates
10 #ifndef _ceed_sycl_shared_basis_tensor_templates_h
11 #define _ceed_sycl_shared_basis_tensor_templates_h
12 
13 #include <ceed.h>
14 
15 //------------------------------------------------------------------------------
16 // 1D
17 //------------------------------------------------------------------------------
18 
19 //------------------------------------------------------------------------------
20 // 1D tensor contraction x
21 //------------------------------------------------------------------------------
22 inline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
23                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
24   const CeedInt item_id_x = get_local_id(0);
25 
26   scratch[item_id_x] = *U;
27   work_group_barrier(CLK_LOCAL_MEM_FENCE);
28 
29   *V = 0.0;
30   if (item_id_x < Q_1D) {
31     for (CeedInt i = 0; i < P_1D; i++) {
32       *V += B[i + item_id_x * P_1D] * scratch[i];  // Contract x direction
33     }
34   }
35   work_group_barrier(CLK_LOCAL_MEM_FENCE);
36 }
37 
38 //------------------------------------------------------------------------------
39 // 1D transpose tensor contraction x
40 //------------------------------------------------------------------------------
41 inline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
42                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
43   const CeedInt item_id_x = get_local_id(0);
44 
45   scratch[item_id_x] = *U;
46   work_group_barrier(CLK_LOCAL_MEM_FENCE);
47 
48   *V = 0.0;
49   if (item_id_x < P_1D) {
50     for (CeedInt i = 0; i < Q_1D; i++) {
51       *V += B[item_id_x + i * P_1D] * scratch[i];  // Contract x direction
52     }
53   }
54   work_group_barrier(CLK_LOCAL_MEM_FENCE);
55 }
56 
57 //------------------------------------------------------------------------------
58 // 1D interpolate to quadrature points
59 //------------------------------------------------------------------------------
60 inline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
61                      local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
62   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
63     ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
64   }
65 }
66 
67 //------------------------------------------------------------------------------
68 // 1D interpolate transpose
69 //------------------------------------------------------------------------------
70 inline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
71                               local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
72   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
73     ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
74   }
75 }
76 
77 //------------------------------------------------------------------------------
78 // 1D derivatives at quadrature points
79 //------------------------------------------------------------------------------
80 inline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
81                    local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
82   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
83     ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
84   }
85 }
86 
87 //------------------------------------------------------------------------------
88 // 1D derivatives transpose
89 //------------------------------------------------------------------------------
90 inline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
91                             local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
92   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
93     ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
94   }
95 }
96 
97 //------------------------------------------------------------------------------
98 // 1D quadrature weights
99 //------------------------------------------------------------------------------
100 inline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
101   const CeedInt item_id_x = get_local_id(0);
102   *w                      = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0;
103 }
104 
105 //------------------------------------------------------------------------------
106 // 2D
107 //------------------------------------------------------------------------------
108 
109 //------------------------------------------------------------------------------
110 // 2D tensor contraction x
111 //------------------------------------------------------------------------------
112 inline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
113                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
114   const CeedInt item_id_x = get_local_id(0);
115   const CeedInt item_id_y = get_local_id(1);
116 
117   scratch[item_id_x + item_id_y * T_1D] = *U;
118   work_group_barrier(CLK_LOCAL_MEM_FENCE);
119 
120   *V = 0.0;
121   if (item_id_x < Q_1D && item_id_y < P_1D) {
122     for (CeedInt i = 0; i < P_1D; i++) {
123       *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
124     }
125   }
126   work_group_barrier(CLK_LOCAL_MEM_FENCE);
127 }
128 
129 //------------------------------------------------------------------------------
130 // 2D tensor contract y
131 //------------------------------------------------------------------------------
132 inline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
133                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
134   const CeedInt item_id_x = get_local_id(0);
135   const CeedInt item_id_y = get_local_id(1);
136 
137   scratch[item_id_x + item_id_y * T_1D] = *U;
138   work_group_barrier(CLK_LOCAL_MEM_FENCE);
139 
140   *V = 0.0;
141   if (item_id_x < Q_1D && item_id_y < Q_1D) {
142     for (CeedInt i = 0; i < P_1D; i++) {
143       *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction
144     }
145   }
146   work_group_barrier(CLK_LOCAL_MEM_FENCE);
147 }
148 
149 //------------------------------------------------------------------------------
150 // 2D transpose tensor contract y
151 //------------------------------------------------------------------------------
152 inline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
153                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
154   const CeedInt item_id_x = get_local_id(0);
155   const CeedInt item_id_y = get_local_id(1);
156 
157   scratch[item_id_x + item_id_y * T_1D] = *U;
158   work_group_barrier(CLK_LOCAL_MEM_FENCE);
159 
160   *V = 0.0;
161   if (item_id_x < Q_1D && item_id_y < P_1D) {
162     for (CeedInt i = 0; i < Q_1D; i++) {
163       *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction
164     }
165   }
166   work_group_barrier(CLK_LOCAL_MEM_FENCE);
167 }
168 
169 //------------------------------------------------------------------------------
170 // 2D transpose tensor contract x
171 //------------------------------------------------------------------------------
172 inline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
173                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
174   const CeedInt item_id_x = get_local_id(0);
175   const CeedInt item_id_y = get_local_id(1);
176 
177   scratch[item_id_x + item_id_y * T_1D] = *U;
178   work_group_barrier(CLK_LOCAL_MEM_FENCE);
179 
180   *V = 0.0;
181   if (item_id_x < P_1D && item_id_y < P_1D) {
182     for (CeedInt i = 0; i < Q_1D; i++) {
183       *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
184     }
185   }
186   work_group_barrier(CLK_LOCAL_MEM_FENCE);
187 }
188 
189 //------------------------------------------------------------------------------
190 // 2D transpose tensor contract and add x
191 //------------------------------------------------------------------------------
192 inline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
193                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
194   const CeedInt item_id_x = get_local_id(0);
195   const CeedInt item_id_y = get_local_id(1);
196 
197   scratch[item_id_x + item_id_y * T_1D] = *U;
198   work_group_barrier(CLK_LOCAL_MEM_FENCE);
199 
200   if (item_id_x < P_1D && item_id_y < P_1D) {
201     for (CeedInt i = 0; i < Q_1D; i++) {
202       *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
203     }
204   }
205   work_group_barrier(CLK_LOCAL_MEM_FENCE);
206 }
207 
208 //------------------------------------------------------------------------------
209 // 2D interpolate to quadrature points
210 //------------------------------------------------------------------------------
211 inline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
212                            local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
213   CeedScalar r_t[1];
214 
215   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
216     ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
217     ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
218   }
219 }
220 
221 //------------------------------------------------------------------------------
222 // 2D interpolate transpose
223 //------------------------------------------------------------------------------
224 inline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
225                                     local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
226   CeedScalar r_t[1];
227 
228   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
229     ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
230     ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
231   }
232 }
233 
234 //------------------------------------------------------------------------------
235 // 2D derivatives at quadrature points
236 //------------------------------------------------------------------------------
237 inline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
238                          local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
239                          local CeedScalar *restrict scratch) {
240   CeedScalar r_t[1];
241 
242   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
243     ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch);
244     ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch);
245     ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
246     ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch);
247   }
248 }
249 
250 //------------------------------------------------------------------------------
251 // 2D derivatives transpose
252 //------------------------------------------------------------------------------
253 inline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
254                                   local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
255                                   local CeedScalar *restrict scratch) {
256   CeedScalar r_t[1];
257 
258   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
259     ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch);
260     ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch);
261     ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch);
262     ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
263   }
264 }
265 
266 //------------------------------------------------------------------------------
267 // 2D quadrature weights
268 //------------------------------------------------------------------------------
269 inline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
270   const CeedInt item_id_x = get_local_id(0);
271   const CeedInt item_id_y = get_local_id(1);
272 
273   *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0;
274 }
275 
276 //------------------------------------------------------------------------------
277 // 3D
278 //------------------------------------------------------------------------------
279 
280 //------------------------------------------------------------------------------
281 // 3D tensor contract x
282 //------------------------------------------------------------------------------
283 inline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
284                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
285   const CeedInt item_id_x = get_local_id(0);
286   const CeedInt item_id_y = get_local_id(1);
287 
288   CeedScalar r_B[T_1D];
289   for (CeedInt i = 0; i < P_1D; i++) {
290     r_B[i] = B[i + item_id_x * P_1D];
291   }
292 
293   for (CeedInt k = 0; k < P_1D; k++) {
294     scratch[item_id_x + item_id_y * T_1D] = U[k];
295     work_group_barrier(CLK_LOCAL_MEM_FENCE);
296 
297     V[k] = 0.0;
298     if (item_id_x < Q_1D && item_id_y < P_1D) {
299       for (CeedInt i = 0; i < P_1D; i++) {
300         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
301       }
302     }
303     work_group_barrier(CLK_LOCAL_MEM_FENCE);
304   }
305 }
306 
307 //------------------------------------------------------------------------------
308 // 3D tensor contract y
309 //------------------------------------------------------------------------------
310 inline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
311                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
312   const CeedInt item_id_x = get_local_id(0);
313   const CeedInt item_id_y = get_local_id(1);
314 
315   CeedScalar r_B[T_1D];
316   for (CeedInt i = 0; i < P_1D; i++) {
317     r_B[i] = B[i + item_id_y * P_1D];
318   }
319 
320   for (CeedInt k = 0; k < P_1D; k++) {
321     scratch[item_id_x + item_id_y * T_1D] = U[k];
322     work_group_barrier(CLK_LOCAL_MEM_FENCE);
323 
324     V[k] = 0.0;
325     if (item_id_x < Q_1D && item_id_y < Q_1D) {
326       for (CeedInt i = 0; i < P_1D; i++) {
327         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
328       }
329     }
330     work_group_barrier(CLK_LOCAL_MEM_FENCE);
331   }
332 }
333 
334 //------------------------------------------------------------------------------
335 // 3D tensor contract z
336 //------------------------------------------------------------------------------
337 inline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
338                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
339   const CeedInt item_id_x = get_local_id(0);
340   const CeedInt item_id_y = get_local_id(1);
341 
342   for (CeedInt k = 0; k < Q_1D; k++) {
343     V[k] = 0.0;
344     if (item_id_x < Q_1D && item_id_y < Q_1D) {
345       for (CeedInt i = 0; i < P_1D; i++) {
346         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
347       }
348     }
349   }
350 }
351 
352 //------------------------------------------------------------------------------
353 // 3D transpose tensor contract z
354 //------------------------------------------------------------------------------
355 inline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
356                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
357   const CeedInt item_id_x = get_local_id(0);
358   const CeedInt item_id_y = get_local_id(1);
359 
360   for (CeedInt k = 0; k < P_1D; k++) {
361     V[k] = 0.0;
362     if (item_id_x < Q_1D && item_id_y < Q_1D) {
363       for (CeedInt i = 0; i < Q_1D; i++) {
364         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
365       }
366     }
367   }
368 }
369 
370 //------------------------------------------------------------------------------
371 // 3D transpose tensor contract y
372 //------------------------------------------------------------------------------
373 inline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
374                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
375   const CeedInt item_id_x = get_local_id(0);
376   const CeedInt item_id_y = get_local_id(1);
377 
378   CeedScalar r_B[T_1D];
379   for (CeedInt i = 0; i < Q_1D; i++) {
380     r_B[i] = B[item_id_y + i * P_1D];
381   }
382 
383   for (CeedInt k = 0; k < P_1D; k++) {
384     scratch[item_id_x + item_id_y * T_1D] = U[k];
385     work_group_barrier(CLK_LOCAL_MEM_FENCE);
386 
387     V[k] = 0.0;
388     if (item_id_x < Q_1D && item_id_y < P_1D) {
389       for (CeedInt i = 0; i < Q_1D; i++) {
390         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
391       }
392     }
393     work_group_barrier(CLK_LOCAL_MEM_FENCE);
394   }
395 }
396 
397 //------------------------------------------------------------------------------
398 // 3D transpose tensor contract y
399 //------------------------------------------------------------------------------
400 inline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
401                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
402   const CeedInt item_id_x = get_local_id(0);
403   const CeedInt item_id_y = get_local_id(1);
404 
405   CeedScalar r_B[T_1D];
406   for (CeedInt i = 0; i < Q_1D; i++) {
407     r_B[i] = B[item_id_y + i * P_1D];
408   }
409 
410   for (CeedInt k = 0; k < P_1D; k++) {
411     scratch[item_id_x + item_id_y * T_1D] = U[k];
412     work_group_barrier(CLK_LOCAL_MEM_FENCE);
413     if (item_id_x < Q_1D && item_id_y < P_1D) {
414       for (CeedInt i = 0; i < Q_1D; i++) {
415         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
416       }
417     }
418     work_group_barrier(CLK_LOCAL_MEM_FENCE);
419   }
420 }
421 
422 //------------------------------------------------------------------------------
423 // 3D transpose tensor contract x
424 //------------------------------------------------------------------------------
425 inline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
426                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
427   const CeedInt item_id_x = get_local_id(0);
428   const CeedInt item_id_y = get_local_id(1);
429 
430   CeedScalar r_B[T_1D];
431   for (CeedInt i = 0; i < Q_1D; i++) {
432     r_B[i] = B[item_id_x + i * P_1D];
433   }
434 
435   for (CeedInt k = 0; k < P_1D; k++) {
436     scratch[item_id_x + item_id_y * T_1D] = U[k];
437     work_group_barrier(CLK_LOCAL_MEM_FENCE);
438     V[k] = 0.0;
439     if (item_id_x < P_1D && item_id_y < P_1D) {
440       for (CeedInt i = 0; i < Q_1D; i++) {
441         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
442       }
443     }
444     work_group_barrier(CLK_LOCAL_MEM_FENCE);
445   }
446 }
447 
448 //------------------------------------------------------------------------------
449 // 3D transpose tensor contract add x
450 //------------------------------------------------------------------------------
451 inline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
452                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
453   const CeedInt item_id_x = get_local_id(0);
454   const CeedInt item_id_y = get_local_id(1);
455 
456   CeedScalar r_B[T_1D];
457   for (CeedInt i = 0; i < Q_1D; i++) {
458     r_B[i] = B[item_id_x + i * P_1D];
459   }
460 
461   for (CeedInt k = 0; k < P_1D; k++) {
462     scratch[item_id_x + item_id_y * T_1D] = U[k];
463     work_group_barrier(CLK_LOCAL_MEM_FENCE);
464 
465     if (item_id_x < P_1D && item_id_y < P_1D) {
466       for (CeedInt i = 0; i < Q_1D; i++) {
467         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
468       }
469     }
470     work_group_barrier(CLK_LOCAL_MEM_FENCE);
471   }
472 }
473 
474 //------------------------------------------------------------------------------
475 // 3D interpolate to quadrature points
476 //------------------------------------------------------------------------------
477 inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
478                            local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
479   CeedScalar r_t1[T_1D];
480   CeedScalar r_t2[T_1D];
481 
482   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
483     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
484     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
485     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D, scratch);
486   }
487 }
488 
489 //------------------------------------------------------------------------------
490 // 3D interpolate transpose
491 //------------------------------------------------------------------------------
492 inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
493                                     local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
494   CeedScalar r_t1[T_1D];
495   CeedScalar r_t2[T_1D];
496 
497   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
498     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D, s_B, r_t1, scratch);
499     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
500     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
501   }
502 }
503 
504 //------------------------------------------------------------------------------
505 // 3D derivatives at quadrature points
506 //------------------------------------------------------------------------------
507 inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
508                          local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
509                          local CeedScalar *restrict scratch) {
510   CeedScalar r_t1[T_1D];
511   CeedScalar r_t2[T_1D];
512 
513   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
514     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_G, r_t1, scratch);
515     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
516     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch);
517     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
518     ContractY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch);
519     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch);
520     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
521     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
522     ContractZ3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch);
523   }
524 }
525 
526 //------------------------------------------------------------------------------
527 // 3D derivatives transpose
528 //------------------------------------------------------------------------------
529 inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
530                                   local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
531                                   local CeedScalar *restrict scratch) {
532   CeedScalar r_t1[T_1D];
533   CeedScalar r_t2[T_1D];
534 
535   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
536     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_B, r_t1, scratch);
537     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
538     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * P_1D, scratch);
539     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_B, r_t1, scratch);
540     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch);
541     ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
542     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t1, scratch);
543     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
544     ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
545   }
546 }
547 
548 //------------------------------------------------------------------------------
549 // 3D derivatives at quadrature points
550 //------------------------------------------------------------------------------
551 inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
552                                    local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
553                                    local CeedScalar *restrict scratch) {
554   CeedScalar r_t1[T_1D];
555   CeedScalar r_t2[T_1D];
556 
557   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
558     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
559     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
560     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch);
561     ContractX3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch);
562     ContractY3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch);
563     ContractZ3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch);
564   }
565 }
566 
567 //------------------------------------------------------------------------------
568 // 3D derivatives transpose
569 //------------------------------------------------------------------------------
570 inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
571                                             local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G,
572                                             private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
573   CeedScalar r_t1[T_1D];
574   CeedScalar r_t2[T_1D];
575 
576   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
577     ContractTransposeZ3d(Q_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
578     ContractTransposeAddY3d(Q_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
579     ContractTransposeAddX3d(Q_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
580     ContractTransposeZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch);
581     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
582     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
583   }
584 }
585 
586 //------------------------------------------------------------------------------
587 // 3D quadrature weights
588 //------------------------------------------------------------------------------
589 // template <int Q_1D>
590 inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
591   const CeedInt item_id_x = get_local_id(0);
592   const CeedInt item_id_y = get_local_id(1);
593 
594   if (item_id_x < Q_1D && item_id_y < Q_1D) {
595     const CeedScalar w_xy = q_weight_1d[item_id_x] * q_weight_1d[item_id_y];
596     for (CeedInt q = 0; q < Q_1D; ++q) w[q] = w_xy * q_weight_1d[q];
597   } else {
598     for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0;
599   }
600 }
601 
602 //------------------------------------------------------------------------------
603 
604 #endif
605