xref: /libCEED/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h (revision ed094490f53e580908aa80e9fe815a6fd76d7526)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for SYCL shared memory tensor product basis templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
19 inline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
20                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
21   const CeedInt item_id_x = get_local_id(0);
22 
23   scratch[item_id_x] = *U;
24   work_group_barrier(CLK_LOCAL_MEM_FENCE);
25 
26   *V = 0.0;
27   if (item_id_x < Q_1D) {
28     for (CeedInt i = 0; i < P_1D; i++) {
29       *V += B[i + item_id_x * P_1D] * scratch[i];  // Contract x direction
30     }
31   }
32   work_group_barrier(CLK_LOCAL_MEM_FENCE);
33 }
34 
35 //------------------------------------------------------------------------------
36 // 1D transpose tensor contraction x
37 //------------------------------------------------------------------------------
38 inline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
39                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
40   const CeedInt item_id_x = get_local_id(0);
41 
42   scratch[item_id_x] = *U;
43   work_group_barrier(CLK_LOCAL_MEM_FENCE);
44 
45   *V = 0.0;
46   if (item_id_x < P_1D) {
47     for (CeedInt i = 0; i < Q_1D; i++) {
48       *V += B[item_id_x + i * P_1D] * scratch[i];  // Contract x direction
49     }
50   }
51   work_group_barrier(CLK_LOCAL_MEM_FENCE);
52 }
53 
54 //------------------------------------------------------------------------------
55 // 1D interpolate to quadrature points
56 //------------------------------------------------------------------------------
57 inline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
58                      local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
59   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
60     ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
61   }
62 }
63 
64 //------------------------------------------------------------------------------
65 // 1D interpolate transpose
66 //------------------------------------------------------------------------------
67 inline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
68                               local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
69   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
70     ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
71   }
72 }
73 
74 //------------------------------------------------------------------------------
75 // 1D derivatives at quadrature points
76 //------------------------------------------------------------------------------
77 inline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
78                    local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
79   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
80     ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
81   }
82 }
83 
84 //------------------------------------------------------------------------------
85 // 1D derivatives transpose
86 //------------------------------------------------------------------------------
87 inline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
88                             local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
89   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
90     ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
91   }
92 }
93 
94 //------------------------------------------------------------------------------
95 // 1D quadrature weights
96 //------------------------------------------------------------------------------
97 inline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
98   const CeedInt item_id_x = get_local_id(0);
99   *w                      = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0;
100 }
101 
102 //------------------------------------------------------------------------------
103 // 2D
104 //------------------------------------------------------------------------------
105 
106 //------------------------------------------------------------------------------
107 // 2D tensor contraction x
108 //------------------------------------------------------------------------------
109 inline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
110                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
111   const CeedInt item_id_x = get_local_id(0);
112   const CeedInt item_id_y = get_local_id(1);
113 
114   scratch[item_id_x + item_id_y * T_1D] = *U;
115   work_group_barrier(CLK_LOCAL_MEM_FENCE);
116 
117   *V = 0.0;
118   if (item_id_x < Q_1D && item_id_y < P_1D) {
119     for (CeedInt i = 0; i < P_1D; i++) {
120       *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
121     }
122   }
123   work_group_barrier(CLK_LOCAL_MEM_FENCE);
124 }
125 
126 //------------------------------------------------------------------------------
127 // 2D tensor contract y
128 //------------------------------------------------------------------------------
129 inline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
130                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
131   const CeedInt item_id_x = get_local_id(0);
132   const CeedInt item_id_y = get_local_id(1);
133 
134   scratch[item_id_x + item_id_y * T_1D] = *U;
135   work_group_barrier(CLK_LOCAL_MEM_FENCE);
136 
137   *V = 0.0;
138   if (item_id_x < Q_1D && item_id_y < Q_1D) {
139     for (CeedInt i = 0; i < P_1D; i++) {
140       *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction
141     }
142   }
143   work_group_barrier(CLK_LOCAL_MEM_FENCE);
144 }
145 
146 //------------------------------------------------------------------------------
147 // 2D transpose tensor contract y
148 //------------------------------------------------------------------------------
149 inline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
150                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
151   const CeedInt item_id_x = get_local_id(0);
152   const CeedInt item_id_y = get_local_id(1);
153 
154   scratch[item_id_x + item_id_y * T_1D] = *U;
155   work_group_barrier(CLK_LOCAL_MEM_FENCE);
156 
157   *V = 0.0;
158   if (item_id_x < Q_1D && item_id_y < P_1D) {
159     for (CeedInt i = 0; i < Q_1D; i++) {
160       *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction
161     }
162   }
163   work_group_barrier(CLK_LOCAL_MEM_FENCE);
164 }
165 
166 //------------------------------------------------------------------------------
167 // 2D transpose tensor contract x
168 //------------------------------------------------------------------------------
169 inline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
170                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
171   const CeedInt item_id_x = get_local_id(0);
172   const CeedInt item_id_y = get_local_id(1);
173 
174   scratch[item_id_x + item_id_y * T_1D] = *U;
175   work_group_barrier(CLK_LOCAL_MEM_FENCE);
176 
177   *V = 0.0;
178   if (item_id_x < P_1D && item_id_y < P_1D) {
179     for (CeedInt i = 0; i < Q_1D; i++) {
180       *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
181     }
182   }
183   work_group_barrier(CLK_LOCAL_MEM_FENCE);
184 }
185 
186 //------------------------------------------------------------------------------
187 // 2D transpose tensor contract and add x
188 //------------------------------------------------------------------------------
189 inline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
190                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
191   const CeedInt item_id_x = get_local_id(0);
192   const CeedInt item_id_y = get_local_id(1);
193 
194   scratch[item_id_x + item_id_y * T_1D] = *U;
195   work_group_barrier(CLK_LOCAL_MEM_FENCE);
196 
197   if (item_id_x < P_1D && item_id_y < P_1D) {
198     for (CeedInt i = 0; i < Q_1D; i++) {
199       *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
200     }
201   }
202   work_group_barrier(CLK_LOCAL_MEM_FENCE);
203 }
204 
205 //------------------------------------------------------------------------------
206 // 2D interpolate to quadrature points
207 //------------------------------------------------------------------------------
208 inline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
209                            local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
210   CeedScalar r_t[1];
211 
212   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
213     ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
214     ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
215   }
216 }
217 
218 //------------------------------------------------------------------------------
219 // 2D interpolate transpose
220 //------------------------------------------------------------------------------
221 inline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
222                                     local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
223   CeedScalar r_t[1];
224 
225   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
226     ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
227     ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
228   }
229 }
230 
231 //------------------------------------------------------------------------------
232 // 2D derivatives at quadrature points
233 //------------------------------------------------------------------------------
234 inline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
235                          local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
236                          local CeedScalar *restrict scratch) {
237   CeedScalar r_t[1];
238 
239   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
240     ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch);
241     ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch);
242     ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
243     ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch);
244   }
245 }
246 
247 //------------------------------------------------------------------------------
248 // 2D derivatives transpose
249 //------------------------------------------------------------------------------
250 inline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
251                                   local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
252                                   local CeedScalar *restrict scratch) {
253   CeedScalar r_t[1];
254 
255   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
256     ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch);
257     ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch);
258     ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch);
259     ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
260   }
261 }
262 
263 //------------------------------------------------------------------------------
264 // 2D quadrature weights
265 //------------------------------------------------------------------------------
266 inline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
267   const CeedInt item_id_x = get_local_id(0);
268   const CeedInt item_id_y = get_local_id(1);
269 
270   *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0;
271 }
272 
273 //------------------------------------------------------------------------------
274 // 3D
275 //------------------------------------------------------------------------------
276 
277 //------------------------------------------------------------------------------
278 // 3D tensor contract x
279 //------------------------------------------------------------------------------
280 inline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
281                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
282   const CeedInt item_id_x = get_local_id(0);
283   const CeedInt item_id_y = get_local_id(1);
284 
285   CeedScalar r_B[T_1D];
286   for (CeedInt i = 0; i < P_1D; i++) {
287     r_B[i] = B[i + item_id_x * P_1D];
288   }
289 
290   for (CeedInt k = 0; k < P_1D; k++) {
291     scratch[item_id_x + item_id_y * T_1D] = U[k];
292     work_group_barrier(CLK_LOCAL_MEM_FENCE);
293 
294     V[k] = 0.0;
295     if (item_id_x < Q_1D && item_id_y < P_1D) {
296       for (CeedInt i = 0; i < P_1D; i++) {
297         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
298       }
299     }
300     work_group_barrier(CLK_LOCAL_MEM_FENCE);
301   }
302 }
303 
304 //------------------------------------------------------------------------------
305 // 3D tensor contract y
306 //------------------------------------------------------------------------------
307 inline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
308                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
309   const CeedInt item_id_x = get_local_id(0);
310   const CeedInt item_id_y = get_local_id(1);
311 
312   CeedScalar r_B[T_1D];
313   for (CeedInt i = 0; i < P_1D; i++) {
314     r_B[i] = B[i + item_id_y * P_1D];
315   }
316 
317   for (CeedInt k = 0; k < P_1D; k++) {
318     scratch[item_id_x + item_id_y * T_1D] = U[k];
319     work_group_barrier(CLK_LOCAL_MEM_FENCE);
320 
321     V[k] = 0.0;
322     if (item_id_x < Q_1D && item_id_y < Q_1D) {
323       for (CeedInt i = 0; i < P_1D; i++) {
324         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
325       }
326     }
327     work_group_barrier(CLK_LOCAL_MEM_FENCE);
328   }
329 }
330 
331 //------------------------------------------------------------------------------
332 // 3D tensor contract z
333 //------------------------------------------------------------------------------
334 inline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
335                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
336   const CeedInt item_id_x = get_local_id(0);
337   const CeedInt item_id_y = get_local_id(1);
338 
339   for (CeedInt k = 0; k < Q_1D; k++) {
340     V[k] = 0.0;
341     if (item_id_x < Q_1D && item_id_y < Q_1D) {
342       for (CeedInt i = 0; i < P_1D; i++) {
343         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
344       }
345     }
346   }
347 }
348 
349 //------------------------------------------------------------------------------
350 // 3D transpose tensor contract z
351 //------------------------------------------------------------------------------
352 inline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
353                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
354   const CeedInt item_id_x = get_local_id(0);
355   const CeedInt item_id_y = get_local_id(1);
356 
357   for (CeedInt k = 0; k < P_1D; k++) {
358     V[k] = 0.0;
359     if (item_id_x < Q_1D && item_id_y < Q_1D) {
360       for (CeedInt i = 0; i < Q_1D; i++) {
361         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
362       }
363     }
364   }
365 }
366 
367 //------------------------------------------------------------------------------
368 // 3D transpose tensor contract y
369 //------------------------------------------------------------------------------
370 inline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
371                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
372   const CeedInt item_id_x = get_local_id(0);
373   const CeedInt item_id_y = get_local_id(1);
374 
375   CeedScalar r_B[T_1D];
376   for (CeedInt i = 0; i < Q_1D; i++) {
377     r_B[i] = B[item_id_y + i * P_1D];
378   }
379 
380   for (CeedInt k = 0; k < P_1D; k++) {
381     scratch[item_id_x + item_id_y * T_1D] = U[k];
382     work_group_barrier(CLK_LOCAL_MEM_FENCE);
383 
384     V[k] = 0.0;
385     if (item_id_x < Q_1D && item_id_y < P_1D) {
386       for (CeedInt i = 0; i < Q_1D; i++) {
387         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
388       }
389     }
390     work_group_barrier(CLK_LOCAL_MEM_FENCE);
391   }
392 }
393 
394 //------------------------------------------------------------------------------
395 // 3D transpose tensor contract y
396 //------------------------------------------------------------------------------
397 inline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
398                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
399   const CeedInt item_id_x = get_local_id(0);
400   const CeedInt item_id_y = get_local_id(1);
401 
402   CeedScalar r_B[T_1D];
403   for (CeedInt i = 0; i < Q_1D; i++) {
404     r_B[i] = B[item_id_y + i * P_1D];
405   }
406 
407   for (CeedInt k = 0; k < P_1D; k++) {
408     scratch[item_id_x + item_id_y * T_1D] = U[k];
409     work_group_barrier(CLK_LOCAL_MEM_FENCE);
410     if (item_id_x < Q_1D && item_id_y < P_1D) {
411       for (CeedInt i = 0; i < Q_1D; i++) {
412         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
413       }
414     }
415     work_group_barrier(CLK_LOCAL_MEM_FENCE);
416   }
417 }
418 
419 //------------------------------------------------------------------------------
420 // 3D transpose tensor contract x
421 //------------------------------------------------------------------------------
422 inline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
423                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
424   const CeedInt item_id_x = get_local_id(0);
425   const CeedInt item_id_y = get_local_id(1);
426 
427   CeedScalar r_B[T_1D];
428   for (CeedInt i = 0; i < Q_1D; i++) {
429     r_B[i] = B[item_id_x + i * P_1D];
430   }
431 
432   for (CeedInt k = 0; k < P_1D; k++) {
433     scratch[item_id_x + item_id_y * T_1D] = U[k];
434     work_group_barrier(CLK_LOCAL_MEM_FENCE);
435     V[k] = 0.0;
436     if (item_id_x < P_1D && item_id_y < P_1D) {
437       for (CeedInt i = 0; i < Q_1D; i++) {
438         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
439       }
440     }
441     work_group_barrier(CLK_LOCAL_MEM_FENCE);
442   }
443 }
444 
445 //------------------------------------------------------------------------------
446 // 3D transpose tensor contract add x
447 //------------------------------------------------------------------------------
448 inline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
449                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
450   const CeedInt item_id_x = get_local_id(0);
451   const CeedInt item_id_y = get_local_id(1);
452 
453   CeedScalar r_B[T_1D];
454   for (CeedInt i = 0; i < Q_1D; i++) {
455     r_B[i] = B[item_id_x + i * P_1D];
456   }
457 
458   for (CeedInt k = 0; k < P_1D; k++) {
459     scratch[item_id_x + item_id_y * T_1D] = U[k];
460     work_group_barrier(CLK_LOCAL_MEM_FENCE);
461 
462     if (item_id_x < P_1D && item_id_y < P_1D) {
463       for (CeedInt i = 0; i < Q_1D; i++) {
464         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
465       }
466     }
467     work_group_barrier(CLK_LOCAL_MEM_FENCE);
468   }
469 }
470 
471 //------------------------------------------------------------------------------
472 // 3D interpolate to quadrature points
473 //------------------------------------------------------------------------------
474 inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
475                            local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
476   CeedScalar r_t1[T_1D];
477   CeedScalar r_t2[T_1D];
478 
479   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
480     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
481     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
482     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D, scratch);
483   }
484 }
485 
486 //------------------------------------------------------------------------------
487 // 3D interpolate transpose
488 //------------------------------------------------------------------------------
489 inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
490                                     local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
491   CeedScalar r_t1[T_1D];
492   CeedScalar r_t2[T_1D];
493 
494   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
495     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D, s_B, r_t1, scratch);
496     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
497     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
498   }
499 }
500 
501 //------------------------------------------------------------------------------
502 // 3D derivatives at quadrature points
503 //------------------------------------------------------------------------------
504 inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
505                          local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
506                          local CeedScalar *restrict scratch) {
507   CeedScalar r_t1[T_1D];
508   CeedScalar r_t2[T_1D];
509 
510   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
511     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_G, r_t1, scratch);
512     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
513     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch);
514     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
515     ContractY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch);
516     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch);
517     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
518     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
519     ContractZ3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch);
520   }
521 }
522 
523 //------------------------------------------------------------------------------
524 // 3D derivatives transpose
525 //------------------------------------------------------------------------------
526 inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
527                                   local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
528                                   local CeedScalar *restrict scratch) {
529   CeedScalar r_t1[T_1D];
530   CeedScalar r_t2[T_1D];
531 
532   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
533     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_B, r_t1, scratch);
534     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
535     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * P_1D, scratch);
536     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_B, r_t1, scratch);
537     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch);
538     ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
539     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t1, scratch);
540     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
541     ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
542   }
543 }
544 
545 //------------------------------------------------------------------------------
546 // 3D derivatives at quadrature points
547 //------------------------------------------------------------------------------
548 inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
549                                    local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
550                                    local CeedScalar *restrict scratch) {
551   CeedScalar r_t1[T_1D];
552   CeedScalar r_t2[T_1D];
553 
554   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
555     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
556     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
557     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch);
558     ContractX3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch);
559     ContractY3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch);
560     ContractZ3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch);
561   }
562 }
563 
564 //------------------------------------------------------------------------------
565 // 3D derivatives transpose
566 //------------------------------------------------------------------------------
567 inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
568                                             local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G,
569                                             private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
570   CeedScalar r_t1[T_1D];
571   CeedScalar r_t2[T_1D];
572 
573   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
574     ContractTransposeZ3d(Q_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
575     ContractTransposeAddY3d(Q_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
576     ContractTransposeAddX3d(Q_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
577     ContractTransposeZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch);
578     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
579     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
580   }
581 }
582 
583 //------------------------------------------------------------------------------
584 // 3D quadrature weights
585 //------------------------------------------------------------------------------
586 // template <int Q_1D>
587 inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
588   const CeedInt item_id_x = get_local_id(0);
589   const CeedInt item_id_y = get_local_id(1);
590 
591   if (item_id_x < Q_1D && item_id_y < Q_1D) {
592     const CeedScalar w_xy = q_weight_1d[item_id_x] * q_weight_1d[item_id_y];
593     for (CeedInt q = 0; q < Q_1D; ++q) w[q] = w_xy * q_weight_1d[q];
594   } else {
595     for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0;
596   }
597 }
598