xref: /libCEED/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h (revision d83cf49fece5d7d5441d5b92eb712b904329a4d2)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for SYCL shared memory tensor product basis templates
10 
11 #include <ceed.h>
12 
13 //------------------------------------------------------------------------------
14 // 1D
15 //------------------------------------------------------------------------------
16 
17 //------------------------------------------------------------------------------
18 // 1D tensor contraction x
19 //------------------------------------------------------------------------------
20 inline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
21                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
22   const CeedInt item_id_x = get_local_id(0);
23 
24   scratch[item_id_x] = *U;
25   work_group_barrier(CLK_LOCAL_MEM_FENCE);
26 
27   *V = 0.0;
28   if (item_id_x < Q_1D) {
29     for (CeedInt i = 0; i < P_1D; i++) {
30       *V += B[i + item_id_x * P_1D] * scratch[i];  // Contract x direction
31     }
32   }
33   work_group_barrier(CLK_LOCAL_MEM_FENCE);
34 }
35 
36 //------------------------------------------------------------------------------
37 // 1D transpose tensor contraction x
38 //------------------------------------------------------------------------------
39 inline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
40                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
41   const CeedInt item_id_x = get_local_id(0);
42 
43   scratch[item_id_x] = *U;
44   work_group_barrier(CLK_LOCAL_MEM_FENCE);
45 
46   *V = 0.0;
47   if (item_id_x < P_1D) {
48     for (CeedInt i = 0; i < Q_1D; i++) {
49       *V += B[item_id_x + i * P_1D] * scratch[i];  // Contract x direction
50     }
51   }
52   work_group_barrier(CLK_LOCAL_MEM_FENCE);
53 }
54 
55 //------------------------------------------------------------------------------
56 // 1D interpolate to quadrature points
57 //------------------------------------------------------------------------------
58 inline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
59                      local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
60   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
61     ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
62   }
63 }
64 
65 //------------------------------------------------------------------------------
66 // 1D interpolate transpose
67 //------------------------------------------------------------------------------
68 inline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
69                               local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
70   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
71     ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
72   }
73 }
74 
75 //------------------------------------------------------------------------------
76 // 1D derivatives at quadrature points
77 //------------------------------------------------------------------------------
78 inline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
79                    local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
80   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
81     ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
82   }
83 }
84 
85 //------------------------------------------------------------------------------
86 // 1D derivatives transpose
87 //------------------------------------------------------------------------------
88 inline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
89                             local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
90   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
91     ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
92   }
93 }
94 
95 //------------------------------------------------------------------------------
96 // 1D quadrature weights
97 //------------------------------------------------------------------------------
98 inline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
99   const CeedInt item_id_x = get_local_id(0);
100   *w                      = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0;
101 }
102 
103 //------------------------------------------------------------------------------
104 // 2D
105 //------------------------------------------------------------------------------
106 
107 //------------------------------------------------------------------------------
108 // 2D tensor contraction x
109 //------------------------------------------------------------------------------
110 inline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
111                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
112   const CeedInt item_id_x = get_local_id(0);
113   const CeedInt item_id_y = get_local_id(1);
114 
115   scratch[item_id_x + item_id_y * T_1D] = *U;
116   work_group_barrier(CLK_LOCAL_MEM_FENCE);
117 
118   *V = 0.0;
119   if (item_id_x < Q_1D && item_id_y < P_1D) {
120     for (CeedInt i = 0; i < P_1D; i++) {
121       *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
122     }
123   }
124   work_group_barrier(CLK_LOCAL_MEM_FENCE);
125 }
126 
127 //------------------------------------------------------------------------------
128 // 2D tensor contract y
129 //------------------------------------------------------------------------------
130 inline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
131                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
132   const CeedInt item_id_x = get_local_id(0);
133   const CeedInt item_id_y = get_local_id(1);
134 
135   scratch[item_id_x + item_id_y * T_1D] = *U;
136   work_group_barrier(CLK_LOCAL_MEM_FENCE);
137 
138   *V = 0.0;
139   if (item_id_x < Q_1D && item_id_y < Q_1D) {
140     for (CeedInt i = 0; i < P_1D; i++) {
141       *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction
142     }
143   }
144   work_group_barrier(CLK_LOCAL_MEM_FENCE);
145 }
146 
147 //------------------------------------------------------------------------------
148 // 2D transpose tensor contract y
149 //------------------------------------------------------------------------------
150 inline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
151                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
152   const CeedInt item_id_x = get_local_id(0);
153   const CeedInt item_id_y = get_local_id(1);
154 
155   scratch[item_id_x + item_id_y * T_1D] = *U;
156   work_group_barrier(CLK_LOCAL_MEM_FENCE);
157 
158   *V = 0.0;
159   if (item_id_x < Q_1D && item_id_y < P_1D) {
160     for (CeedInt i = 0; i < Q_1D; i++) {
161       *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction
162     }
163   }
164   work_group_barrier(CLK_LOCAL_MEM_FENCE);
165 }
166 
167 //------------------------------------------------------------------------------
168 // 2D transpose tensor contract x
169 //------------------------------------------------------------------------------
170 inline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
171                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
172   const CeedInt item_id_x = get_local_id(0);
173   const CeedInt item_id_y = get_local_id(1);
174 
175   scratch[item_id_x + item_id_y * T_1D] = *U;
176   work_group_barrier(CLK_LOCAL_MEM_FENCE);
177 
178   *V = 0.0;
179   if (item_id_x < P_1D && item_id_y < P_1D) {
180     for (CeedInt i = 0; i < Q_1D; i++) {
181       *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
182     }
183   }
184   work_group_barrier(CLK_LOCAL_MEM_FENCE);
185 }
186 
187 //------------------------------------------------------------------------------
188 // 2D transpose tensor contract and add x
189 //------------------------------------------------------------------------------
190 inline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
191                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
192   const CeedInt item_id_x = get_local_id(0);
193   const CeedInt item_id_y = get_local_id(1);
194 
195   scratch[item_id_x + item_id_y * T_1D] = *U;
196   work_group_barrier(CLK_LOCAL_MEM_FENCE);
197 
198   if (item_id_x < P_1D && item_id_y < P_1D) {
199     for (CeedInt i = 0; i < Q_1D; i++) {
200       *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
201     }
202   }
203   work_group_barrier(CLK_LOCAL_MEM_FENCE);
204 }
205 
206 //------------------------------------------------------------------------------
207 // 2D interpolate to quadrature points
208 //------------------------------------------------------------------------------
209 inline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
210                            local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
211   CeedScalar r_t[1];
212 
213   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
214     ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
215     ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
216   }
217 }
218 
219 //------------------------------------------------------------------------------
220 // 2D interpolate transpose
221 //------------------------------------------------------------------------------
222 inline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
223                                     local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
224   CeedScalar r_t[1];
225 
226   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
227     ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
228     ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
229   }
230 }
231 
232 //------------------------------------------------------------------------------
233 // 2D derivatives at quadrature points
234 //------------------------------------------------------------------------------
235 inline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
236                          local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
237                          local CeedScalar *restrict scratch) {
238   CeedScalar r_t[1];
239 
240   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
241     ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch);
242     ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch);
243     ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
244     ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch);
245   }
246 }
247 
248 //------------------------------------------------------------------------------
249 // 2D derivatives transpose
250 //------------------------------------------------------------------------------
251 inline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
252                                   local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
253                                   local CeedScalar *restrict scratch) {
254   CeedScalar r_t[1];
255 
256   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
257     ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch);
258     ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch);
259     ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch);
260     ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
261   }
262 }
263 
264 //------------------------------------------------------------------------------
265 // 2D quadrature weights
266 //------------------------------------------------------------------------------
267 inline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
268   const CeedInt item_id_x = get_local_id(0);
269   const CeedInt item_id_y = get_local_id(1);
270 
271   *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0;
272 }
273 
274 //------------------------------------------------------------------------------
275 // 3D
276 //------------------------------------------------------------------------------
277 
278 //------------------------------------------------------------------------------
279 // 3D tensor contract x
280 //------------------------------------------------------------------------------
281 inline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
282                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
283   const CeedInt item_id_x = get_local_id(0);
284   const CeedInt item_id_y = get_local_id(1);
285 
286   CeedScalar r_B[T_1D];
287   for (CeedInt i = 0; i < P_1D; i++) {
288     r_B[i] = B[i + item_id_x * P_1D];
289   }
290 
291   for (CeedInt k = 0; k < P_1D; k++) {
292     scratch[item_id_x + item_id_y * T_1D] = U[k];
293     work_group_barrier(CLK_LOCAL_MEM_FENCE);
294 
295     V[k] = 0.0;
296     if (item_id_x < Q_1D && item_id_y < P_1D) {
297       for (CeedInt i = 0; i < P_1D; i++) {
298         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
299       }
300     }
301     work_group_barrier(CLK_LOCAL_MEM_FENCE);
302   }
303 }
304 
305 //------------------------------------------------------------------------------
306 // 3D tensor contract y
307 //------------------------------------------------------------------------------
308 inline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
309                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
310   const CeedInt item_id_x = get_local_id(0);
311   const CeedInt item_id_y = get_local_id(1);
312 
313   CeedScalar r_B[T_1D];
314   for (CeedInt i = 0; i < P_1D; i++) {
315     r_B[i] = B[i + item_id_y * P_1D];
316   }
317 
318   for (CeedInt k = 0; k < P_1D; k++) {
319     scratch[item_id_x + item_id_y * T_1D] = U[k];
320     work_group_barrier(CLK_LOCAL_MEM_FENCE);
321 
322     V[k] = 0.0;
323     if (item_id_x < Q_1D && item_id_y < Q_1D) {
324       for (CeedInt i = 0; i < P_1D; i++) {
325         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
326       }
327     }
328     work_group_barrier(CLK_LOCAL_MEM_FENCE);
329   }
330 }
331 
332 //------------------------------------------------------------------------------
333 // 3D tensor contract z
334 //------------------------------------------------------------------------------
335 inline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
336                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
337   const CeedInt item_id_x = get_local_id(0);
338   const CeedInt item_id_y = get_local_id(1);
339 
340   for (CeedInt k = 0; k < Q_1D; k++) {
341     V[k] = 0.0;
342     if (item_id_x < Q_1D && item_id_y < Q_1D) {
343       for (CeedInt i = 0; i < P_1D; i++) {
344         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
345       }
346     }
347   }
348 }
349 
350 //------------------------------------------------------------------------------
351 // 3D transpose tensor contract z
352 //------------------------------------------------------------------------------
353 inline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
354                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
355   const CeedInt item_id_x = get_local_id(0);
356   const CeedInt item_id_y = get_local_id(1);
357 
358   for (CeedInt k = 0; k < P_1D; k++) {
359     V[k] = 0.0;
360     if (item_id_x < Q_1D && item_id_y < Q_1D) {
361       for (CeedInt i = 0; i < Q_1D; i++) {
362         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
363       }
364     }
365   }
366 }
367 
368 //------------------------------------------------------------------------------
369 // 3D transpose tensor contract y
370 //------------------------------------------------------------------------------
371 inline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
372                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
373   const CeedInt item_id_x = get_local_id(0);
374   const CeedInt item_id_y = get_local_id(1);
375 
376   CeedScalar r_B[T_1D];
377   for (CeedInt i = 0; i < Q_1D; i++) {
378     r_B[i] = B[item_id_y + i * P_1D];
379   }
380 
381   for (CeedInt k = 0; k < P_1D; k++) {
382     scratch[item_id_x + item_id_y * T_1D] = U[k];
383     work_group_barrier(CLK_LOCAL_MEM_FENCE);
384 
385     V[k] = 0.0;
386     if (item_id_x < Q_1D && item_id_y < P_1D) {
387       for (CeedInt i = 0; i < Q_1D; i++) {
388         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
389       }
390     }
391     work_group_barrier(CLK_LOCAL_MEM_FENCE);
392   }
393 }
394 
395 //------------------------------------------------------------------------------
396 // 3D transpose tensor contract y
397 //------------------------------------------------------------------------------
398 inline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
399                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
400   const CeedInt item_id_x = get_local_id(0);
401   const CeedInt item_id_y = get_local_id(1);
402 
403   CeedScalar r_B[T_1D];
404   for (CeedInt i = 0; i < Q_1D; i++) {
405     r_B[i] = B[item_id_y + i * P_1D];
406   }
407 
408   for (CeedInt k = 0; k < P_1D; k++) {
409     scratch[item_id_x + item_id_y * T_1D] = U[k];
410     work_group_barrier(CLK_LOCAL_MEM_FENCE);
411     if (item_id_x < Q_1D && item_id_y < P_1D) {
412       for (CeedInt i = 0; i < Q_1D; i++) {
413         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
414       }
415     }
416     work_group_barrier(CLK_LOCAL_MEM_FENCE);
417   }
418 }
419 
420 //------------------------------------------------------------------------------
421 // 3D transpose tensor contract x
422 //------------------------------------------------------------------------------
423 inline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
424                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
425   const CeedInt item_id_x = get_local_id(0);
426   const CeedInt item_id_y = get_local_id(1);
427 
428   CeedScalar r_B[T_1D];
429   for (CeedInt i = 0; i < Q_1D; i++) {
430     r_B[i] = B[item_id_x + i * P_1D];
431   }
432 
433   for (CeedInt k = 0; k < P_1D; k++) {
434     scratch[item_id_x + item_id_y * T_1D] = U[k];
435     work_group_barrier(CLK_LOCAL_MEM_FENCE);
436     V[k] = 0.0;
437     if (item_id_x < P_1D && item_id_y < P_1D) {
438       for (CeedInt i = 0; i < Q_1D; i++) {
439         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
440       }
441     }
442     work_group_barrier(CLK_LOCAL_MEM_FENCE);
443   }
444 }
445 
446 //------------------------------------------------------------------------------
447 // 3D transpose tensor contract add x
448 //------------------------------------------------------------------------------
449 inline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
450                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
451   const CeedInt item_id_x = get_local_id(0);
452   const CeedInt item_id_y = get_local_id(1);
453 
454   CeedScalar r_B[T_1D];
455   for (CeedInt i = 0; i < Q_1D; i++) {
456     r_B[i] = B[item_id_x + i * P_1D];
457   }
458 
459   for (CeedInt k = 0; k < P_1D; k++) {
460     scratch[item_id_x + item_id_y * T_1D] = U[k];
461     work_group_barrier(CLK_LOCAL_MEM_FENCE);
462 
463     if (item_id_x < P_1D && item_id_y < P_1D) {
464       for (CeedInt i = 0; i < Q_1D; i++) {
465         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
466       }
467     }
468     work_group_barrier(CLK_LOCAL_MEM_FENCE);
469   }
470 }
471 
472 //------------------------------------------------------------------------------
473 // 3D interpolate to quadrature points
474 //------------------------------------------------------------------------------
475 inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
476                            local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
477   CeedScalar r_t1[T_1D];
478   CeedScalar r_t2[T_1D];
479 
480   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
481     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
482     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
483     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D, scratch);
484   }
485 }
486 
487 //------------------------------------------------------------------------------
488 // 3D interpolate transpose
489 //------------------------------------------------------------------------------
490 inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
491                                     local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
492   CeedScalar r_t1[T_1D];
493   CeedScalar r_t2[T_1D];
494 
495   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
496     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D, s_B, r_t1, scratch);
497     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
498     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
499   }
500 }
501 
502 //------------------------------------------------------------------------------
503 // 3D derivatives at quadrature points
504 //------------------------------------------------------------------------------
505 inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
506                          local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
507                          local CeedScalar *restrict scratch) {
508   CeedScalar r_t1[T_1D];
509   CeedScalar r_t2[T_1D];
510 
511   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
512     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_G, r_t1, scratch);
513     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
514     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch);
515     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
516     ContractY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch);
517     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch);
518     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
519     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
520     ContractZ3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch);
521   }
522 }
523 
524 //------------------------------------------------------------------------------
525 // 3D derivatives transpose
526 //------------------------------------------------------------------------------
527 inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
528                                   local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
529                                   local CeedScalar *restrict scratch) {
530   CeedScalar r_t1[T_1D];
531   CeedScalar r_t2[T_1D];
532 
533   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
534     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_B, r_t1, scratch);
535     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
536     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * P_1D, scratch);
537     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_B, r_t1, scratch);
538     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch);
539     ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
540     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t1, scratch);
541     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
542     ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
543   }
544 }
545 
546 //------------------------------------------------------------------------------
547 // 3D derivatives at quadrature points
548 //------------------------------------------------------------------------------
549 inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
550                                    local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
551                                    local CeedScalar *restrict scratch) {
552   CeedScalar r_t1[T_1D];
553   CeedScalar r_t2[T_1D];
554 
555   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
556     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
557     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
558     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch);
559     ContractX3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch);
560     ContractY3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch);
561     ContractZ3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch);
562   }
563 }
564 
565 //------------------------------------------------------------------------------
566 // 3D derivatives transpose
567 //------------------------------------------------------------------------------
568 inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
569                                             local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G,
570                                             private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
571   CeedScalar r_t1[T_1D];
572   CeedScalar r_t2[T_1D];
573 
574   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
575     ContractTransposeZ3d(Q_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
576     ContractTransposeAddY3d(Q_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
577     ContractTransposeAddX3d(Q_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
578     ContractTransposeZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch);
579     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
580     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
581   }
582 }
583 
584 //------------------------------------------------------------------------------
585 // 3D quadrature weights
586 //------------------------------------------------------------------------------
587 // template <int Q_1D>
588 inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
589   const CeedInt item_id_x = get_local_id(0);
590   const CeedInt item_id_y = get_local_id(1);
591 
592   if (item_id_x < Q_1D && item_id_y < Q_1D) {
593     const CeedScalar w_xy = q_weight_1d[item_id_x] * q_weight_1d[item_id_y];
594     for (CeedInt q = 0; q < Q_1D; ++q) w[q] = w_xy * q_weight_1d[q];
595   } else {
596     for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0;
597   }
598 }
599