xref: /libCEED/rust/libceed-sys/c-src/include/ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h (revision d4cc18453651bd0f94c1a2e078b2646a92dafdcc)
1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2bd882c8aSJames Wright // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3bd882c8aSJames Wright //
4bd882c8aSJames Wright // SPDX-License-Identifier: BSD-2-Clause
5bd882c8aSJames Wright //
6bd882c8aSJames Wright // This file is part of CEED:  http://github.com/ceed
7bd882c8aSJames Wright 
8bd882c8aSJames Wright /// @file
9bd882c8aSJames Wright /// Internal header for SYCL shared memory tensor product basis templates
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
11bd882c8aSJames Wright 
12bd882c8aSJames Wright //------------------------------------------------------------------------------
13bd882c8aSJames Wright // 1D
14bd882c8aSJames Wright //------------------------------------------------------------------------------
15bd882c8aSJames Wright 
16bd882c8aSJames Wright //------------------------------------------------------------------------------
17bd882c8aSJames Wright // 1D tensor contraction x
18bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractX1d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)19bd882c8aSJames Wright inline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
20bd882c8aSJames Wright                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
21bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
22bd882c8aSJames Wright 
23bd882c8aSJames Wright   scratch[item_id_x] = *U;
24bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
25bd882c8aSJames Wright 
26bd882c8aSJames Wright   *V = 0.0;
27bd882c8aSJames Wright   if (item_id_x < Q_1D) {
28bd882c8aSJames Wright     for (CeedInt i = 0; i < P_1D; i++) {
29bd882c8aSJames Wright       *V += B[i + item_id_x * P_1D] * scratch[i];  // Contract x direction
30bd882c8aSJames Wright     }
31bd882c8aSJames Wright   }
32bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
33bd882c8aSJames Wright }
34bd882c8aSJames Wright 
35bd882c8aSJames Wright //------------------------------------------------------------------------------
36bd882c8aSJames Wright // 1D transpose tensor contraction x
37bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractTransposeX1d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)38bd882c8aSJames Wright inline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
39bd882c8aSJames Wright                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
40bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
41bd882c8aSJames Wright 
42bd882c8aSJames Wright   scratch[item_id_x] = *U;
43bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
44bd882c8aSJames Wright 
45bd882c8aSJames Wright   *V = 0.0;
46bd882c8aSJames Wright   if (item_id_x < P_1D) {
47bd882c8aSJames Wright     for (CeedInt i = 0; i < Q_1D; i++) {
48bd882c8aSJames Wright       *V += B[item_id_x + i * P_1D] * scratch[i];  // Contract x direction
49bd882c8aSJames Wright     }
50bd882c8aSJames Wright   }
51bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
52bd882c8aSJames Wright }
53bd882c8aSJames Wright 
54bd882c8aSJames Wright //------------------------------------------------------------------------------
55bd882c8aSJames Wright // 1D interpolate to quadrature points
56bd882c8aSJames Wright //------------------------------------------------------------------------------
Interp1d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)57bd882c8aSJames Wright inline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
58bd882c8aSJames Wright                      local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
59bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
60bd882c8aSJames Wright     ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
61bd882c8aSJames Wright   }
62bd882c8aSJames Wright }
63bd882c8aSJames Wright 
64bd882c8aSJames Wright //------------------------------------------------------------------------------
65bd882c8aSJames Wright // 1D interpolate transpose
66bd882c8aSJames Wright //------------------------------------------------------------------------------
InterpTranspose1d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)67bd882c8aSJames Wright inline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
68bd882c8aSJames Wright                               local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
69bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
70bd882c8aSJames Wright     ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
71bd882c8aSJames Wright   }
72bd882c8aSJames Wright }
73bd882c8aSJames Wright 
74bd882c8aSJames Wright //------------------------------------------------------------------------------
75bd882c8aSJames Wright // 1D derivatives at quadrature points
76bd882c8aSJames Wright //------------------------------------------------------------------------------
Grad1d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)77bd882c8aSJames Wright inline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
78bd882c8aSJames Wright                    local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
79bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
80bd882c8aSJames Wright     ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
81bd882c8aSJames Wright   }
82bd882c8aSJames Wright }
83bd882c8aSJames Wright 
84bd882c8aSJames Wright //------------------------------------------------------------------------------
85bd882c8aSJames Wright // 1D derivatives transpose
86bd882c8aSJames Wright //------------------------------------------------------------------------------
GradTranspose1d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)87bd882c8aSJames Wright inline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
88bd882c8aSJames Wright                             local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
89bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
90bd882c8aSJames Wright     ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
91bd882c8aSJames Wright   }
92bd882c8aSJames Wright }
93bd882c8aSJames Wright 
94bd882c8aSJames Wright //------------------------------------------------------------------------------
95bd882c8aSJames Wright // 1D quadrature weights
96bd882c8aSJames Wright //------------------------------------------------------------------------------
Weight1d(const CeedInt Q_1D,const CeedScalar * restrict q_weight_1d,CeedScalar * restrict w)97bd882c8aSJames Wright inline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
98bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
99bd882c8aSJames Wright   *w                      = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0;
100bd882c8aSJames Wright }
101bd882c8aSJames Wright 
102bd882c8aSJames Wright //------------------------------------------------------------------------------
103bd882c8aSJames Wright // 2D
104bd882c8aSJames Wright //------------------------------------------------------------------------------
105bd882c8aSJames Wright 
106bd882c8aSJames Wright //------------------------------------------------------------------------------
107bd882c8aSJames Wright // 2D tensor contraction x
108bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractX2d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)109bd882c8aSJames Wright inline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
110bd882c8aSJames Wright                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
111bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
112bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
113bd882c8aSJames Wright 
114bd882c8aSJames Wright   scratch[item_id_x + item_id_y * T_1D] = *U;
115bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
116bd882c8aSJames Wright 
117bd882c8aSJames Wright   *V = 0.0;
118bd882c8aSJames Wright   if (item_id_x < Q_1D && item_id_y < P_1D) {
119bd882c8aSJames Wright     for (CeedInt i = 0; i < P_1D; i++) {
120bd882c8aSJames Wright       *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
121bd882c8aSJames Wright     }
122bd882c8aSJames Wright   }
123bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
124bd882c8aSJames Wright }
125bd882c8aSJames Wright 
126bd882c8aSJames Wright //------------------------------------------------------------------------------
127bd882c8aSJames Wright // 2D tensor contract y
128bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractY2d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)129bd882c8aSJames Wright inline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
130bd882c8aSJames Wright                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
131bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
132bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
133bd882c8aSJames Wright 
134bd882c8aSJames Wright   scratch[item_id_x + item_id_y * T_1D] = *U;
135bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
136bd882c8aSJames Wright 
137bd882c8aSJames Wright   *V = 0.0;
138bd882c8aSJames Wright   if (item_id_x < Q_1D && item_id_y < Q_1D) {
139bd882c8aSJames Wright     for (CeedInt i = 0; i < P_1D; i++) {
140bd882c8aSJames Wright       *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction
141bd882c8aSJames Wright     }
142bd882c8aSJames Wright   }
143bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
144bd882c8aSJames Wright }
145bd882c8aSJames Wright 
146bd882c8aSJames Wright //------------------------------------------------------------------------------
147bd882c8aSJames Wright // 2D transpose tensor contract y
148bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractTransposeY2d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)149bd882c8aSJames Wright inline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
150bd882c8aSJames Wright                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
151bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
152bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
153bd882c8aSJames Wright 
154bd882c8aSJames Wright   scratch[item_id_x + item_id_y * T_1D] = *U;
155bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
156bd882c8aSJames Wright 
157bd882c8aSJames Wright   *V = 0.0;
158bd882c8aSJames Wright   if (item_id_x < Q_1D && item_id_y < P_1D) {
159bd882c8aSJames Wright     for (CeedInt i = 0; i < Q_1D; i++) {
160bd882c8aSJames Wright       *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction
161bd882c8aSJames Wright     }
162bd882c8aSJames Wright   }
163bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
164bd882c8aSJames Wright }
165bd882c8aSJames Wright 
166bd882c8aSJames Wright //------------------------------------------------------------------------------
167bd882c8aSJames Wright // 2D transpose tensor contract x
168bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractTransposeX2d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)169bd882c8aSJames Wright inline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
170bd882c8aSJames Wright                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
171bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
172bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
173bd882c8aSJames Wright 
174bd882c8aSJames Wright   scratch[item_id_x + item_id_y * T_1D] = *U;
175bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
176bd882c8aSJames Wright 
177bd882c8aSJames Wright   *V = 0.0;
178bd882c8aSJames Wright   if (item_id_x < P_1D && item_id_y < P_1D) {
179bd882c8aSJames Wright     for (CeedInt i = 0; i < Q_1D; i++) {
180bd882c8aSJames Wright       *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
181bd882c8aSJames Wright     }
182bd882c8aSJames Wright   }
183bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
184bd882c8aSJames Wright }
185bd882c8aSJames Wright 
186bd882c8aSJames Wright //------------------------------------------------------------------------------
187bd882c8aSJames Wright // 2D transpose tensor contract and add x
188bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractTransposeAddX2d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)189bd882c8aSJames Wright inline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
190bd882c8aSJames Wright                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
191bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
192bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
193bd882c8aSJames Wright 
194bd882c8aSJames Wright   scratch[item_id_x + item_id_y * T_1D] = *U;
195bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
196bd882c8aSJames Wright 
197bd882c8aSJames Wright   if (item_id_x < P_1D && item_id_y < P_1D) {
198bd882c8aSJames Wright     for (CeedInt i = 0; i < Q_1D; i++) {
199bd882c8aSJames Wright       *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction
200bd882c8aSJames Wright     }
201bd882c8aSJames Wright   }
202bd882c8aSJames Wright   work_group_barrier(CLK_LOCAL_MEM_FENCE);
203bd882c8aSJames Wright }
204bd882c8aSJames Wright 
205bd882c8aSJames Wright //------------------------------------------------------------------------------
206bd882c8aSJames Wright // 2D interpolate to quadrature points
207bd882c8aSJames Wright //------------------------------------------------------------------------------
InterpTensor2d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)208bd882c8aSJames Wright inline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
209bd882c8aSJames Wright                            local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
210bd882c8aSJames Wright   CeedScalar r_t[1];
211bd882c8aSJames Wright 
212bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
213bd882c8aSJames Wright     ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
214bd882c8aSJames Wright     ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
215bd882c8aSJames Wright   }
216bd882c8aSJames Wright }
217bd882c8aSJames Wright 
218bd882c8aSJames Wright //------------------------------------------------------------------------------
219bd882c8aSJames Wright // 2D interpolate transpose
220bd882c8aSJames Wright //------------------------------------------------------------------------------
InterpTransposeTensor2d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)221bd882c8aSJames Wright inline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
222bd882c8aSJames Wright                                     local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
223bd882c8aSJames Wright   CeedScalar r_t[1];
224bd882c8aSJames Wright 
225bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
226bd882c8aSJames Wright     ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
227bd882c8aSJames Wright     ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
228bd882c8aSJames Wright   }
229bd882c8aSJames Wright }
230bd882c8aSJames Wright 
231bd882c8aSJames Wright //------------------------------------------------------------------------------
232bd882c8aSJames Wright // 2D derivatives at quadrature points
233bd882c8aSJames Wright //------------------------------------------------------------------------------
GradTensor2d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)234bd882c8aSJames Wright inline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
235bd882c8aSJames Wright                          local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
236bd882c8aSJames Wright                          local CeedScalar *restrict scratch) {
237bd882c8aSJames Wright   CeedScalar r_t[1];
238bd882c8aSJames Wright 
239bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
240bd882c8aSJames Wright     ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch);
241bd882c8aSJames Wright     ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch);
242bd882c8aSJames Wright     ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
243bd882c8aSJames Wright     ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch);
244bd882c8aSJames Wright   }
245bd882c8aSJames Wright }
246bd882c8aSJames Wright 
247bd882c8aSJames Wright //------------------------------------------------------------------------------
248bd882c8aSJames Wright // 2D derivatives transpose
249bd882c8aSJames Wright //------------------------------------------------------------------------------
GradTransposeTensor2d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)250bd882c8aSJames Wright inline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
251bd882c8aSJames Wright                                   local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
252bd882c8aSJames Wright                                   local CeedScalar *restrict scratch) {
253bd882c8aSJames Wright   CeedScalar r_t[1];
254bd882c8aSJames Wright 
255bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
256bd882c8aSJames Wright     ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch);
257bd882c8aSJames Wright     ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch);
258bd882c8aSJames Wright     ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch);
259bd882c8aSJames Wright     ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
260bd882c8aSJames Wright   }
261bd882c8aSJames Wright }
262bd882c8aSJames Wright 
263bd882c8aSJames Wright //------------------------------------------------------------------------------
264bd882c8aSJames Wright // 2D quadrature weights
265bd882c8aSJames Wright //------------------------------------------------------------------------------
WeightTensor2d(const CeedInt Q_1D,const CeedScalar * restrict q_weight_1d,CeedScalar * restrict w)266bd882c8aSJames Wright inline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
267bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
268bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
269bd882c8aSJames Wright 
270bd882c8aSJames Wright   *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0;
271bd882c8aSJames Wright }
272bd882c8aSJames Wright 
273bd882c8aSJames Wright //------------------------------------------------------------------------------
274bd882c8aSJames Wright // 3D
275bd882c8aSJames Wright //------------------------------------------------------------------------------
276bd882c8aSJames Wright 
277bd882c8aSJames Wright //------------------------------------------------------------------------------
278bd882c8aSJames Wright // 3D tensor contract x
279bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractX3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)280bd882c8aSJames Wright inline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
281bd882c8aSJames Wright                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
282bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
283bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
284bd882c8aSJames Wright 
285bd882c8aSJames Wright   CeedScalar r_B[T_1D];
286bd882c8aSJames Wright   for (CeedInt i = 0; i < P_1D; i++) {
287bd882c8aSJames Wright     r_B[i] = B[i + item_id_x * P_1D];
288bd882c8aSJames Wright   }
289bd882c8aSJames Wright 
290bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
291bd882c8aSJames Wright     scratch[item_id_x + item_id_y * T_1D] = U[k];
292bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
293bd882c8aSJames Wright 
294bd882c8aSJames Wright     V[k] = 0.0;
295bd882c8aSJames Wright     if (item_id_x < Q_1D && item_id_y < P_1D) {
296bd882c8aSJames Wright       for (CeedInt i = 0; i < P_1D; i++) {
297bd882c8aSJames Wright         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
298bd882c8aSJames Wright       }
299bd882c8aSJames Wright     }
300bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
301bd882c8aSJames Wright   }
302bd882c8aSJames Wright }
303bd882c8aSJames Wright 
304bd882c8aSJames Wright //------------------------------------------------------------------------------
305bd882c8aSJames Wright // 3D tensor contract y
306bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractY3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)307bd882c8aSJames Wright inline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
308bd882c8aSJames Wright                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
309bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
310bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
311bd882c8aSJames Wright 
312bd882c8aSJames Wright   CeedScalar r_B[T_1D];
313bd882c8aSJames Wright   for (CeedInt i = 0; i < P_1D; i++) {
314bd882c8aSJames Wright     r_B[i] = B[i + item_id_y * P_1D];
315bd882c8aSJames Wright   }
316bd882c8aSJames Wright 
317bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
318bd882c8aSJames Wright     scratch[item_id_x + item_id_y * T_1D] = U[k];
319bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
320bd882c8aSJames Wright 
321bd882c8aSJames Wright     V[k] = 0.0;
322bd882c8aSJames Wright     if (item_id_x < Q_1D && item_id_y < Q_1D) {
323bd882c8aSJames Wright       for (CeedInt i = 0; i < P_1D; i++) {
324bd882c8aSJames Wright         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
325bd882c8aSJames Wright       }
326bd882c8aSJames Wright     }
327bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
328bd882c8aSJames Wright   }
329bd882c8aSJames Wright }
330bd882c8aSJames Wright 
331bd882c8aSJames Wright //------------------------------------------------------------------------------
332bd882c8aSJames Wright // 3D tensor contract z
333bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractZ3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)334bd882c8aSJames Wright inline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
335bd882c8aSJames Wright                         private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
336bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
337bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
338bd882c8aSJames Wright 
339bd882c8aSJames Wright   for (CeedInt k = 0; k < Q_1D; k++) {
340bd882c8aSJames Wright     V[k] = 0.0;
341bd882c8aSJames Wright     if (item_id_x < Q_1D && item_id_y < Q_1D) {
342bd882c8aSJames Wright       for (CeedInt i = 0; i < P_1D; i++) {
343bd882c8aSJames Wright         V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
344bd882c8aSJames Wright       }
345bd882c8aSJames Wright     }
346bd882c8aSJames Wright   }
347bd882c8aSJames Wright }
348bd882c8aSJames Wright 
349bd882c8aSJames Wright //------------------------------------------------------------------------------
350bd882c8aSJames Wright // 3D transpose tensor contract z
351bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractTransposeZ3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)352bd882c8aSJames Wright inline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
353bd882c8aSJames Wright                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
354bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
355bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
356bd882c8aSJames Wright 
357bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
358bd882c8aSJames Wright     V[k] = 0.0;
359bd882c8aSJames Wright     if (item_id_x < Q_1D && item_id_y < Q_1D) {
360bd882c8aSJames Wright       for (CeedInt i = 0; i < Q_1D; i++) {
361bd882c8aSJames Wright         V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
362bd882c8aSJames Wright       }
363bd882c8aSJames Wright     }
364bd882c8aSJames Wright   }
365bd882c8aSJames Wright }
366bd882c8aSJames Wright 
367bd882c8aSJames Wright //------------------------------------------------------------------------------
368bd882c8aSJames Wright // 3D transpose tensor contract y
369bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractTransposeY3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)370bd882c8aSJames Wright inline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
371bd882c8aSJames Wright                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
372bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
373bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
374bd882c8aSJames Wright 
375bd882c8aSJames Wright   CeedScalar r_B[T_1D];
376bd882c8aSJames Wright   for (CeedInt i = 0; i < Q_1D; i++) {
377bd882c8aSJames Wright     r_B[i] = B[item_id_y + i * P_1D];
378bd882c8aSJames Wright   }
379bd882c8aSJames Wright 
380bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
381bd882c8aSJames Wright     scratch[item_id_x + item_id_y * T_1D] = U[k];
382bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
383bd882c8aSJames Wright 
384bd882c8aSJames Wright     V[k] = 0.0;
385bd882c8aSJames Wright     if (item_id_x < Q_1D && item_id_y < P_1D) {
386bd882c8aSJames Wright       for (CeedInt i = 0; i < Q_1D; i++) {
387bd882c8aSJames Wright         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
388bd882c8aSJames Wright       }
389bd882c8aSJames Wright     }
390bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
391bd882c8aSJames Wright   }
392bd882c8aSJames Wright }
393bd882c8aSJames Wright 
394bd882c8aSJames Wright //------------------------------------------------------------------------------
395bd882c8aSJames Wright // 3D transpose tensor contract y
396bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractTransposeAddY3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)397bd882c8aSJames Wright inline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
398bd882c8aSJames Wright                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
399bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
400bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
401bd882c8aSJames Wright 
402bd882c8aSJames Wright   CeedScalar r_B[T_1D];
403bd882c8aSJames Wright   for (CeedInt i = 0; i < Q_1D; i++) {
404bd882c8aSJames Wright     r_B[i] = B[item_id_y + i * P_1D];
405bd882c8aSJames Wright   }
406bd882c8aSJames Wright 
407bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
408bd882c8aSJames Wright     scratch[item_id_x + item_id_y * T_1D] = U[k];
409bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
410bd882c8aSJames Wright     if (item_id_x < Q_1D && item_id_y < P_1D) {
411bd882c8aSJames Wright       for (CeedInt i = 0; i < Q_1D; i++) {
412bd882c8aSJames Wright         V[k] += r_B[i] * scratch[item_id_x + i * T_1D];  // Contract y direction
413bd882c8aSJames Wright       }
414bd882c8aSJames Wright     }
415bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
416bd882c8aSJames Wright   }
417bd882c8aSJames Wright }
418bd882c8aSJames Wright 
419bd882c8aSJames Wright //------------------------------------------------------------------------------
420bd882c8aSJames Wright // 3D transpose tensor contract x
421bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractTransposeX3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)422bd882c8aSJames Wright inline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
423bd882c8aSJames Wright                                  private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
424bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
425bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
426bd882c8aSJames Wright 
427bd882c8aSJames Wright   CeedScalar r_B[T_1D];
428bd882c8aSJames Wright   for (CeedInt i = 0; i < Q_1D; i++) {
429bd882c8aSJames Wright     r_B[i] = B[item_id_x + i * P_1D];
430bd882c8aSJames Wright   }
431bd882c8aSJames Wright 
432bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
433bd882c8aSJames Wright     scratch[item_id_x + item_id_y * T_1D] = U[k];
434bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
435bd882c8aSJames Wright     V[k] = 0.0;
436bd882c8aSJames Wright     if (item_id_x < P_1D && item_id_y < P_1D) {
437bd882c8aSJames Wright       for (CeedInt i = 0; i < Q_1D; i++) {
438bd882c8aSJames Wright         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
439bd882c8aSJames Wright       }
440bd882c8aSJames Wright     }
441bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
442bd882c8aSJames Wright   }
443bd882c8aSJames Wright }
444bd882c8aSJames Wright 
445bd882c8aSJames Wright //------------------------------------------------------------------------------
446bd882c8aSJames Wright // 3D transpose tensor contract add x
447bd882c8aSJames Wright //------------------------------------------------------------------------------
ContractTransposeAddX3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)448bd882c8aSJames Wright inline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
449bd882c8aSJames Wright                                     private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
450bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
451bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
452bd882c8aSJames Wright 
453bd882c8aSJames Wright   CeedScalar r_B[T_1D];
454bd882c8aSJames Wright   for (CeedInt i = 0; i < Q_1D; i++) {
455bd882c8aSJames Wright     r_B[i] = B[item_id_x + i * P_1D];
456bd882c8aSJames Wright   }
457bd882c8aSJames Wright 
458bd882c8aSJames Wright   for (CeedInt k = 0; k < P_1D; k++) {
459bd882c8aSJames Wright     scratch[item_id_x + item_id_y * T_1D] = U[k];
460bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
461bd882c8aSJames Wright 
462bd882c8aSJames Wright     if (item_id_x < P_1D && item_id_y < P_1D) {
463bd882c8aSJames Wright       for (CeedInt i = 0; i < Q_1D; i++) {
464bd882c8aSJames Wright         V[k] += r_B[i] * scratch[i + item_id_y * T_1D];  // Contract x direction
465bd882c8aSJames Wright       }
466bd882c8aSJames Wright     }
467bd882c8aSJames Wright     work_group_barrier(CLK_LOCAL_MEM_FENCE);
468bd882c8aSJames Wright   }
469bd882c8aSJames Wright }
470bd882c8aSJames Wright 
471bd882c8aSJames Wright //------------------------------------------------------------------------------
472bd882c8aSJames Wright // 3D interpolate to quadrature points
473bd882c8aSJames Wright //------------------------------------------------------------------------------
InterpTensor3d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)474bd882c8aSJames Wright inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
475bd882c8aSJames Wright                            local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
476bd882c8aSJames Wright   CeedScalar r_t1[T_1D];
477bd882c8aSJames Wright   CeedScalar r_t2[T_1D];
478bd882c8aSJames Wright 
479bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
480bd882c8aSJames Wright     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
481bd882c8aSJames Wright     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
482bd882c8aSJames Wright     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D, scratch);
483bd882c8aSJames Wright   }
484bd882c8aSJames Wright }
485bd882c8aSJames Wright 
486bd882c8aSJames Wright //------------------------------------------------------------------------------
487bd882c8aSJames Wright // 3D interpolate transpose
488bd882c8aSJames Wright //------------------------------------------------------------------------------
InterpTransposeTensor3d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)489bd882c8aSJames Wright inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
490bd882c8aSJames Wright                                     local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
491bd882c8aSJames Wright   CeedScalar r_t1[T_1D];
492bd882c8aSJames Wright   CeedScalar r_t2[T_1D];
493bd882c8aSJames Wright 
494bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
495bd882c8aSJames Wright     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D, s_B, r_t1, scratch);
496bd882c8aSJames Wright     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
497bd882c8aSJames Wright     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
498bd882c8aSJames Wright   }
499bd882c8aSJames Wright }
500bd882c8aSJames Wright 
501bd882c8aSJames Wright //------------------------------------------------------------------------------
502bd882c8aSJames Wright // 3D derivatives at quadrature points
503bd882c8aSJames Wright //------------------------------------------------------------------------------
GradTensor3d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)504bd882c8aSJames Wright inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
505bd882c8aSJames Wright                          local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
506bd882c8aSJames Wright                          local CeedScalar *restrict scratch) {
507bd882c8aSJames Wright   CeedScalar r_t1[T_1D];
508bd882c8aSJames Wright   CeedScalar r_t2[T_1D];
509bd882c8aSJames Wright 
510bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
511bd882c8aSJames Wright     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_G, r_t1, scratch);
512bd882c8aSJames Wright     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
513bd882c8aSJames Wright     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch);
514bd882c8aSJames Wright     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
515bd882c8aSJames Wright     ContractY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch);
516bd882c8aSJames Wright     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch);
517bd882c8aSJames Wright     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
518bd882c8aSJames Wright     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
519bd882c8aSJames Wright     ContractZ3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch);
520bd882c8aSJames Wright   }
521bd882c8aSJames Wright }
522bd882c8aSJames Wright 
523bd882c8aSJames Wright //------------------------------------------------------------------------------
524bd882c8aSJames Wright // 3D derivatives transpose
525bd882c8aSJames Wright //------------------------------------------------------------------------------
GradTransposeTensor3d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)526bd882c8aSJames Wright inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
527bd882c8aSJames Wright                                   local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
528bd882c8aSJames Wright                                   local CeedScalar *restrict scratch) {
529bd882c8aSJames Wright   CeedScalar r_t1[T_1D];
530bd882c8aSJames Wright   CeedScalar r_t2[T_1D];
531bd882c8aSJames Wright 
532bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
533bd882c8aSJames Wright     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_B, r_t1, scratch);
534bd882c8aSJames Wright     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
535bd882c8aSJames Wright     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * P_1D, scratch);
536bd882c8aSJames Wright     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_B, r_t1, scratch);
537bd882c8aSJames Wright     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch);
538bd882c8aSJames Wright     ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
539bd882c8aSJames Wright     ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t1, scratch);
540bd882c8aSJames Wright     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
541bd882c8aSJames Wright     ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
542bd882c8aSJames Wright   }
543bd882c8aSJames Wright }
544bd882c8aSJames Wright 
545bd882c8aSJames Wright //------------------------------------------------------------------------------
546bd882c8aSJames Wright // 3D derivatives at quadrature points
547bd882c8aSJames Wright //------------------------------------------------------------------------------
GradTensorCollocated3d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)548bd882c8aSJames Wright inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
549bd882c8aSJames Wright                                    local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
550bd882c8aSJames Wright                                    local CeedScalar *restrict scratch) {
551bd882c8aSJames Wright   CeedScalar r_t1[T_1D];
552bd882c8aSJames Wright   CeedScalar r_t2[T_1D];
553bd882c8aSJames Wright 
554bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
555bd882c8aSJames Wright     ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
556bd882c8aSJames Wright     ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
557bd882c8aSJames Wright     ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch);
558bd882c8aSJames Wright     ContractX3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch);
559bd882c8aSJames Wright     ContractY3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch);
560bd882c8aSJames Wright     ContractZ3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch);
561bd882c8aSJames Wright   }
562bd882c8aSJames Wright }
563bd882c8aSJames Wright 
564bd882c8aSJames Wright //------------------------------------------------------------------------------
565bd882c8aSJames Wright // 3D derivatives transpose
566bd882c8aSJames Wright //------------------------------------------------------------------------------
GradTransposeTensorCollocated3d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)567bd882c8aSJames Wright inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
568bd882c8aSJames Wright                                             local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G,
569bd882c8aSJames Wright                                             private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
570bd882c8aSJames Wright   CeedScalar r_t1[T_1D];
571bd882c8aSJames Wright   CeedScalar r_t2[T_1D];
572bd882c8aSJames Wright 
573bd882c8aSJames Wright   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
574bd882c8aSJames Wright     ContractTransposeZ3d(Q_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
575bd882c8aSJames Wright     ContractTransposeAddY3d(Q_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
576bd882c8aSJames Wright     ContractTransposeAddX3d(Q_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
577bd882c8aSJames Wright     ContractTransposeZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch);
578bd882c8aSJames Wright     ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
579bd882c8aSJames Wright     ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
580bd882c8aSJames Wright   }
581bd882c8aSJames Wright }
582bd882c8aSJames Wright 
583bd882c8aSJames Wright //------------------------------------------------------------------------------
584bd882c8aSJames Wright // 3D quadrature weights
585bd882c8aSJames Wright //------------------------------------------------------------------------------
586bd882c8aSJames Wright // template <int Q_1D>
WeightTensor3d(const CeedInt Q_1D,const CeedScalar * restrict q_weight_1d,CeedScalar * restrict w)587bd882c8aSJames Wright inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
588bd882c8aSJames Wright   const CeedInt item_id_x = get_local_id(0);
589bd882c8aSJames Wright   const CeedInt item_id_y = get_local_id(1);
590bd882c8aSJames Wright 
591bd882c8aSJames Wright   if (item_id_x < Q_1D && item_id_y < Q_1D) {
592bd882c8aSJames Wright     const CeedScalar w_xy = q_weight_1d[item_id_x] * q_weight_1d[item_id_y];
593bd882c8aSJames Wright     for (CeedInt q = 0; q < Q_1D; ++q) w[q] = w_xy * q_weight_1d[q];
594bd882c8aSJames Wright   } else {
595bd882c8aSJames Wright     for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0;
596bd882c8aSJames Wright   }
597bd882c8aSJames Wright }
598