1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 /// @file
9 /// Internal header for SYCL shared memory tensor product basis templates
10 #include <ceed/types.h>
11
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
ContractX1d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)19 inline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
20 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
21 const CeedInt item_id_x = get_local_id(0);
22
23 scratch[item_id_x] = *U;
24 work_group_barrier(CLK_LOCAL_MEM_FENCE);
25
26 *V = 0.0;
27 if (item_id_x < Q_1D) {
28 for (CeedInt i = 0; i < P_1D; i++) {
29 *V += B[i + item_id_x * P_1D] * scratch[i]; // Contract x direction
30 }
31 }
32 work_group_barrier(CLK_LOCAL_MEM_FENCE);
33 }
34
35 //------------------------------------------------------------------------------
36 // 1D transpose tensor contraction x
37 //------------------------------------------------------------------------------
ContractTransposeX1d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)38 inline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
39 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
40 const CeedInt item_id_x = get_local_id(0);
41
42 scratch[item_id_x] = *U;
43 work_group_barrier(CLK_LOCAL_MEM_FENCE);
44
45 *V = 0.0;
46 if (item_id_x < P_1D) {
47 for (CeedInt i = 0; i < Q_1D; i++) {
48 *V += B[item_id_x + i * P_1D] * scratch[i]; // Contract x direction
49 }
50 }
51 work_group_barrier(CLK_LOCAL_MEM_FENCE);
52 }
53
54 //------------------------------------------------------------------------------
55 // 1D interpolate to quadrature points
56 //------------------------------------------------------------------------------
Interp1d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)57 inline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
58 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
59 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
60 ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
61 }
62 }
63
64 //------------------------------------------------------------------------------
65 // 1D interpolate transpose
66 //------------------------------------------------------------------------------
InterpTranspose1d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)67 inline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
68 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
69 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
70 ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch);
71 }
72 }
73
74 //------------------------------------------------------------------------------
75 // 1D derivatives at quadrature points
76 //------------------------------------------------------------------------------
Grad1d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)77 inline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
78 local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
79 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
80 ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
81 }
82 }
83
84 //------------------------------------------------------------------------------
85 // 1D derivatives transpose
86 //------------------------------------------------------------------------------
GradTranspose1d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)87 inline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
88 local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
89 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
90 ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch);
91 }
92 }
93
94 //------------------------------------------------------------------------------
95 // 1D quadrature weights
96 //------------------------------------------------------------------------------
Weight1d(const CeedInt Q_1D,const CeedScalar * restrict q_weight_1d,CeedScalar * restrict w)97 inline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
98 const CeedInt item_id_x = get_local_id(0);
99 *w = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0;
100 }
101
102 //------------------------------------------------------------------------------
103 // 2D
104 //------------------------------------------------------------------------------
105
106 //------------------------------------------------------------------------------
107 // 2D tensor contraction x
108 //------------------------------------------------------------------------------
ContractX2d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)109 inline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
110 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
111 const CeedInt item_id_x = get_local_id(0);
112 const CeedInt item_id_y = get_local_id(1);
113
114 scratch[item_id_x + item_id_y * T_1D] = *U;
115 work_group_barrier(CLK_LOCAL_MEM_FENCE);
116
117 *V = 0.0;
118 if (item_id_x < Q_1D && item_id_y < P_1D) {
119 for (CeedInt i = 0; i < P_1D; i++) {
120 *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction
121 }
122 }
123 work_group_barrier(CLK_LOCAL_MEM_FENCE);
124 }
125
126 //------------------------------------------------------------------------------
127 // 2D tensor contract y
128 //------------------------------------------------------------------------------
ContractY2d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)129 inline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
130 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
131 const CeedInt item_id_x = get_local_id(0);
132 const CeedInt item_id_y = get_local_id(1);
133
134 scratch[item_id_x + item_id_y * T_1D] = *U;
135 work_group_barrier(CLK_LOCAL_MEM_FENCE);
136
137 *V = 0.0;
138 if (item_id_x < Q_1D && item_id_y < Q_1D) {
139 for (CeedInt i = 0; i < P_1D; i++) {
140 *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction
141 }
142 }
143 work_group_barrier(CLK_LOCAL_MEM_FENCE);
144 }
145
146 //------------------------------------------------------------------------------
147 // 2D transpose tensor contract y
148 //------------------------------------------------------------------------------
ContractTransposeY2d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)149 inline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
150 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
151 const CeedInt item_id_x = get_local_id(0);
152 const CeedInt item_id_y = get_local_id(1);
153
154 scratch[item_id_x + item_id_y * T_1D] = *U;
155 work_group_barrier(CLK_LOCAL_MEM_FENCE);
156
157 *V = 0.0;
158 if (item_id_x < Q_1D && item_id_y < P_1D) {
159 for (CeedInt i = 0; i < Q_1D; i++) {
160 *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction
161 }
162 }
163 work_group_barrier(CLK_LOCAL_MEM_FENCE);
164 }
165
166 //------------------------------------------------------------------------------
167 // 2D transpose tensor contract x
168 //------------------------------------------------------------------------------
ContractTransposeX2d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)169 inline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
170 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
171 const CeedInt item_id_x = get_local_id(0);
172 const CeedInt item_id_y = get_local_id(1);
173
174 scratch[item_id_x + item_id_y * T_1D] = *U;
175 work_group_barrier(CLK_LOCAL_MEM_FENCE);
176
177 *V = 0.0;
178 if (item_id_x < P_1D && item_id_y < P_1D) {
179 for (CeedInt i = 0; i < Q_1D; i++) {
180 *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction
181 }
182 }
183 work_group_barrier(CLK_LOCAL_MEM_FENCE);
184 }
185
186 //------------------------------------------------------------------------------
187 // 2D transpose tensor contract and add x
188 //------------------------------------------------------------------------------
ContractTransposeAddX2d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)189 inline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
190 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
191 const CeedInt item_id_x = get_local_id(0);
192 const CeedInt item_id_y = get_local_id(1);
193
194 scratch[item_id_x + item_id_y * T_1D] = *U;
195 work_group_barrier(CLK_LOCAL_MEM_FENCE);
196
197 if (item_id_x < P_1D && item_id_y < P_1D) {
198 for (CeedInt i = 0; i < Q_1D; i++) {
199 *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction
200 }
201 }
202 work_group_barrier(CLK_LOCAL_MEM_FENCE);
203 }
204
205 //------------------------------------------------------------------------------
206 // 2D interpolate to quadrature points
207 //------------------------------------------------------------------------------
InterpTensor2d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)208 inline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
209 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
210 CeedScalar r_t[1];
211
212 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
213 ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
214 ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
215 }
216 }
217
218 //------------------------------------------------------------------------------
219 // 2D interpolate transpose
220 //------------------------------------------------------------------------------
InterpTransposeTensor2d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)221 inline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
222 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
223 CeedScalar r_t[1];
224
225 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
226 ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
227 ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
228 }
229 }
230
231 //------------------------------------------------------------------------------
232 // 2D derivatives at quadrature points
233 //------------------------------------------------------------------------------
GradTensor2d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)234 inline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
235 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
236 local CeedScalar *restrict scratch) {
237 CeedScalar r_t[1];
238
239 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
240 ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch);
241 ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch);
242 ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch);
243 ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch);
244 }
245 }
246
247 //------------------------------------------------------------------------------
248 // 2D derivatives transpose
249 //------------------------------------------------------------------------------
GradTransposeTensor2d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)250 inline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
251 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
252 local CeedScalar *restrict scratch) {
253 CeedScalar r_t[1];
254
255 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
256 ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch);
257 ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch);
258 ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch);
259 ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch);
260 }
261 }
262
263 //------------------------------------------------------------------------------
264 // 2D quadrature weights
265 //------------------------------------------------------------------------------
WeightTensor2d(const CeedInt Q_1D,const CeedScalar * restrict q_weight_1d,CeedScalar * restrict w)266 inline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
267 const CeedInt item_id_x = get_local_id(0);
268 const CeedInt item_id_y = get_local_id(1);
269
270 *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0;
271 }
272
273 //------------------------------------------------------------------------------
274 // 3D
275 //------------------------------------------------------------------------------
276
277 //------------------------------------------------------------------------------
278 // 3D tensor contract x
279 //------------------------------------------------------------------------------
ContractX3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)280 inline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
281 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
282 const CeedInt item_id_x = get_local_id(0);
283 const CeedInt item_id_y = get_local_id(1);
284
285 CeedScalar r_B[T_1D];
286 for (CeedInt i = 0; i < P_1D; i++) {
287 r_B[i] = B[i + item_id_x * P_1D];
288 }
289
290 for (CeedInt k = 0; k < P_1D; k++) {
291 scratch[item_id_x + item_id_y * T_1D] = U[k];
292 work_group_barrier(CLK_LOCAL_MEM_FENCE);
293
294 V[k] = 0.0;
295 if (item_id_x < Q_1D && item_id_y < P_1D) {
296 for (CeedInt i = 0; i < P_1D; i++) {
297 V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction
298 }
299 }
300 work_group_barrier(CLK_LOCAL_MEM_FENCE);
301 }
302 }
303
304 //------------------------------------------------------------------------------
305 // 3D tensor contract y
306 //------------------------------------------------------------------------------
ContractY3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)307 inline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
308 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
309 const CeedInt item_id_x = get_local_id(0);
310 const CeedInt item_id_y = get_local_id(1);
311
312 CeedScalar r_B[T_1D];
313 for (CeedInt i = 0; i < P_1D; i++) {
314 r_B[i] = B[i + item_id_y * P_1D];
315 }
316
317 for (CeedInt k = 0; k < P_1D; k++) {
318 scratch[item_id_x + item_id_y * T_1D] = U[k];
319 work_group_barrier(CLK_LOCAL_MEM_FENCE);
320
321 V[k] = 0.0;
322 if (item_id_x < Q_1D && item_id_y < Q_1D) {
323 for (CeedInt i = 0; i < P_1D; i++) {
324 V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction
325 }
326 }
327 work_group_barrier(CLK_LOCAL_MEM_FENCE);
328 }
329 }
330
331 //------------------------------------------------------------------------------
332 // 3D tensor contract z
333 //------------------------------------------------------------------------------
ContractZ3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)334 inline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
335 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
336 const CeedInt item_id_x = get_local_id(0);
337 const CeedInt item_id_y = get_local_id(1);
338
339 for (CeedInt k = 0; k < Q_1D; k++) {
340 V[k] = 0.0;
341 if (item_id_x < Q_1D && item_id_y < Q_1D) {
342 for (CeedInt i = 0; i < P_1D; i++) {
343 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction
344 }
345 }
346 }
347 }
348
349 //------------------------------------------------------------------------------
350 // 3D transpose tensor contract z
351 //------------------------------------------------------------------------------
ContractTransposeZ3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)352 inline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
353 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
354 const CeedInt item_id_x = get_local_id(0);
355 const CeedInt item_id_y = get_local_id(1);
356
357 for (CeedInt k = 0; k < P_1D; k++) {
358 V[k] = 0.0;
359 if (item_id_x < Q_1D && item_id_y < Q_1D) {
360 for (CeedInt i = 0; i < Q_1D; i++) {
361 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction
362 }
363 }
364 }
365 }
366
367 //------------------------------------------------------------------------------
368 // 3D transpose tensor contract y
369 //------------------------------------------------------------------------------
ContractTransposeY3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)370 inline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
371 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
372 const CeedInt item_id_x = get_local_id(0);
373 const CeedInt item_id_y = get_local_id(1);
374
375 CeedScalar r_B[T_1D];
376 for (CeedInt i = 0; i < Q_1D; i++) {
377 r_B[i] = B[item_id_y + i * P_1D];
378 }
379
380 for (CeedInt k = 0; k < P_1D; k++) {
381 scratch[item_id_x + item_id_y * T_1D] = U[k];
382 work_group_barrier(CLK_LOCAL_MEM_FENCE);
383
384 V[k] = 0.0;
385 if (item_id_x < Q_1D && item_id_y < P_1D) {
386 for (CeedInt i = 0; i < Q_1D; i++) {
387 V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction
388 }
389 }
390 work_group_barrier(CLK_LOCAL_MEM_FENCE);
391 }
392 }
393
394 //------------------------------------------------------------------------------
395 // 3D transpose tensor contract y
396 //------------------------------------------------------------------------------
ContractTransposeAddY3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)397 inline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
398 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
399 const CeedInt item_id_x = get_local_id(0);
400 const CeedInt item_id_y = get_local_id(1);
401
402 CeedScalar r_B[T_1D];
403 for (CeedInt i = 0; i < Q_1D; i++) {
404 r_B[i] = B[item_id_y + i * P_1D];
405 }
406
407 for (CeedInt k = 0; k < P_1D; k++) {
408 scratch[item_id_x + item_id_y * T_1D] = U[k];
409 work_group_barrier(CLK_LOCAL_MEM_FENCE);
410 if (item_id_x < Q_1D && item_id_y < P_1D) {
411 for (CeedInt i = 0; i < Q_1D; i++) {
412 V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction
413 }
414 }
415 work_group_barrier(CLK_LOCAL_MEM_FENCE);
416 }
417 }
418
419 //------------------------------------------------------------------------------
420 // 3D transpose tensor contract x
421 //------------------------------------------------------------------------------
ContractTransposeX3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)422 inline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
423 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
424 const CeedInt item_id_x = get_local_id(0);
425 const CeedInt item_id_y = get_local_id(1);
426
427 CeedScalar r_B[T_1D];
428 for (CeedInt i = 0; i < Q_1D; i++) {
429 r_B[i] = B[item_id_x + i * P_1D];
430 }
431
432 for (CeedInt k = 0; k < P_1D; k++) {
433 scratch[item_id_x + item_id_y * T_1D] = U[k];
434 work_group_barrier(CLK_LOCAL_MEM_FENCE);
435 V[k] = 0.0;
436 if (item_id_x < P_1D && item_id_y < P_1D) {
437 for (CeedInt i = 0; i < Q_1D; i++) {
438 V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction
439 }
440 }
441 work_group_barrier(CLK_LOCAL_MEM_FENCE);
442 }
443 }
444
445 //------------------------------------------------------------------------------
446 // 3D transpose tensor contract add x
447 //------------------------------------------------------------------------------
ContractTransposeAddX3d(const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict U,local const CeedScalar * restrict B,private CeedScalar * restrict V,local CeedScalar * restrict scratch)448 inline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B,
449 private CeedScalar *restrict V, local CeedScalar *restrict scratch) {
450 const CeedInt item_id_x = get_local_id(0);
451 const CeedInt item_id_y = get_local_id(1);
452
453 CeedScalar r_B[T_1D];
454 for (CeedInt i = 0; i < Q_1D; i++) {
455 r_B[i] = B[item_id_x + i * P_1D];
456 }
457
458 for (CeedInt k = 0; k < P_1D; k++) {
459 scratch[item_id_x + item_id_y * T_1D] = U[k];
460 work_group_barrier(CLK_LOCAL_MEM_FENCE);
461
462 if (item_id_x < P_1D && item_id_y < P_1D) {
463 for (CeedInt i = 0; i < Q_1D; i++) {
464 V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction
465 }
466 }
467 work_group_barrier(CLK_LOCAL_MEM_FENCE);
468 }
469 }
470
471 //------------------------------------------------------------------------------
472 // 3D interpolate to quadrature points
473 //------------------------------------------------------------------------------
InterpTensor3d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)474 inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
475 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
476 CeedScalar r_t1[T_1D];
477 CeedScalar r_t2[T_1D];
478
479 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
480 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
481 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
482 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D, scratch);
483 }
484 }
485
486 //------------------------------------------------------------------------------
487 // 3D interpolate transpose
488 //------------------------------------------------------------------------------
InterpTransposeTensor3d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)489 inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
490 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
491 CeedScalar r_t1[T_1D];
492 CeedScalar r_t2[T_1D];
493
494 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
495 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D, s_B, r_t1, scratch);
496 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
497 ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
498 }
499 }
500
501 //------------------------------------------------------------------------------
502 // 3D derivatives at quadrature points
503 //------------------------------------------------------------------------------
GradTensor3d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)504 inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
505 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
506 local CeedScalar *restrict scratch) {
507 CeedScalar r_t1[T_1D];
508 CeedScalar r_t2[T_1D];
509
510 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
511 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_G, r_t1, scratch);
512 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
513 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch);
514 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
515 ContractY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch);
516 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch);
517 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
518 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
519 ContractZ3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch);
520 }
521 }
522
523 //------------------------------------------------------------------------------
524 // 3D derivatives transpose
525 //------------------------------------------------------------------------------
GradTransposeTensor3d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)526 inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
527 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
528 local CeedScalar *restrict scratch) {
529 CeedScalar r_t1[T_1D];
530 CeedScalar r_t2[T_1D];
531
532 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
533 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_B, r_t1, scratch);
534 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
535 ContractTransposeX3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * P_1D, scratch);
536 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_B, r_t1, scratch);
537 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch);
538 ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
539 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t1, scratch);
540 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
541 ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
542 }
543 }
544
545 //------------------------------------------------------------------------------
546 // 3D derivatives at quadrature points
547 //------------------------------------------------------------------------------
GradTensorCollocated3d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)548 inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
549 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V,
550 local CeedScalar *restrict scratch) {
551 CeedScalar r_t1[T_1D];
552 CeedScalar r_t2[T_1D];
553
554 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
555 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch);
556 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
557 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch);
558 ContractX3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch);
559 ContractY3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch);
560 ContractZ3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch);
561 }
562 }
563
564 //------------------------------------------------------------------------------
565 // 3D derivatives transpose
566 //------------------------------------------------------------------------------
GradTransposeTensorCollocated3d(const CeedInt NUM_COMP,const CeedInt P_1D,const CeedInt Q_1D,private const CeedScalar * restrict r_U,local const CeedScalar * restrict s_B,local const CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)567 inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U,
568 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G,
569 private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
570 CeedScalar r_t1[T_1D];
571 CeedScalar r_t2[T_1D];
572
573 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
574 ContractTransposeZ3d(Q_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
575 ContractTransposeAddY3d(Q_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
576 ContractTransposeAddX3d(Q_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_G, r_t2, scratch);
577 ContractTransposeZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch);
578 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch);
579 ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch);
580 }
581 }
582
583 //------------------------------------------------------------------------------
584 // 3D quadrature weights
585 //------------------------------------------------------------------------------
586 // template <int Q_1D>
WeightTensor3d(const CeedInt Q_1D,const CeedScalar * restrict q_weight_1d,CeedScalar * restrict w)587 inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) {
588 const CeedInt item_id_x = get_local_id(0);
589 const CeedInt item_id_y = get_local_id(1);
590
591 if (item_id_x < Q_1D && item_id_y < Q_1D) {
592 const CeedScalar w_xy = q_weight_1d[item_id_x] * q_weight_1d[item_id_y];
593 for (CeedInt q = 0; q < Q_1D; ++q) w[q] = w_xy * q_weight_1d[q];
594 } else {
595 for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0;
596 }
597 }
598