1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 /// @file
9 /// Internal header for SYCL backend macro and type definitions for JiT source
10 #include <ceed/types.h>
11
12 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
13 #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
14 // TODO: Handle FP32 case
15 typedef atomic_double CeedAtomicScalar;
16
17 //------------------------------------------------------------------------------
18 // Load matrices for basis actions
19 //------------------------------------------------------------------------------
loadMatrix(const CeedInt N,const CeedScalar * restrict d_B,CeedScalar * restrict B)20 inline void loadMatrix(const CeedInt N, const CeedScalar *restrict d_B, CeedScalar *restrict B) {
21 const CeedInt item_id = get_local_linear_id();
22 const CeedInt group_size = get_local_size(0) * get_local_size(1) * get_local_size(2);
23 for (CeedInt i = item_id; i < N; i += group_size) B[i] = d_B[i];
24 }
25
26 //------------------------------------------------------------------------------
27 // 1D
28 //------------------------------------------------------------------------------
29
30 //------------------------------------------------------------------------------
31 // L-vector -> E-vector, offsets provided
32 //------------------------------------------------------------------------------
readDofsOffset1d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt P_1D,const CeedInt num_elem,const global CeedInt * restrict indices,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)33 inline void readDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
34 const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
35 const CeedInt item_id_x = get_local_id(0);
36 const CeedInt elem = get_global_id(2);
37
38 if (item_id_x < P_1D && elem < num_elem) {
39 const CeedInt node = item_id_x;
40 const CeedInt ind = indices[node + elem * P_1D];
41 for (CeedInt comp = 0; comp < num_comp; ++comp) {
42 r_u[comp] = d_u[ind + strides_comp * comp];
43 }
44 }
45 }
46
47 //------------------------------------------------------------------------------
48 // L-vector -> E-vector, strided
49 //------------------------------------------------------------------------------
readDofsStrided1d(const CeedInt num_comp,const CeedInt P_1D,const CeedInt strides_node,const CeedInt strides_comp,const CeedInt strides_elem,const CeedInt num_elem,global const CeedScalar * restrict d_u,private CeedScalar * restrict r_u)50 inline void readDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
51 const CeedInt strides_elem, const CeedInt num_elem, global const CeedScalar *restrict d_u,
52 private CeedScalar *restrict r_u) {
53 const CeedInt item_id_x = get_local_id(0);
54 const CeedInt elem = get_global_id(2);
55
56 if (item_id_x < P_1D && elem < num_elem) {
57 const CeedInt node = item_id_x;
58 const CeedInt ind = node * strides_node + elem * strides_elem;
59 for (CeedInt comp = 0; comp < num_comp; comp++) {
60 r_u[comp] = d_u[ind + comp * strides_comp];
61 }
62 }
63 }
64
65 //------------------------------------------------------------------------------
66 // E-vector -> L-vector, offsets provided
67 //------------------------------------------------------------------------------
writeDofsOffset1d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt P_1D,const CeedInt num_elem,const global CeedInt * restrict indices,const private CeedScalar * restrict r_v,global CeedAtomicScalar * restrict d_v)68 inline void writeDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
69 const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) {
70 const CeedInt item_id_x = get_local_id(0);
71 const CeedInt elem = get_global_id(2);
72
73 if (item_id_x < P_1D && elem < num_elem) {
74 const CeedInt node = item_id_x;
75 const CeedInt ind = indices[node + elem * P_1D];
76 for (CeedInt comp = 0; comp < num_comp; ++comp)
77 atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device);
78 }
79 }
80
81 //------------------------------------------------------------------------------
82 // E-vector -> L-vector, strided
83 //------------------------------------------------------------------------------
writeDofsStrided1d(const CeedInt num_comp,const CeedInt P_1D,const CeedInt strides_node,const CeedInt strides_comp,const CeedInt strides_elem,const CeedInt num_elem,private const CeedScalar * restrict r_v,global CeedScalar * restrict d_v)84 inline void writeDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
85 const CeedInt strides_elem, const CeedInt num_elem, private const CeedScalar *restrict r_v,
86 global CeedScalar *restrict d_v) {
87 const CeedInt item_id_x = get_local_id(0);
88 const CeedInt elem = get_global_id(2);
89
90 if (item_id_x < P_1D && elem < num_elem) {
91 const CeedInt node = item_id_x;
92 const CeedInt ind = node * strides_node + elem * strides_elem;
93 for (CeedInt comp = 0; comp < num_comp; comp++) {
94 d_v[ind + comp * strides_comp] = r_v[comp];
95 }
96 }
97 }
98
99 //------------------------------------------------------------------------------
100 // 2D
101 //------------------------------------------------------------------------------
102
103 //------------------------------------------------------------------------------
104 // L-vector -> E-vector, offsets provided
105 //------------------------------------------------------------------------------
readDofsOffset2d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt P_1D,const CeedInt num_elem,const global CeedInt * restrict indices,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)106 inline void readDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
107 const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
108 const CeedInt item_id_x = get_local_id(0);
109 const CeedInt item_id_y = get_local_id(1);
110 const CeedInt elem = get_global_id(2);
111
112 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
113 const CeedInt node = item_id_x + item_id_y * P_1D;
114 const CeedInt ind = indices[node + elem * P_1D * P_1D];
115 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp];
116 }
117 }
118
119 //------------------------------------------------------------------------------
120 // L-vector -> E-vector, strided
121 //------------------------------------------------------------------------------
readDofsStrided2d(const CeedInt num_comp,const CeedInt P_1D,const CeedInt strides_node,const CeedInt strides_comp,const CeedInt strides_elem,const CeedInt num_elem,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)122 inline void readDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
123 const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar *restrict d_u,
124 private CeedScalar *restrict r_u) {
125 const CeedInt item_id_x = get_local_id(0);
126 const CeedInt item_id_y = get_local_id(1);
127 const CeedInt elem = get_global_id(2);
128
129 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
130 const CeedInt node = item_id_x + item_id_y * P_1D;
131 const CeedInt ind = node * strides_node + elem * strides_elem;
132 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp];
133 }
134 }
135
136 //------------------------------------------------------------------------------
137 // E-vector -> L-vector, offsets provided
138 //------------------------------------------------------------------------------
writeDofsOffset2d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt P_1D,const CeedInt num_elem,const global CeedInt * restrict indices,const private CeedScalar * restrict r_v,global CeedAtomicScalar * restrict d_v)139 inline void writeDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
140 const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) {
141 const CeedInt item_id_x = get_local_id(0);
142 const CeedInt item_id_y = get_local_id(1);
143 const CeedInt elem = get_global_id(2);
144
145 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
146 const CeedInt node = item_id_x + item_id_y * P_1D;
147 const CeedInt ind = indices[node + elem * P_1D * P_1D];
148 for (CeedInt comp = 0; comp < num_comp; ++comp)
149 atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device);
150 }
151 }
152
153 //------------------------------------------------------------------------------
154 // E-vector -> L-vector, strided
155 //------------------------------------------------------------------------------
writeDofsStrided2d(const CeedInt num_comp,const CeedInt P_1D,const CeedInt strides_node,const CeedInt strides_comp,const CeedInt strides_elem,const CeedInt num_elem,const private CeedScalar * restrict r_v,global CeedScalar * restrict d_v)156 inline void writeDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
157 const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar *restrict r_v,
158 global CeedScalar *restrict d_v) {
159 const CeedInt item_id_x = get_local_id(0);
160 const CeedInt item_id_y = get_local_id(1);
161 const CeedInt elem = get_global_id(2);
162
163 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
164 const CeedInt node = item_id_x + item_id_y * P_1D;
165 const CeedInt ind = node * strides_node + elem * strides_elem;
166 for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[comp];
167 }
168 }
169
170 //------------------------------------------------------------------------------
171 // 3D
172 //------------------------------------------------------------------------------
173
174 //------------------------------------------------------------------------------
175 // L-vector -> E-vector, offsets provided
176 //------------------------------------------------------------------------------
readDofsOffset3d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt P_1D,const CeedInt num_elem,const global CeedInt * restrict indices,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)177 inline void readDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
178 const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
179 const CeedInt item_id_x = get_local_id(0);
180 const CeedInt item_id_y = get_local_id(1);
181 const CeedInt elem = get_global_id(2);
182
183 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
184 for (CeedInt z = 0; z < P_1D; ++z) {
185 const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
186 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D];
187 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + strides_comp * comp];
188 }
189 }
190 }
191
192 //------------------------------------------------------------------------------
193 // L-vector -> E-vector, strided
194 //------------------------------------------------------------------------------
readDofsStrided3d(const CeedInt num_comp,const CeedInt P_1D,const CeedInt strides_node,const CeedInt strides_comp,const CeedInt strides_elem,const CeedInt num_elem,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)195 inline void readDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
196 const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar *restrict d_u,
197 private CeedScalar *restrict r_u) {
198 const CeedInt item_id_x = get_local_id(0);
199 const CeedInt item_id_y = get_local_id(1);
200 const CeedInt elem = get_global_id(2);
201
202 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
203 for (CeedInt z = 0; z < P_1D; ++z) {
204 const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
205 const CeedInt ind = node * strides_node + elem * strides_elem;
206 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp];
207 }
208 }
209 }
210
211 //------------------------------------------------------------------------------
212 // E-vector -> Q-vector, offests provided
213 //------------------------------------------------------------------------------
readSliceQuadsOffset3d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt Q_1D,const CeedInt num_elem,const CeedInt q,const global CeedInt * restrict indices,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)214 inline void readSliceQuadsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt Q_1D, const CeedInt num_elem, const CeedInt q,
215 const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
216 const CeedInt item_id_x = get_local_id(0);
217 const CeedInt item_id_y = get_local_id(1);
218 const CeedInt elem = get_global_id(2);
219
220 if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) {
221 const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q);
222 const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D];
223 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp];
224 }
225 }
226
227 //------------------------------------------------------------------------------
228 // E-vector -> Q-vector, strided
229 //------------------------------------------------------------------------------
readSliceQuadsStrided3d(const CeedInt num_comp,const CeedInt Q_1D,CeedInt strides_node,CeedInt strides_comp,CeedInt strides_elem,const CeedInt num_elem,const CeedInt q,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)230 inline void readSliceQuadsStrided3d(const CeedInt num_comp, const CeedInt Q_1D, CeedInt strides_node, CeedInt strides_comp, CeedInt strides_elem,
231 const CeedInt num_elem, const CeedInt q, const global CeedScalar *restrict d_u,
232 private CeedScalar *restrict r_u) {
233 const CeedInt item_id_x = get_local_id(0);
234 const CeedInt item_id_y = get_local_id(1);
235 const CeedInt elem = get_global_id(2);
236
237 if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) {
238 const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q);
239 const CeedInt ind = node * strides_node + elem * strides_elem;
240 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp];
241 }
242 }
243
244 //------------------------------------------------------------------------------
245 // E-vector -> L-vector, offsets provided
246 //------------------------------------------------------------------------------
writeDofsOffset3d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt P_1D,const CeedInt num_elem,const global CeedInt * restrict indices,const private CeedScalar * restrict r_v,global CeedAtomicScalar * restrict d_v)247 inline void writeDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
248 const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) {
249 const CeedInt item_id_x = get_local_id(0);
250 const CeedInt item_id_y = get_local_id(1);
251 const CeedInt elem = get_global_id(2);
252
253 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
254 for (CeedInt z = 0; z < P_1D; ++z) {
255 const CeedInt node = item_id_x + item_id_y * P_1D + z * P_1D * P_1D;
256 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D];
257 for (CeedInt comp = 0; comp < num_comp; ++comp)
258 atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[z + comp * P_1D], memory_order_relaxed, memory_scope_device);
259 }
260 }
261 }
262
263 //------------------------------------------------------------------------------
264 // E-vector -> L-vector, strided
265 //------------------------------------------------------------------------------
writeDofsStrided3d(const CeedInt num_comp,const CeedInt P_1D,const CeedInt strides_node,const CeedInt strides_comp,const CeedInt strides_elem,const CeedInt num_elem,const private CeedScalar * restrict r_v,global CeedScalar * restrict d_v)266 inline void writeDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
267 const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar *restrict r_v,
268 global CeedScalar *restrict d_v) {
269 const CeedInt item_id_x = get_local_id(0);
270 const CeedInt item_id_y = get_local_id(1);
271 const CeedInt elem = get_global_id(2);
272
273 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
274 for (CeedInt z = 0; z < P_1D; ++z) {
275 const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
276 const CeedInt ind = node * strides_node + elem * strides_elem;
277 for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D];
278 }
279 }
280 }
281
282 //------------------------------------------------------------------------------
283 // 3D collocated derivatives computation
284 //------------------------------------------------------------------------------
gradCollo3d(const CeedInt num_comp,const CeedInt Q_1D,const CeedInt q,const private CeedScalar * restrict r_U,const local CeedScalar * s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)285 inline void gradCollo3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar *restrict r_U,
286 const local CeedScalar *s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
287 const CeedInt item_id_x = get_local_id(0);
288 const CeedInt item_id_y = get_local_id(1);
289
290 for (CeedInt comp = 0; comp < num_comp; ++comp) {
291 if (item_id_x < Q_1D && item_id_y < Q_1D) {
292 scratch[item_id_x + item_id_y * T_1D] = r_U[q + comp * Q_1D];
293 }
294 work_group_barrier(CLK_LOCAL_MEM_FENCE);
295
296 if (item_id_x < Q_1D && item_id_y < Q_1D) {
297 // X derivative
298 r_V[comp + 0 * num_comp] = 0.0;
299 for (CeedInt i = 0; i < Q_1D; ++i)
300 r_V[comp + 0 * num_comp] += s_G[i + item_id_x * Q_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction (X derivative)
301
302 // Y derivative
303 r_V[comp + 1 * num_comp] = 0.0;
304 for (CeedInt i = 0; i < Q_1D; ++i)
305 r_V[comp + 1 * num_comp] += s_G[i + item_id_y * Q_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction (Y derivative)
306
307 // Z derivative
308 r_V[comp + 2 * num_comp] = 0.0;
309 for (CeedInt i = 0; i < Q_1D; ++i) r_V[comp + 2 * num_comp] += s_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; // Contract z direction (Z derivative)
310 }
311
312 work_group_barrier(CLK_LOCAL_MEM_FENCE);
313 }
314 }
315
316 //------------------------------------------------------------------------------
317 // 3D collocated derivatives transpose
318 //------------------------------------------------------------------------------
gradColloTranspose3d(const CeedInt num_comp,const CeedInt Q_1D,const CeedInt q,const private CeedScalar * restrict r_U,const local CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)319 inline void gradColloTranspose3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar *restrict r_U,
320 const local CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
321 const CeedInt item_id_x = get_local_id(0);
322 const CeedInt item_id_y = get_local_id(1);
323
324 for (CeedInt comp = 0; comp < num_comp; ++comp) {
325 // X derivative
326 if (item_id_x < Q_1D && item_id_y < Q_1D) {
327 scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 0 * num_comp];
328 }
329 work_group_barrier(CLK_LOCAL_MEM_FENCE);
330
331 if (item_id_x < Q_1D && item_id_y < Q_1D) {
332 for (CeedInt i = 0; i < Q_1D; ++i)
333 r_V[q + comp * Q_1D] += s_G[item_id_x + i * Q_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction (X derivative)
334 }
335 work_group_barrier(CLK_LOCAL_MEM_FENCE);
336
337 // Y derivative
338 if (item_id_x < Q_1D && item_id_y < Q_1D) {
339 scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 1 * num_comp];
340 }
341 work_group_barrier(CLK_LOCAL_MEM_FENCE);
342
343 if (item_id_x < Q_1D && item_id_y < Q_1D) {
344 for (CeedInt i = 0; i < Q_1D; ++i)
345 r_V[q + comp * Q_1D] += s_G[item_id_y + i * Q_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction (Y derivative)
346 }
347 work_group_barrier(CLK_LOCAL_MEM_FENCE);
348
349 // Z derivative
350 if (item_id_x < Q_1D && item_id_y < Q_1D) {
351 for (CeedInt i = 0; i < Q_1D; ++i)
352 r_V[i + comp * Q_1D] += s_G[i + q * Q_1D] * r_U[comp + 2 * num_comp]; // PARTIAL contract z direction (Z derivative)
353 }
354 }
355 }
356