xref: /libCEED/include/ceed/jit-source/sycl/sycl-gen-templates.h (revision c2cc34eeef4dafbcd1d00ef770c45974c5ea3da2)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for SYCL backend macro and type definitions for JiT source
10 #include <ceed.h>
11 
12 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
13 #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
14 // TODO: Handle FP32 case
15 typedef atomic_double CeedAtomicScalar;
16 
17 //------------------------------------------------------------------------------
18 // Load matrices for basis actions
19 //------------------------------------------------------------------------------
20 inline void loadMatrix(const CeedInt N, const CeedScalar *restrict d_B, CeedScalar *restrict B) {
21   const CeedInt item_id    = get_local_linear_id();
22   const CeedInt group_size = get_local_size(0) * get_local_size(1) * get_local_size(2);
23   for (CeedInt i = item_id; i < N; i += group_size) B[i] = d_B[i];
24 }
25 
26 //------------------------------------------------------------------------------
27 // 1D
28 //------------------------------------------------------------------------------
29 
30 //------------------------------------------------------------------------------
31 // L-vector -> E-vector, offsets provided
32 //------------------------------------------------------------------------------
33 inline void readDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
34                              const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
35   const CeedInt item_id_x = get_local_id(0);
36   const CeedInt elem      = get_global_id(2);
37 
38   if (item_id_x < P_1D && elem < num_elem) {
39     const CeedInt node = item_id_x;
40     const CeedInt ind  = indices[node + elem * P_1D];
41     for (CeedInt comp = 0; comp < num_comp; ++comp) {
42       r_u[comp] = d_u[ind + strides_comp * comp];
43     }
44   }
45 }
46 
47 //------------------------------------------------------------------------------
48 // L-vector -> E-vector, strided
49 //------------------------------------------------------------------------------
50 inline void readDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
51                               const CeedInt strides_elem, const CeedInt num_elem, global const CeedScalar *restrict d_u,
52                               private CeedScalar *restrict r_u) {
53   const CeedInt item_id_x = get_local_id(0);
54   const CeedInt elem      = get_global_id(2);
55 
56   if (item_id_x < P_1D && elem < num_elem) {
57     const CeedInt node = item_id_x;
58     const CeedInt ind  = node * strides_node + elem * strides_elem;
59     for (CeedInt comp = 0; comp < num_comp; comp++) {
60       r_u[comp] = d_u[ind + comp * strides_comp];
61     }
62   }
63 }
64 
65 //------------------------------------------------------------------------------
66 // E-vector -> L-vector, offsets provided
67 //------------------------------------------------------------------------------
68 inline void writeDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
69                               const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) {
70   const CeedInt item_id_x = get_local_id(0);
71   const CeedInt elem      = get_global_id(2);
72 
73   if (item_id_x < P_1D && elem < num_elem) {
74     const CeedInt node = item_id_x;
75     const CeedInt ind  = indices[node + elem * P_1D];
76     for (CeedInt comp = 0; comp < num_comp; ++comp)
77       atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device);
78   }
79 }
80 
81 //------------------------------------------------------------------------------
82 // E-vector -> L-vector, strided
83 //------------------------------------------------------------------------------
84 inline void writeDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
85                                const CeedInt strides_elem, const CeedInt num_elem, private const CeedScalar *restrict r_v,
86                                global CeedScalar *restrict d_v) {
87   const CeedInt item_id_x = get_local_id(0);
88   const CeedInt elem      = get_global_id(2);
89 
90   if (item_id_x < P_1D && elem < num_elem) {
91     const CeedInt node = item_id_x;
92     const CeedInt ind  = node * strides_node + elem * strides_elem;
93     for (CeedInt comp = 0; comp < num_comp; comp++) {
94       d_v[ind + comp * strides_comp] = r_v[comp];
95     }
96   }
97 }
98 
99 //------------------------------------------------------------------------------
100 // 2D
101 //------------------------------------------------------------------------------
102 
103 //------------------------------------------------------------------------------
104 // L-vector -> E-vector, offsets provided
105 //------------------------------------------------------------------------------
106 inline void readDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
107                              const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
108   const CeedInt item_id_x = get_local_id(0);
109   const CeedInt item_id_y = get_local_id(1);
110   const CeedInt elem      = get_global_id(2);
111 
112   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
113     const CeedInt node = item_id_x + item_id_y * P_1D;
114     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
115     for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp];
116   }
117 }
118 
119 //------------------------------------------------------------------------------
120 // L-vector -> E-vector, strided
121 //------------------------------------------------------------------------------
122 inline void readDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
123                               const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar *restrict d_u,
124                               private CeedScalar *restrict r_u) {
125   const CeedInt item_id_x = get_local_id(0);
126   const CeedInt item_id_y = get_local_id(1);
127   const CeedInt elem      = get_global_id(2);
128 
129   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
130     const CeedInt node = item_id_x + item_id_y * P_1D;
131     const CeedInt ind  = node * strides_node + elem * strides_elem;
132     for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp];
133   }
134 }
135 
136 //------------------------------------------------------------------------------
137 // E-vector -> L-vector, offsets provided
138 //------------------------------------------------------------------------------
139 inline void writeDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
140                               const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) {
141   const CeedInt item_id_x = get_local_id(0);
142   const CeedInt item_id_y = get_local_id(1);
143   const CeedInt elem      = get_global_id(2);
144 
145   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
146     const CeedInt node = item_id_x + item_id_y * P_1D;
147     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
148     for (CeedInt comp = 0; comp < num_comp; ++comp)
149       atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device);
150   }
151 }
152 
153 //------------------------------------------------------------------------------
154 // E-vector -> L-vector, strided
155 //------------------------------------------------------------------------------
156 inline void writeDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
157                                const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar *restrict r_v,
158                                global CeedScalar *restrict d_v) {
159   const CeedInt item_id_x = get_local_id(0);
160   const CeedInt item_id_y = get_local_id(1);
161   const CeedInt elem      = get_global_id(2);
162 
163   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
164     const CeedInt node = item_id_x + item_id_y * P_1D;
165     const CeedInt ind  = node * strides_node + elem * strides_elem;
166     for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[comp];
167   }
168 }
169 
170 //------------------------------------------------------------------------------
171 // 3D
172 //------------------------------------------------------------------------------
173 
174 //------------------------------------------------------------------------------
175 // L-vector -> E-vector, offsets provided
176 //------------------------------------------------------------------------------
177 inline void readDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
178                              const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
179   const CeedInt item_id_x = get_local_id(0);
180   const CeedInt item_id_y = get_local_id(1);
181   const CeedInt elem      = get_global_id(2);
182 
183   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
184     for (CeedInt z = 0; z < P_1D; ++z) {
185       const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
186       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
187       for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + strides_comp * comp];
188     }
189   }
190 }
191 
192 //------------------------------------------------------------------------------
193 // L-vector -> E-vector, strided
194 //------------------------------------------------------------------------------
195 inline void readDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
196                               const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar *restrict d_u,
197                               private CeedScalar *restrict r_u) {
198   const CeedInt item_id_x = get_local_id(0);
199   const CeedInt item_id_y = get_local_id(1);
200   const CeedInt elem      = get_global_id(2);
201 
202   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
203     for (CeedInt z = 0; z < P_1D; ++z) {
204       const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
205       const CeedInt ind  = node * strides_node + elem * strides_elem;
206       for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp];
207     }
208   }
209 }
210 
211 //------------------------------------------------------------------------------
212 // E-vector -> Q-vector, offests provided
213 //------------------------------------------------------------------------------
214 inline void readSliceQuadsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt Q_1D, const CeedInt num_elem, const CeedInt q,
215                                    const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
216   const CeedInt item_id_x = get_local_id(0);
217   const CeedInt item_id_y = get_local_id(1);
218   const CeedInt elem      = get_global_id(2);
219 
220   if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) {
221     const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q);
222     const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
223     for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp];
224   }
225 }
226 
227 //------------------------------------------------------------------------------
228 // E-vector -> Q-vector, strided
229 //------------------------------------------------------------------------------
230 inline void readSliceQuadsStrided3d(const CeedInt num_comp, const CeedInt Q_1D, CeedInt strides_node, CeedInt strides_comp, CeedInt strides_elem,
231                                     const CeedInt num_elem, const CeedInt q, const global CeedScalar *restrict d_u,
232                                     private CeedScalar *restrict r_u) {
233   const CeedInt item_id_x = get_local_id(0);
234   const CeedInt item_id_y = get_local_id(1);
235   const CeedInt elem      = get_global_id(2);
236 
237   if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) {
238     const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q);
239     const CeedInt ind  = node * strides_node + elem * strides_elem;
240     for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp];
241   }
242 }
243 
244 //------------------------------------------------------------------------------
245 // E-vector -> L-vector, offsets provided
246 //------------------------------------------------------------------------------
247 inline void writeDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
248                               const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) {
249   const CeedInt item_id_x = get_local_id(0);
250   const CeedInt item_id_y = get_local_id(1);
251   const CeedInt elem      = get_global_id(2);
252 
253   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
254     for (CeedInt z = 0; z < P_1D; ++z) {
255       const CeedInt node = item_id_x + item_id_y * P_1D + z * P_1D * P_1D;
256       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
257       for (CeedInt comp = 0; comp < num_comp; ++comp)
258         atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[z + comp * P_1D], memory_order_relaxed, memory_scope_device);
259     }
260   }
261 }
262 
263 //------------------------------------------------------------------------------
264 // E-vector -> L-vector, strided
265 //------------------------------------------------------------------------------
266 inline void writeDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
267                                const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar *restrict r_v,
268                                global CeedScalar *restrict d_v) {
269   const CeedInt item_id_x = get_local_id(0);
270   const CeedInt item_id_y = get_local_id(1);
271   const CeedInt elem      = get_global_id(2);
272 
273   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
274     for (CeedInt z = 0; z < P_1D; ++z) {
275       const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
276       const CeedInt ind  = node * strides_node + elem * strides_elem;
277       for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D];
278     }
279   }
280 }
281 
282 //------------------------------------------------------------------------------
283 // 3D collocated derivatives computation
284 //------------------------------------------------------------------------------
285 inline void gradCollo3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar *restrict r_U,
286                         const local CeedScalar *s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
287   const CeedInt item_id_x = get_local_id(0);
288   const CeedInt item_id_y = get_local_id(1);
289 
290   for (CeedInt comp = 0; comp < num_comp; ++comp) {
291     if (item_id_x < Q_1D && item_id_y < Q_1D) {
292       scratch[item_id_x + item_id_y * T_1D] = r_U[q + comp * Q_1D];
293     }
294     work_group_barrier(CLK_LOCAL_MEM_FENCE);
295 
296     if (item_id_x < Q_1D && item_id_y < Q_1D) {
297       // X derivative
298       r_V[comp + 0 * num_comp] = 0.0;
299       for (CeedInt i = 0; i < Q_1D; ++i)
300         r_V[comp + 0 * num_comp] += s_G[i + item_id_x * Q_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction (X derivative)
301 
302       // Y derivative
303       r_V[comp + 1 * num_comp] = 0.0;
304       for (CeedInt i = 0; i < Q_1D; ++i)
305         r_V[comp + 1 * num_comp] += s_G[i + item_id_y * Q_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction (Y derivative)
306 
307       // Z derivative
308       r_V[comp + 2 * num_comp] = 0.0;
309       for (CeedInt i = 0; i < Q_1D; ++i) r_V[comp + 2 * num_comp] += s_G[i + q * Q_1D] * r_U[i + comp * Q_1D];  // Contract z direction (Z derivative)
310     }
311 
312     work_group_barrier(CLK_LOCAL_MEM_FENCE);
313   }
314 }
315 
316 //------------------------------------------------------------------------------
317 // 3D collocated derivatives transpose
318 //------------------------------------------------------------------------------
319 inline void gradColloTranspose3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar *restrict r_U,
320                                  const local CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
321   const CeedInt item_id_x = get_local_id(0);
322   const CeedInt item_id_y = get_local_id(1);
323 
324   for (CeedInt comp = 0; comp < num_comp; ++comp) {
325     // X derivative
326     if (item_id_x < Q_1D && item_id_y < Q_1D) {
327       scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 0 * num_comp];
328     }
329     work_group_barrier(CLK_LOCAL_MEM_FENCE);
330 
331     if (item_id_x < Q_1D && item_id_y < Q_1D) {
332       for (CeedInt i = 0; i < Q_1D; ++i)
333         r_V[q + comp * Q_1D] += s_G[item_id_x + i * Q_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction (X derivative)
334     }
335     work_group_barrier(CLK_LOCAL_MEM_FENCE);
336 
337     // Y derivative
338     if (item_id_x < Q_1D && item_id_y < Q_1D) {
339       scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 1 * num_comp];
340     }
341     work_group_barrier(CLK_LOCAL_MEM_FENCE);
342 
343     if (item_id_x < Q_1D && item_id_y < Q_1D) {
344       for (CeedInt i = 0; i < Q_1D; ++i)
345         r_V[q + comp * Q_1D] += s_G[item_id_y + i * Q_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction (Y derivative)
346     }
347     work_group_barrier(CLK_LOCAL_MEM_FENCE);
348 
349     // Z derivative
350     if (item_id_x < Q_1D && item_id_y < Q_1D) {
351       for (CeedInt i = 0; i < Q_1D; ++i)
352         r_V[i + comp * Q_1D] += s_G[i + q * Q_1D] * r_U[comp + 2 * num_comp];  // PARTIAL contract z direction (Z derivative)
353     }
354   }
355 }
356