xref: /libCEED/include/ceed/jit-source/sycl/sycl-gen-templates.h (revision 58f118fde9e934489c18d94d567a96d887adab93)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for SYCL backend macro and type definitions for JiT source
10 #ifndef CEED_SYCL_GEN_TEMPLATES_H
11 #define CEED_SYCL_GEN_TEMPLATES_H
12 
13 #include <ceed.h>
14 
15 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
16 #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
17 // TODO: Handle FP32 case
18 typedef atomic_double CeedAtomicScalar;
19 
20 //------------------------------------------------------------------------------
21 // Load matrices for basis actions
22 //------------------------------------------------------------------------------
23 inline void loadMatrix(const CeedInt N, const CeedScalar *restrict d_B, CeedScalar *restrict B) {
24   const CeedInt item_id    = get_local_linear_id();
25   const CeedInt group_size = get_local_size(0) * get_local_size(1) * get_local_size(2);
26   for (CeedInt i = item_id; i < N; i += group_size) B[i] = d_B[i];
27 }
28 
29 //------------------------------------------------------------------------------
30 // 1D
31 //------------------------------------------------------------------------------
32 
33 //------------------------------------------------------------------------------
34 // L-vector -> E-vector, offsets provided
35 //------------------------------------------------------------------------------
36 inline void readDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
37                              const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
38   const CeedInt item_id_x = get_local_id(0);
39   const CeedInt elem      = get_global_id(2);
40 
41   if (item_id_x < P_1D && elem < num_elem) {
42     const CeedInt node = item_id_x;
43     const CeedInt ind  = indices[node + elem * P_1D];
44     for (CeedInt comp = 0; comp < num_comp; ++comp) {
45       r_u[comp] = d_u[ind + strides_comp * comp];
46     }
47   }
48 }
49 
50 //------------------------------------------------------------------------------
51 // L-vector -> E-vector, strided
52 //------------------------------------------------------------------------------
53 inline void readDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
54                               const CeedInt strides_elem, const CeedInt num_elem, global const CeedScalar *restrict d_u,
55                               private CeedScalar *restrict r_u) {
56   const CeedInt item_id_x = get_local_id(0);
57   const CeedInt elem      = get_global_id(2);
58 
59   if (item_id_x < P_1D && elem < num_elem) {
60     const CeedInt node = item_id_x;
61     const CeedInt ind  = node * strides_node + elem * strides_elem;
62     for (CeedInt comp = 0; comp < num_comp; comp++) {
63       r_u[comp] = d_u[ind + comp * strides_comp];
64     }
65   }
66 }
67 
68 //------------------------------------------------------------------------------
69 // E-vector -> L-vector, offsets provided
70 //------------------------------------------------------------------------------
71 inline void writeDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
72                               const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) {
73   const CeedInt item_id_x = get_local_id(0);
74   const CeedInt elem      = get_global_id(2);
75 
76   if (item_id_x < P_1D && elem < num_elem) {
77     const CeedInt node = item_id_x;
78     const CeedInt ind  = indices[node + elem * P_1D];
79     for (CeedInt comp = 0; comp < num_comp; ++comp)
80       atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device);
81   }
82 }
83 
84 //------------------------------------------------------------------------------
85 // E-vector -> L-vector, strided
86 //------------------------------------------------------------------------------
87 inline void writeDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
88                                const CeedInt strides_elem, const CeedInt num_elem, private const CeedScalar *restrict r_v,
89                                global CeedScalar *restrict d_v) {
90   const CeedInt item_id_x = get_local_id(0);
91   const CeedInt elem      = get_global_id(2);
92 
93   if (item_id_x < P_1D && elem < num_elem) {
94     const CeedInt node = item_id_x;
95     const CeedInt ind  = node * strides_node + elem * strides_elem;
96     for (CeedInt comp = 0; comp < num_comp; comp++) {
97       d_v[ind + comp * strides_comp] = r_v[comp];
98     }
99   }
100 }
101 
102 //------------------------------------------------------------------------------
103 // 2D
104 //------------------------------------------------------------------------------
105 
106 //------------------------------------------------------------------------------
107 // L-vector -> E-vector, offsets provided
108 //------------------------------------------------------------------------------
109 inline void readDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
110                              const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
111   const CeedInt item_id_x = get_local_id(0);
112   const CeedInt item_id_y = get_local_id(1);
113   const CeedInt elem      = get_global_id(2);
114 
115   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
116     const CeedInt node = item_id_x + item_id_y * P_1D;
117     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
118     for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp];
119   }
120 }
121 
122 //------------------------------------------------------------------------------
123 // L-vector -> E-vector, strided
124 //------------------------------------------------------------------------------
125 inline void readDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
126                               const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar *restrict d_u,
127                               private CeedScalar *restrict r_u) {
128   const CeedInt item_id_x = get_local_id(0);
129   const CeedInt item_id_y = get_local_id(1);
130   const CeedInt elem      = get_global_id(2);
131 
132   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
133     const CeedInt node = item_id_x + item_id_y * P_1D;
134     const CeedInt ind  = node * strides_node + elem * strides_elem;
135     for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp];
136   }
137 }
138 
139 //------------------------------------------------------------------------------
140 // E-vector -> L-vector, offsets provided
141 //------------------------------------------------------------------------------
142 inline void writeDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
143                               const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) {
144   const CeedInt item_id_x = get_local_id(0);
145   const CeedInt item_id_y = get_local_id(1);
146   const CeedInt elem      = get_global_id(2);
147 
148   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
149     const CeedInt node = item_id_x + item_id_y * P_1D;
150     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
151     for (CeedInt comp = 0; comp < num_comp; ++comp)
152       atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device);
153   }
154 }
155 
156 //------------------------------------------------------------------------------
157 // E-vector -> L-vector, strided
158 //------------------------------------------------------------------------------
159 inline void writeDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
160                                const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar *restrict r_v,
161                                global CeedScalar *restrict d_v) {
162   const CeedInt item_id_x = get_local_id(0);
163   const CeedInt item_id_y = get_local_id(1);
164   const CeedInt elem      = get_global_id(2);
165 
166   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
167     const CeedInt node = item_id_x + item_id_y * P_1D;
168     const CeedInt ind  = node * strides_node + elem * strides_elem;
169     for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[comp];
170   }
171 }
172 
173 //------------------------------------------------------------------------------
174 // 3D
175 //------------------------------------------------------------------------------
176 
177 //------------------------------------------------------------------------------
178 // L-vector -> E-vector, offsets provided
179 //------------------------------------------------------------------------------
180 inline void readDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
181                              const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
182   const CeedInt item_id_x = get_local_id(0);
183   const CeedInt item_id_y = get_local_id(1);
184   const CeedInt elem      = get_global_id(2);
185 
186   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
187     for (CeedInt z = 0; z < P_1D; ++z) {
188       const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
189       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
190       for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + strides_comp * comp];
191     }
192   }
193 }
194 
195 //------------------------------------------------------------------------------
196 // L-vector -> E-vector, strided
197 //------------------------------------------------------------------------------
198 inline void readDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
199                               const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar *restrict d_u,
200                               private CeedScalar *restrict r_u) {
201   const CeedInt item_id_x = get_local_id(0);
202   const CeedInt item_id_y = get_local_id(1);
203   const CeedInt elem      = get_global_id(2);
204 
205   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
206     for (CeedInt z = 0; z < P_1D; ++z) {
207       const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
208       const CeedInt ind  = node * strides_node + elem * strides_elem;
209       for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp];
210     }
211   }
212 }
213 
214 //------------------------------------------------------------------------------
215 // E-vector -> Q-vector, offests provided
216 //------------------------------------------------------------------------------
217 inline void readSliceQuadsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt Q_1D, const CeedInt num_elem, const CeedInt q,
218                                    const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
219   const CeedInt item_id_x = get_local_id(0);
220   const CeedInt item_id_y = get_local_id(1);
221   const CeedInt elem      = get_global_id(2);
222 
223   if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) {
224     const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q);
225     const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
226     for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp];
227   }
228 }
229 
230 //------------------------------------------------------------------------------
231 // E-vector -> Q-vector, strided
232 //------------------------------------------------------------------------------
233 inline void readSliceQuadsStrided3d(const CeedInt num_comp, const CeedInt Q_1D, CeedInt strides_node, CeedInt strides_comp, CeedInt strides_elem,
234                                     const CeedInt num_elem, const CeedInt q, const global CeedScalar *restrict d_u,
235                                     private CeedScalar *restrict r_u) {
236   const CeedInt item_id_x = get_local_id(0);
237   const CeedInt item_id_y = get_local_id(1);
238   const CeedInt elem      = get_global_id(2);
239 
240   if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) {
241     const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q);
242     const CeedInt ind  = node * strides_node + elem * strides_elem;
243     for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp];
244   }
245 }
246 
247 //------------------------------------------------------------------------------
248 // E-vector -> L-vector, offsets provided
249 //------------------------------------------------------------------------------
250 inline void writeDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
251                               const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) {
252   const CeedInt item_id_x = get_local_id(0);
253   const CeedInt item_id_y = get_local_id(1);
254   const CeedInt elem      = get_global_id(2);
255 
256   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
257     for (CeedInt z = 0; z < P_1D; ++z) {
258       const CeedInt node = item_id_x + item_id_y * P_1D + z * P_1D * P_1D;
259       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
260       for (CeedInt comp = 0; comp < num_comp; ++comp)
261         atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[z + comp * P_1D], memory_order_relaxed, memory_scope_device);
262     }
263   }
264 }
265 
266 //------------------------------------------------------------------------------
267 // E-vector -> L-vector, strided
268 //------------------------------------------------------------------------------
269 inline void writeDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
270                                const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar *restrict r_v,
271                                global CeedScalar *restrict d_v) {
272   const CeedInt item_id_x = get_local_id(0);
273   const CeedInt item_id_y = get_local_id(1);
274   const CeedInt elem      = get_global_id(2);
275 
276   if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
277     for (CeedInt z = 0; z < P_1D; ++z) {
278       const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
279       const CeedInt ind  = node * strides_node + elem * strides_elem;
280       for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D];
281     }
282   }
283 }
284 
285 //------------------------------------------------------------------------------
286 // 3D collocated derivatives computation
287 //------------------------------------------------------------------------------
288 inline void gradCollo3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar *restrict r_U,
289                         const local CeedScalar *s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
290   const CeedInt item_id_x = get_local_id(0);
291   const CeedInt item_id_y = get_local_id(1);
292 
293   for (CeedInt comp = 0; comp < num_comp; ++comp) {
294     if (item_id_x < Q_1D && item_id_y < Q_1D) {
295       scratch[item_id_x + item_id_y * T_1D] = r_U[q + comp * Q_1D];
296     }
297     work_group_barrier(CLK_LOCAL_MEM_FENCE);
298 
299     if (item_id_x < Q_1D && item_id_y < Q_1D) {
300       // X derivative
301       r_V[comp + 0 * num_comp] = 0.0;
302       for (CeedInt i = 0; i < Q_1D; ++i)
303         r_V[comp + 0 * num_comp] += s_G[i + item_id_x * Q_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction (X derivative)
304 
305       // Y derivative
306       r_V[comp + 1 * num_comp] = 0.0;
307       for (CeedInt i = 0; i < Q_1D; ++i)
308         r_V[comp + 1 * num_comp] += s_G[i + item_id_y * Q_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction (Y derivative)
309 
310       // Z derivative
311       r_V[comp + 2 * num_comp] = 0.0;
312       for (CeedInt i = 0; i < Q_1D; ++i) r_V[comp + 2 * num_comp] += s_G[i + q * Q_1D] * r_U[i + comp * Q_1D];  // Contract z direction (Z derivative)
313     }
314 
315     work_group_barrier(CLK_LOCAL_MEM_FENCE);
316   }
317 }
318 
319 //------------------------------------------------------------------------------
320 // 3D collocated derivatives transpose
321 //------------------------------------------------------------------------------
322 inline void gradColloTranspose3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar *restrict r_U,
323                                  const local CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
324   const CeedInt item_id_x = get_local_id(0);
325   const CeedInt item_id_y = get_local_id(1);
326 
327   for (CeedInt comp = 0; comp < num_comp; ++comp) {
328     // X derivative
329     if (item_id_x < Q_1D && item_id_y < Q_1D) {
330       scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 0 * num_comp];
331     }
332     work_group_barrier(CLK_LOCAL_MEM_FENCE);
333 
334     if (item_id_x < Q_1D && item_id_y < Q_1D) {
335       for (CeedInt i = 0; i < Q_1D; ++i)
336         r_V[q + comp * Q_1D] += s_G[item_id_x + i * Q_1D] * scratch[i + item_id_y * T_1D];  // Contract x direction (X derivative)
337     }
338     work_group_barrier(CLK_LOCAL_MEM_FENCE);
339 
340     // Y derivative
341     if (item_id_x < Q_1D && item_id_y < Q_1D) {
342       scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 1 * num_comp];
343     }
344     work_group_barrier(CLK_LOCAL_MEM_FENCE);
345 
346     if (item_id_x < Q_1D && item_id_y < Q_1D) {
347       for (CeedInt i = 0; i < Q_1D; ++i)
348         r_V[q + comp * Q_1D] += s_G[item_id_y + i * Q_1D] * scratch[item_id_x + i * T_1D];  // Contract y direction (Y derivative)
349     }
350     work_group_barrier(CLK_LOCAL_MEM_FENCE);
351 
352     // Z derivative
353     if (item_id_x < Q_1D && item_id_y < Q_1D) {
354       for (CeedInt i = 0; i < Q_1D; ++i)
355         r_V[i + comp * Q_1D] += s_G[i + q * Q_1D] * r_U[comp + 2 * num_comp];  // PARTIAL contract z direction (Z derivative)
356     }
357   }
358 }
359 
360 //------------------------------------------------------------------------------
361 
362 #endif  // CEED_SYCL_GEN_TEMPLATES_H
363