xref: /libCEED/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp (revision b3de639df132887ed4e21d8d409407a2c1d2e41c)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other
2 // CEED contributors. All Rights Reserved. See the top-level LICENSE and NOTICE
3 // files for details.
4 //
5 // SPDX-License-Identifier: BSD-2-Clause
6 //
7 // This file is part of CEED:  http://github.com/ceed
8 
9 #include <ceed/backend.h>
10 #include <ceed/ceed.h>
11 #include <ceed/jit-tools.h>
12 
13 #include <string>
14 #include <sycl/sycl.hpp>
15 
16 #include "../sycl/ceed-sycl-compile.hpp"
17 #include "ceed-sycl-ref.hpp"
18 
// Forward declarations of the types used below purely as SYCL kernel names
// (the template arguments to queue::parallel_for); they give each kernel a
// stable, human-readable identity for debugging and profiling tools.
class CeedElemRestrSyclStridedNT;
class CeedElemRestrSyclOffsetNT;
class CeedElemRestrSyclStridedT;
class CeedElemRestrSyclOffsetT;
23 
24 //------------------------------------------------------------------------------
25 // Restriction Kernel : L-vector -> E-vector, strided
26 //------------------------------------------------------------------------------
27 static int CeedElemRestrictionStridedNoTranspose_Sycl(sycl::queue &sycl_queue, const CeedElemRestriction_Sycl *impl, const CeedScalar *u,
28                                                       CeedScalar *v) {
29   const CeedInt  elem_size    = impl->elem_size;
30   const CeedInt  num_elem     = impl->num_elem;
31   const CeedInt  num_comp     = impl->num_comp;
32   const CeedInt  stride_nodes = impl->strides[0];
33   const CeedInt  stride_comp  = impl->strides[1];
34   const CeedInt  stride_elem  = impl->strides[2];
35   sycl::range<1> kernel_range(num_elem * elem_size);
36 
37   // Order queue
38   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
39   sycl_queue.parallel_for<CeedElemRestrSyclStridedNT>(kernel_range, {e}, [=](sycl::id<1> node) {
40     const CeedInt loc_node = node % elem_size;
41     const CeedInt elem     = node / elem_size;
42 
43     for (CeedInt comp = 0; comp < num_comp; comp++) {
44       v[loc_node + comp * elem_size * num_elem + elem * elem_size] = u[loc_node * stride_nodes + comp * stride_comp + elem * stride_elem];
45     }
46   });
47   return CEED_ERROR_SUCCESS;
48 }
49 
50 //------------------------------------------------------------------------------
51 // Restriction Kernel : L-vector -> E-vector, offsets provided
52 //------------------------------------------------------------------------------
53 static int CeedElemRestrictionOffsetNoTranspose_Sycl(sycl::queue &sycl_queue, const CeedElemRestriction_Sycl *impl, const CeedScalar *u,
54                                                      CeedScalar *v) {
55   const CeedInt  elem_size   = impl->elem_size;
56   const CeedInt  num_elem    = impl->num_elem;
57   const CeedInt  num_comp    = impl->num_comp;
58   const CeedInt  comp_stride = impl->comp_stride;
59   const CeedInt *indices     = impl->d_ind;
60 
61   sycl::range<1> kernel_range(num_elem * elem_size);
62 
63   // Order queue
64   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
65   sycl_queue.parallel_for<CeedElemRestrSyclOffsetNT>(kernel_range, {e}, [=](sycl::id<1> node) {
66     const CeedInt ind      = indices[node];
67     const CeedInt loc_node = node % elem_size;
68     const CeedInt elem     = node / elem_size;
69 
70     for (CeedInt comp = 0; comp < num_comp; comp++) {
71       v[loc_node + comp * elem_size * num_elem + elem * elem_size] = u[ind + comp * comp_stride];
72     }
73   });
74   return CEED_ERROR_SUCCESS;
75 }
76 
77 //------------------------------------------------------------------------------
78 // Kernel: E-vector -> L-vector, strided
79 //------------------------------------------------------------------------------
80 static int CeedElemRestrictionStridedTranspose_Sycl(sycl::queue &sycl_queue, const CeedElemRestriction_Sycl *impl, const CeedScalar *u,
81                                                     CeedScalar *v) {
82   const CeedInt elem_size    = impl->elem_size;
83   const CeedInt num_elem     = impl->num_elem;
84   const CeedInt num_comp     = impl->num_comp;
85   const CeedInt stride_nodes = impl->strides[0];
86   const CeedInt stride_comp  = impl->strides[1];
87   const CeedInt stride_elem  = impl->strides[2];
88 
89   sycl::range<1> kernel_range(num_elem * elem_size);
90 
91   // Order queue
92   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
93   sycl_queue.parallel_for<CeedElemRestrSyclStridedT>(kernel_range, {e}, [=](sycl::id<1> node) {
94     const CeedInt loc_node = node % elem_size;
95     const CeedInt elem     = node / elem_size;
96 
97     for (CeedInt comp = 0; comp < num_comp; comp++) {
98       v[loc_node * stride_nodes + comp * stride_comp + elem * stride_elem] += u[loc_node + comp * elem_size * num_elem + elem * elem_size];
99     }
100   });
101   return CEED_ERROR_SUCCESS;
102 }
103 
104 //------------------------------------------------------------------------------
105 // Kernel: E-vector -> L-vector, offsets provided
106 //------------------------------------------------------------------------------
107 static int CeedElemRestrictionOffsetTranspose_Sycl(sycl::queue &sycl_queue, const CeedElemRestriction_Sycl *impl, const CeedScalar *u,
108                                                    CeedScalar *v) {
109   const CeedInt  num_nodes     = impl->num_nodes;
110   const CeedInt  elem_size     = impl->elem_size;
111   const CeedInt  num_elem      = impl->num_elem;
112   const CeedInt  num_comp      = impl->num_comp;
113   const CeedInt  comp_stride   = impl->comp_stride;
114   const CeedInt *l_vec_indices = impl->d_l_vec_indices;
115   const CeedInt *t_offsets     = impl->d_t_offsets;
116   const CeedInt *t_indices     = impl->d_t_indices;
117 
118   sycl::range<1> kernel_range(num_nodes * num_comp);
119 
120   // Order queue
121   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
122   sycl_queue.parallel_for<CeedElemRestrSyclOffsetT>(kernel_range, {e}, [=](sycl::id<1> id) {
123     const CeedInt node    = id % num_nodes;
124     const CeedInt comp    = id / num_nodes;
125     const CeedInt ind     = l_vec_indices[node];
126     const CeedInt range_1 = t_offsets[node];
127     const CeedInt range_N = t_offsets[node + 1];
128     CeedScalar    value   = 0.0;
129 
130     for (CeedInt j = range_1; j < range_N; j++) {
131       const CeedInt t_ind    = t_indices[j];
132       CeedInt       loc_node = t_ind % elem_size;
133       CeedInt       elem     = t_ind / elem_size;
134 
135       value += u[loc_node + comp * elem_size * num_elem + elem * elem_size];
136     }
137     v[ind + comp * comp_stride] += value;
138   });
139   return CEED_ERROR_SUCCESS;
140 }
141 
142 //------------------------------------------------------------------------------
143 // Apply restriction
144 //------------------------------------------------------------------------------
145 static int CeedElemRestrictionApply_Sycl(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) {
146   Ceed                      ceed;
147   Ceed_Sycl                *data;
148   const CeedScalar         *d_u;
149   CeedScalar               *d_v;
150   CeedElemRestriction_Sycl *impl;
151 
152   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
153   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
154   CeedCallBackend(CeedGetData(ceed, &data));
155 
156   // Get vectors
157   CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
158   if (t_mode == CEED_TRANSPOSE) {
159     // Sum into for transpose mode, e-vec to l-vec
160     CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
161   } else {
162     // Overwrite for notranspose mode, l-vec to e-vec
163     CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
164   }
165 
166   // Restrict
167   if (t_mode == CEED_NOTRANSPOSE) {
168     // L-vector -> E-vector
169     if (impl->d_ind) {
170       // -- Offsets provided
171       CeedCallBackend(CeedElemRestrictionOffsetNoTranspose_Sycl(data->sycl_queue, impl, d_u, d_v));
172     } else {
173       // -- Strided restriction
174       CeedCallBackend(CeedElemRestrictionStridedNoTranspose_Sycl(data->sycl_queue, impl, d_u, d_v));
175     }
176   } else {
177     // E-vector -> L-vector
178     if (impl->d_ind) {
179       // -- Offsets provided
180       CeedCallBackend(CeedElemRestrictionOffsetTranspose_Sycl(data->sycl_queue, impl, d_u, d_v));
181     } else {
182       // -- Strided restriction
183       CeedCallBackend(CeedElemRestrictionStridedTranspose_Sycl(data->sycl_queue, impl, d_u, d_v));
184     }
185   }
186   // Wait for queues to be completed. NOTE: This may not be necessary
187   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
188 
189   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL;
190 
191   // Restore arrays
192   CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
193   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
194   return CEED_ERROR_SUCCESS;
195 }
196 
197 //------------------------------------------------------------------------------
198 // Get offsets
199 //------------------------------------------------------------------------------
200 static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction rstr, CeedMemType m_type, const CeedInt **offsets) {
201   Ceed                      ceed;
202   CeedElemRestriction_Sycl *impl;
203 
204   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
205   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
206 
207   switch (m_type) {
208     case CEED_MEM_HOST:
209       *offsets = impl->h_ind;
210       break;
211     case CEED_MEM_DEVICE:
212       *offsets = impl->d_ind;
213       break;
214   }
215   return CEED_ERROR_SUCCESS;
216 }
217 
//------------------------------------------------------------------------------
// Destroy restriction
//------------------------------------------------------------------------------
// Releases all host and device memory owned by the restriction's backend data.
static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction rstr) {
  Ceed                      ceed;
  Ceed_Sycl                *data;
  CeedElemRestriction_Sycl *impl;

  CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  CeedCallBackend(CeedGetData(ceed, &data));

  // Wait for all work to finish before freeing memory
  CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());

  // h_ind_allocated is non-NULL only when this backend owns the host copy
  CeedCallBackend(CeedFree(&impl->h_ind_allocated));
  // Device allocations; for strided restrictions these remain NULL (set in
  // Create), which sycl::free treats as a no-op
  CeedCallSycl(ceed, sycl::free(impl->d_ind_allocated, data->sycl_context));
  CeedCallSycl(ceed, sycl::free(impl->d_t_offsets, data->sycl_context));
  CeedCallSycl(ceed, sycl::free(impl->d_t_indices, data->sycl_context));
  CeedCallSycl(ceed, sycl::free(impl->d_l_vec_indices, data->sycl_context));
  CeedCallBackend(CeedFree(&impl));
  return CEED_ERROR_SUCCESS;
}
241 
242 //------------------------------------------------------------------------------
243 // Create transpose offsets and indices
244 //------------------------------------------------------------------------------
245 static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction rstr, const CeedInt *indices) {
246   Ceed                      ceed;
247   Ceed_Sycl                *data;
248   bool                     *is_node;
249   CeedSize                  l_size;
250   CeedInt                   num_elem, elem_size, num_comp, num_nodes = 0, *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices;
251   CeedElemRestriction_Sycl *impl;
252 
253   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
254   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
255   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
256   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
257   CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
258   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
259 
260   // Count num_nodes
261   CeedCallBackend(CeedCalloc(l_size, &is_node));
262   const CeedInt size_indices = num_elem * elem_size;
263 
264   for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1;
265   for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i];
266   impl->num_nodes = num_nodes;
267 
268   // L-vector offsets array
269   CeedCallBackend(CeedCalloc(l_size, &ind_to_offset));
270   CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices));
271   for (CeedInt i = 0, j = 0; i < l_size; i++) {
272     if (is_node[i]) {
273       l_vec_indices[j] = i;
274       ind_to_offset[i] = j++;
275     }
276   }
277   CeedCallBackend(CeedFree(&is_node));
278 
279   // Compute transpose offsets and indices
280   const CeedInt size_offsets = num_nodes + 1;
281 
282   CeedCallBackend(CeedCalloc(size_offsets, &t_offsets));
283   CeedCallBackend(CeedMalloc(size_indices, &t_indices));
284   // Count node multiplicity
285   for (CeedInt e = 0; e < num_elem; ++e) {
286     for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1];
287   }
288   // Convert to running sum
289   for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1];
290   // List all E-vec indices associated with L-vec node
291   for (CeedInt e = 0; e < num_elem; ++e) {
292     for (CeedInt i = 0; i < elem_size; ++i) {
293       const CeedInt lid                          = elem_size * e + i;
294       const CeedInt gid                          = indices[lid];
295       t_indices[t_offsets[ind_to_offset[gid]]++] = lid;
296     }
297   }
298   // Reset running sum
299   for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1];
300   t_offsets[0] = 0;
301 
302   // Copy data to device
303   CeedCallBackend(CeedGetData(ceed, &data));
304 
305   // Order queue
306   sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
307 
308   // -- L-vector indices
309   CeedCallSycl(ceed, impl->d_l_vec_indices = sycl::malloc_device<CeedInt>(num_nodes, data->sycl_device, data->sycl_context));
310   sycl::event copy_lvec = data->sycl_queue.copy<CeedInt>(l_vec_indices, impl->d_l_vec_indices, num_nodes, {e});
311   // -- Transpose offsets
312   CeedCallSycl(ceed, impl->d_t_offsets = sycl::malloc_device<CeedInt>(size_offsets, data->sycl_device, data->sycl_context));
313   sycl::event copy_offsets = data->sycl_queue.copy<CeedInt>(t_offsets, impl->d_t_offsets, size_offsets, {e});
314   // -- Transpose indices
315   CeedCallSycl(ceed, impl->d_t_indices = sycl::malloc_device<CeedInt>(size_indices, data->sycl_device, data->sycl_context));
316   sycl::event copy_indices = data->sycl_queue.copy<CeedInt>(t_indices, impl->d_t_indices, size_indices, {e});
317 
318   // Wait for all copies to complete and handle exceptions
319   CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_lvec, copy_offsets, copy_indices}));
320 
321   // Cleanup
322   CeedCallBackend(CeedFree(&ind_to_offset));
323   CeedCallBackend(CeedFree(&l_vec_indices));
324   CeedCallBackend(CeedFree(&t_offsets));
325   CeedCallBackend(CeedFree(&t_indices));
326   return CEED_ERROR_SUCCESS;
327 }
328 
//------------------------------------------------------------------------------
// Create restriction
//------------------------------------------------------------------------------
// Backend constructor: allocates and fills CeedElemRestriction_Sycl, mirrors
// the indices between host and device as needed, builds the transpose data
// (via CeedElemRestrictionOffset_Sycl) when offsets are provided, and
// registers the backend functions. `orients`/`curl_orients` are accepted for
// interface compatibility only — oriented/curl-oriented restrictions are
// rejected by the CeedCheck below.
int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients,
                                   const CeedInt8 *curl_orients, CeedElemRestriction rstr) {
  Ceed                      ceed;
  Ceed_Sycl                *data;
  bool                      is_strided;
  CeedInt                   num_elem, num_comp, elem_size, comp_stride = 1;
  CeedRestrictionType       rstr_type;
  CeedElemRestriction_Sycl *impl;

  CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
  CeedCallBackend(CeedGetData(ceed, &data));
  CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
  CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
  CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
  // Total number of E-vector nodes; default strides are the backend layout
  const CeedInt size       = num_elem * elem_size;
  CeedInt       strides[3] = {1, size, elem_size};

  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
  CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND,
            "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented");

  // Stride data: user strides override the defaults; non-strided restrictions
  // use comp_stride instead
  CeedCallBackend(CeedElemRestrictionIsStrided(rstr, &is_strided));
  if (is_strided) {
    bool has_backend_strides;

    CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
    if (!has_backend_strides) {
      CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
    }
  } else {
    CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
  }

  // Populate backend data; index pointers stay NULL for strided restrictions
  CeedCallBackend(CeedCalloc(1, &impl));
  impl->h_ind           = NULL;
  impl->h_ind_allocated = NULL;
  impl->d_ind           = NULL;
  impl->d_ind_allocated = NULL;
  impl->d_t_indices     = NULL;
  impl->d_t_offsets     = NULL;
  // num_nodes defaults to the E-vector size; overwritten by
  // CeedElemRestrictionOffset_Sycl when offsets are provided
  impl->num_nodes       = size;
  impl->num_elem        = num_elem;
  impl->num_comp        = num_comp;
  impl->elem_size       = elem_size;
  impl->comp_stride     = comp_stride;
  impl->strides[0]      = strides[0];
  impl->strides[1]      = strides[1];
  impl->strides[2]      = strides[2];
  CeedCallBackend(CeedElemRestrictionSetData(rstr, impl));

  // Set layouts
  {
    bool    has_backend_strides;
    CeedInt layout[3] = {1, size, elem_size};

    CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout));
    if (rstr_type == CEED_RESTRICTION_STRIDED) {
      CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
      if (has_backend_strides) {
        CeedCallBackend(CeedElemRestrictionSetLLayout(rstr, layout));
      }
    }
  }

  // Set up device indices/offset arrays; whichever side the user supplied,
  // both a host copy and a device copy end up available
  switch (mem_type) {
    case CEED_MEM_HOST: {
      switch (copy_mode) {
        case CEED_OWN_POINTER:
          impl->h_ind_allocated = (CeedInt *)indices;
          impl->h_ind           = (CeedInt *)indices;
          break;
        case CEED_USE_POINTER:
          impl->h_ind = (CeedInt *)indices;
          break;
        case CEED_COPY_VALUES:
          if (indices != NULL) {
            CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated));
            memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt));
            impl->h_ind = impl->h_ind_allocated;
          }
          break;
      }
      if (indices != NULL) {
        CeedCallSycl(ceed, impl->d_ind = sycl::malloc_device<CeedInt>(size, data->sycl_device, data->sycl_context));
        impl->d_ind_allocated = impl->d_ind;  // We own the device memory
        // Order queue
        sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
        // Copy from host to device
        sycl::event copy_event = data->sycl_queue.copy<CeedInt>(indices, impl->d_ind, size, {e});
        // Wait for copy to finish and handle exceptions
        CeedCallSycl(ceed, copy_event.wait_and_throw());
        // Build transpose offsets/indices from the host copy
        CeedCallBackend(CeedElemRestrictionOffset_Sycl(rstr, indices));
      }
    } break;
    case CEED_MEM_DEVICE: {
      switch (copy_mode) {
        case CEED_COPY_VALUES:
          if (indices != NULL) {
            CeedCallSycl(ceed, impl->d_ind = sycl::malloc_device<CeedInt>(size, data->sycl_device, data->sycl_context));
            impl->d_ind_allocated = impl->d_ind;  // We own the device memory
                                                  // Copy from device to device
            // Order queue
            sycl::event e          = data->sycl_queue.ext_oneapi_submit_barrier();
            sycl::event copy_event = data->sycl_queue.copy<CeedInt>(indices, impl->d_ind, size, {e});
            // Wait for copy to finish and handle exceptions
            CeedCallSycl(ceed, copy_event.wait_and_throw());
          }
          break;
        case CEED_OWN_POINTER:
          impl->d_ind           = (CeedInt *)indices;
          impl->d_ind_allocated = impl->d_ind;
          break;
        case CEED_USE_POINTER:
          impl->d_ind = (CeedInt *)indices;
      }
      if (indices != NULL) {
        // Mirror the device indices to the host; the host copy is needed by
        // GetOffsets(CEED_MEM_HOST) and by CeedElemRestrictionOffset_Sycl
        CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated));
        // Order queue
        sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
        // Copy from device to host
        sycl::event copy_event = data->sycl_queue.copy<CeedInt>(impl->d_ind, impl->h_ind_allocated, elem_size * num_elem, {e});
        CeedCallSycl(ceed, copy_event.wait_and_throw());
        impl->h_ind = impl->h_ind_allocated;
        // NOTE(review): Offset_Sycl reads `indices` on the host — for the
        // device path this appears to rely on the pointer being host-accessible;
        // confirm whether impl->h_ind was intended here
        CeedCallBackend(CeedElemRestrictionOffset_Sycl(rstr, indices));
      }
    }
  }

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Sycl));
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApply_Sycl));
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApply_Sycl));
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Sycl));
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Sycl));
  return CEED_ERROR_SUCCESS;
}
470