// xref: /libCEED/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp (revision 6b3fb9aeb84aa459c3cafb15157ad7b579ff74f0)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/backend.h>
9 #include <ceed/ceed.h>
10 #include <ceed/jit-tools.h>
11 
12 #include <sycl/sycl.hpp>
13 #include <vector>
14 
15 #include "../sycl/ceed-sycl-compile.hpp"
16 #include "ceed-sycl-ref.hpp"
17 
// Kernel name types for the tensor basis kernels; the <int> template parameter
// distinguishes the transpose (1) from the no-transpose (0) instantiation
template <int>
class CeedBasisSyclInterp;
template <int>
class CeedBasisSyclGrad;
class CeedBasisSyclWeight;

// Kernel name types for the non-tensor ("NT") basis kernels
class CeedBasisSyclInterpNT;
class CeedBasisSyclGradNT;
class CeedBasisSyclWeightNT;

using SpecID = sycl::specialization_id<CeedInt>;

// Specialization constants: basis sizes are baked into the kernel bundle at
// build time (see CeedBasisCreateTensorH1_Sycl) instead of being passed at launch
static constexpr SpecID BASIS_DIM_ID;
static constexpr SpecID BASIS_NUM_COMP_ID;
static constexpr SpecID BASIS_P_1D_ID;
static constexpr SpecID BASIS_Q_1D_ID;
34 
35 //------------------------------------------------------------------------------
36 // Interpolation kernel - tensor
37 //------------------------------------------------------------------------------
38 template <int is_transpose>
39 static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
40                                      const CeedScalar *u, CeedScalar *v) {
41   const CeedInt     buf_len   = impl->buf_len;
42   const CeedInt     op_len    = impl->op_len;
43   const CeedScalar *interp_1d = impl->d_interp_1d;
44 
45   const sycl::device &sycl_device         = sycl_queue.get_device();
46   const CeedInt       max_work_group_size = 32;
47   const CeedInt       work_group_size     = CeedIntMin(impl->num_qpts, max_work_group_size);
48   sycl::range<1>      local_range(work_group_size);
49   sycl::range<1>      global_range(num_elem * work_group_size);
50   sycl::nd_range<1>   kernel_range(global_range, local_range);
51 
52   // Order queue
53   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
54   sycl_queue.submit([&](sycl::handler &cgh) {
55     cgh.depends_on({e});
56     cgh.use_kernel_bundle(sycl_module);
57 
58     sycl::local_accessor<CeedScalar> s_mem(op_len + 2 * buf_len, cgh);
59 
60     cgh.parallel_for<CeedBasisSyclInterp<is_transpose>>(kernel_range, [=](sycl::nd_item<1> work_item, sycl::kernel_handler kh) {
61       //-------------------------------------------------------------->
62       // Retrieve spec constant values
63       const CeedInt dim      = kh.get_specialization_constant<BASIS_DIM_ID>();
64       const CeedInt num_comp = kh.get_specialization_constant<BASIS_NUM_COMP_ID>();
65       const CeedInt P_1d     = kh.get_specialization_constant<BASIS_P_1D_ID>();
66       const CeedInt Q_1d     = kh.get_specialization_constant<BASIS_Q_1D_ID>();
67       //-------------------------------------------------------------->
68       const CeedInt num_nodes     = CeedIntPow(P_1d, dim);
69       const CeedInt num_qpts      = CeedIntPow(Q_1d, dim);
70       const CeedInt P             = is_transpose ? Q_1d : P_1d;
71       const CeedInt Q             = is_transpose ? P_1d : Q_1d;
72       const CeedInt stride_0      = is_transpose ? 1 : P_1d;
73       const CeedInt stride_1      = is_transpose ? P_1d : 1;
74       const CeedInt u_stride      = is_transpose ? num_qpts : num_nodes;
75       const CeedInt v_stride      = is_transpose ? num_nodes : num_qpts;
76       const CeedInt u_comp_stride = num_elem * u_stride;
77       const CeedInt v_comp_stride = num_elem * v_stride;
78       const CeedInt u_size        = u_stride;
79 
80       sycl::group   work_group = work_item.get_group();
81       const CeedInt i          = work_item.get_local_linear_id();
82       const CeedInt group_size = work_group.get_local_linear_range();
83       const CeedInt elem       = work_group.get_group_linear_id();
84 
85       CeedScalar *s_interp_1d = s_mem.get_multi_ptr<sycl::access::decorated::yes>().get();
86       CeedScalar *s_buffer_1  = s_interp_1d + Q * P;
87       CeedScalar *s_buffer_2  = s_buffer_1 + buf_len;
88 
89       for (CeedInt k = i; k < P * Q; k += group_size) {
90         s_interp_1d[k] = interp_1d[k];
91       }
92 
93       // Apply basis element by element
94       for (CeedInt comp = 0; comp < num_comp; comp++) {
95         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
96         CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
97 
98         for (CeedInt k = i; k < u_size; k += group_size) {
99           s_buffer_1[k] = cur_u[k];
100         }
101 
102         CeedInt pre  = u_size;
103         CeedInt post = 1;
104 
105         for (CeedInt d = 0; d < dim; d++) {
106           // Use older version of sycl workgroup barrier for performance reasons
107           // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed
108           // sycl::group_barrier(work_group);
109           work_item.barrier(sycl::access::fence_space::local_space);
110 
111           pre /= P;
112           const CeedScalar *in  = d % 2 ? s_buffer_2 : s_buffer_1;
113           CeedScalar       *out = d == dim - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
114 
115           // Contract along middle index
116           const CeedInt writeLen = pre * post * Q;
117           for (CeedInt k = i; k < writeLen; k += group_size) {
118             const CeedInt c = k % post;
119             const CeedInt j = (k / post) % Q;
120             const CeedInt a = k / (post * Q);
121 
122             CeedScalar vk = 0;
123             for (CeedInt b = 0; b < P; b++) {
124               vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
125             }
126             out[k] = vk;
127           }
128           post *= Q;
129         }
130       }
131     });
132   });
133   return CEED_ERROR_SUCCESS;
134 }
135 
136 //------------------------------------------------------------------------------
137 // Gradient kernel - tensor
138 //------------------------------------------------------------------------------
139 template <int is_transpose>
140 static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
141                                    const CeedScalar *u, CeedScalar *v) {
142   const CeedInt     buf_len   = impl->buf_len;
143   const CeedInt     op_len    = impl->op_len;
144   const CeedScalar *interp_1d = impl->d_interp_1d;
145   const CeedScalar *grad_1d   = impl->d_grad_1d;
146 
147   const sycl::device &sycl_device     = sycl_queue.get_device();
148   const CeedInt       work_group_size = 32;
149   sycl::range<1>      local_range(work_group_size);
150   sycl::range<1>      global_range(num_elem * work_group_size);
151   sycl::nd_range<1>   kernel_range(global_range, local_range);
152 
153   // Order queue
154   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
155   sycl_queue.submit([&](sycl::handler &cgh) {
156     cgh.depends_on({e});
157     cgh.use_kernel_bundle(sycl_module);
158 
159     sycl::local_accessor<CeedScalar> s_mem(2 * (op_len + buf_len), cgh);
160 
161     cgh.parallel_for<CeedBasisSyclGrad<is_transpose>>(kernel_range, [=](sycl::nd_item<1> work_item, sycl::kernel_handler kh) {
162       //-------------------------------------------------------------->
163       // Retrieve spec constant values
164       const CeedInt dim      = kh.get_specialization_constant<BASIS_DIM_ID>();
165       const CeedInt num_comp = kh.get_specialization_constant<BASIS_NUM_COMP_ID>();
166       const CeedInt P_1d     = kh.get_specialization_constant<BASIS_P_1D_ID>();
167       const CeedInt Q_1d     = kh.get_specialization_constant<BASIS_Q_1D_ID>();
168       //-------------------------------------------------------------->
169       const CeedInt num_nodes     = CeedIntPow(P_1d, dim);
170       const CeedInt num_qpts      = CeedIntPow(Q_1d, dim);
171       const CeedInt P             = is_transpose ? Q_1d : P_1d;
172       const CeedInt Q             = is_transpose ? P_1d : Q_1d;
173       const CeedInt stride_0      = is_transpose ? 1 : P_1d;
174       const CeedInt stride_1      = is_transpose ? P_1d : 1;
175       const CeedInt u_stride      = is_transpose ? num_qpts : num_nodes;
176       const CeedInt v_stride      = is_transpose ? num_nodes : num_qpts;
177       const CeedInt u_comp_stride = num_elem * u_stride;
178       const CeedInt v_comp_stride = num_elem * v_stride;
179       const CeedInt u_dim_stride  = is_transpose ? num_elem * num_qpts * num_comp : 0;
180       const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * num_qpts * num_comp;
181       sycl::group   work_group    = work_item.get_group();
182       const CeedInt i             = work_item.get_local_linear_id();
183       const CeedInt group_size    = work_group.get_local_linear_range();
184       const CeedInt elem          = work_group.get_group_linear_id();
185 
186       CeedScalar *s_interp_1d = s_mem.get_multi_ptr<sycl::access::decorated::yes>().get();
187       CeedScalar *s_grad_1d   = s_interp_1d + P * Q;
188       CeedScalar *s_buffer_1  = s_grad_1d + P * Q;
189       CeedScalar *s_buffer_2  = s_buffer_1 + buf_len;
190 
191       for (CeedInt k = i; k < P * Q; k += group_size) {
192         s_interp_1d[k] = interp_1d[k];
193         s_grad_1d[k]   = grad_1d[k];
194       }
195 
196       // Apply basis element by element
197       for (CeedInt comp = 0; comp < num_comp; comp++) {
198         for (CeedInt dim_1 = 0; dim_1 < dim; dim_1++) {
199           CeedInt           pre   = is_transpose ? num_qpts : num_nodes;
200           CeedInt           post  = 1;
201           const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
202           CeedScalar       *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
203 
204           for (CeedInt dim_2 = 0; dim_2 < dim; dim_2++) {
205             // Use older version of sycl workgroup barrier for performance reasons
206             // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed
207             // sycl::group_barrier(work_group);
208             work_item.barrier(sycl::access::fence_space::local_space);
209 
210             pre /= P;
211             const CeedScalar *op  = dim_1 == dim_2 ? s_grad_1d : s_interp_1d;
212             const CeedScalar *in  = (dim_2 == 0 ? cur_u : (dim_2 % 2 ? s_buffer_2 : s_buffer_1));
213             CeedScalar       *out = dim_2 == dim - 1 ? cur_v : (dim_2 % 2 ? s_buffer_1 : s_buffer_2);
214 
215             // Contract along middle index
216             const CeedInt writeLen = pre * post * Q;
217             for (CeedInt k = i; k < writeLen; k += group_size) {
218               const CeedInt c = k % post;
219               const CeedInt j = (k / post) % Q;
220               const CeedInt a = k / (post * Q);
221 
222               CeedScalar v_k = 0;
223               for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
224 
225               if (is_transpose && dim_2 == dim - 1) out[k] += v_k;
226               else out[k] = v_k;
227             }
228 
229             post *= Q;
230           }
231         }
232       }
233     });
234   });
235   return CEED_ERROR_SUCCESS;
236 }
237 
238 //------------------------------------------------------------------------------
239 // Weight kernel - tensor
240 //------------------------------------------------------------------------------
241 static int CeedBasisApplyWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, const CeedBasis_Sycl *impl, CeedScalar *w) {
242   const CeedInt     dim         = impl->dim;
243   const CeedInt     Q_1d        = impl->Q_1d;
244   const CeedScalar *q_weight_1d = impl->d_q_weight_1d;
245 
246   const CeedInt  num_quad_x = Q_1d;
247   const CeedInt  num_quad_y = (dim > 1) ? Q_1d : 1;
248   const CeedInt  num_quad_z = (dim > 2) ? Q_1d : 1;
249   sycl::range<3> kernel_range(num_elem * num_quad_z, num_quad_y, num_quad_x);
250 
251   // Order queue
252   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
253   sycl_queue.parallel_for<CeedBasisSyclWeight>(kernel_range, {e}, [=](sycl::item<3> work_item) {
254     if (dim == 1) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]];
255     if (dim == 2) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]];
256     if (dim == 3) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]] * q_weight_1d[work_item[0] % Q_1d];
257   });
258   return CEED_ERROR_SUCCESS;
259 }
260 
261 //------------------------------------------------------------------------------
262 // Basis apply - tensor
263 //------------------------------------------------------------------------------
264 static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
265                                CeedVector v) {
266   Ceed              ceed;
267   const CeedInt     is_transpose = t_mode == CEED_TRANSPOSE;
268   const CeedScalar *d_u;
269   CeedScalar       *d_v;
270   Ceed_Sycl        *data;
271   CeedBasis_Sycl   *impl;
272 
273   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
274   CeedCallBackend(CeedGetData(ceed, &data));
275   CeedCallBackend(CeedBasisGetData(basis, &impl));
276 
277   // Get read/write access to u, v
278   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
279   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
280   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
281 
282   // Clear v for transpose operation
283   if (is_transpose) {
284     CeedSize length;
285     CeedCallBackend(CeedVectorGetLength(v, &length));
286     // Order queue
287     sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
288     data->sycl_queue.fill<CeedScalar>(d_v, 0, length, {e});
289   }
290 
291   // Basis action
292   switch (eval_mode) {
293     case CEED_EVAL_INTERP: {
294       if (is_transpose) {
295         CeedCallBackend(CeedBasisApplyInterp_Sycl<CEED_TRANSPOSE>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
296       } else {
297         CeedCallBackend(CeedBasisApplyInterp_Sycl<CEED_NOTRANSPOSE>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
298       }
299     } break;
300     case CEED_EVAL_GRAD: {
301       if (is_transpose) {
302         CeedCallBackend(CeedBasisApplyGrad_Sycl<1>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
303       } else {
304         CeedCallBackend(CeedBasisApplyGrad_Sycl<0>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
305       }
306     } break;
307     case CEED_EVAL_WEIGHT: {
308       CeedCallBackend(CeedBasisApplyWeight_Sycl(data->sycl_queue, num_elem, impl, d_v));
309     } break;
310     case CEED_EVAL_NONE: /* handled separately below */
311       break;
312     // LCOV_EXCL_START
313     case CEED_EVAL_DIV:
314     case CEED_EVAL_CURL:
315       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
316       // LCOV_EXCL_STOP
317   }
318 
319   // Restore vectors, cover CEED_EVAL_NONE
320   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
321   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
322   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
323   return CEED_ERROR_SUCCESS;
324 }
325 
326 //------------------------------------------------------------------------------
327 // Interpolation kernel - non-tensor
328 //------------------------------------------------------------------------------
329 static int CeedBasisApplyNonTensorInterp_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, CeedInt is_transpose, const CeedBasisNonTensor_Sycl *impl,
330                                               const CeedScalar *d_U, CeedScalar *d_V) {
331   const CeedInt     num_comp      = impl->num_comp;
332   const CeedInt     P             = is_transpose ? impl->num_qpts : impl->num_nodes;
333   const CeedInt     Q             = is_transpose ? impl->num_nodes : impl->num_qpts;
334   const CeedInt     stride_0      = is_transpose ? 1 : impl->num_nodes;
335   const CeedInt     stride_1      = is_transpose ? impl->num_nodes : 1;
336   const CeedInt     u_stride      = P;
337   const CeedInt     v_stride      = Q;
338   const CeedInt     u_comp_stride = u_stride * num_elem;
339   const CeedInt     v_comp_stride = v_stride * num_elem;
340   const CeedInt     u_size        = P;
341   const CeedInt     v_size        = Q;
342   const CeedScalar *d_B           = impl->d_interp;
343 
344   sycl::range<2> kernel_range(num_elem, v_size);
345 
346   // Order queue
347   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
348   sycl_queue.parallel_for<CeedBasisSyclInterpNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
349     const CeedInt i    = indx[1];
350     const CeedInt elem = indx[0];
351 
352     for (CeedInt comp = 0; comp < num_comp; comp++) {
353       const CeedScalar *U = d_U + elem * u_stride + comp * u_comp_stride;
354       CeedScalar        V = 0.0;
355 
356       for (CeedInt j = 0; j < u_size; ++j) {
357         V += d_B[i * stride_0 + j * stride_1] * U[j];
358       }
359       d_V[i + elem * v_stride + comp * v_comp_stride] = V;
360     }
361   });
362   return CEED_ERROR_SUCCESS;
363 }
364 
365 //------------------------------------------------------------------------------
366 // Gradient kernel - non-tensor
367 //------------------------------------------------------------------------------
// Apply the non-tensor gradient operator. Forward: nodes -> (dim x qpts);
// transpose: (dim x qpts) -> nodes, summing the per-dimension contributions.
static int CeedBasisApplyNonTensorGrad_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, CeedInt is_transpose, const CeedBasisNonTensor_Sycl *impl,
                                            const CeedScalar *d_U, CeedScalar *d_V) {
  const CeedInt     num_comp      = impl->num_comp;
  const CeedInt     P             = is_transpose ? impl->num_qpts : impl->num_nodes;  // contraction (input) extent
  const CeedInt     Q             = is_transpose ? impl->num_nodes : impl->num_qpts;  // output extent
  const CeedInt     stride_0      = is_transpose ? 1 : impl->num_nodes;               // operator stride over outputs
  const CeedInt     stride_1      = is_transpose ? impl->num_nodes : 1;               // operator stride over inputs
  const CeedInt     g_dim_stride  = P * Q;  // d_grad holds one num_qpts x num_nodes block per dimension
  const CeedInt     u_stride      = P;
  const CeedInt     v_stride      = Q;
  const CeedInt     u_comp_stride = u_stride * num_elem;
  const CeedInt     v_comp_stride = v_stride * num_elem;
  const CeedInt     u_dim_stride  = u_comp_stride * num_comp;
  const CeedInt     v_dim_stride  = v_comp_stride * num_comp;
  const CeedInt     u_size        = P;
  const CeedInt     v_size        = Q;
  // Transpose reads dim input blocks and writes one output block; forward reads
  // one input block and writes dim output blocks
  const CeedInt     in_dim        = is_transpose ? impl->dim : 1;
  const CeedInt     out_dim       = is_transpose ? 1 : impl->dim;
  const CeedScalar *d_G           = impl->d_grad;

  // One work-item per (element, output entry)
  sycl::range<2> kernel_range(num_elem, v_size);

  // Order queue
  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
  sycl_queue.parallel_for<CeedBasisSyclGradNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
    const CeedInt i    = indx[1];
    const CeedInt elem = indx[0];

    for (CeedInt comp = 0; comp < num_comp; comp++) {
      // Per-dimension accumulators; assumes dim <= 3 (out_dim never exceeds impl->dim)
      CeedScalar V[3] = {0.0, 0.0, 0.0};

      for (CeedInt d1 = 0; d1 < in_dim; ++d1) {
        const CeedScalar *U = d_U + elem * u_stride + comp * u_comp_stride + d1 * u_dim_stride;
        const CeedScalar *G = d_G + i * stride_0 + d1 * g_dim_stride;

        for (CeedInt j = 0; j < u_size; ++j) {
          const CeedScalar Uj = U[j];

          // In forward mode d1 == 0 and d0 walks the operator's dimension blocks;
          // in transpose mode d0 == 0 and the d1 blocks all fold into V[0]
          for (CeedInt d0 = 0; d0 < out_dim; ++d0) {
            V[d0] += G[j * stride_1 + d0 * g_dim_stride] * Uj;
          }
        }
      }
      for (CeedInt d0 = 0; d0 < out_dim; ++d0) {
        d_V[i + elem * v_stride + comp * v_comp_stride + d0 * v_dim_stride] = V[d0];
      }
    }
  });
  return CEED_ERROR_SUCCESS;
}
418 
419 //------------------------------------------------------------------------------
420 // Weight kernel - non-tensor
421 //------------------------------------------------------------------------------
422 static int CeedBasisApplyNonTensorWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, const CeedBasisNonTensor_Sycl *impl, CeedScalar *d_V) {
423   const CeedInt     num_qpts = impl->num_qpts;
424   const CeedScalar *q_weight = impl->d_q_weight;
425 
426   sycl::range<2> kernel_range(num_elem, num_qpts);
427 
428   // Order queue
429   sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
430   sycl_queue.parallel_for<CeedBasisSyclWeightNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
431     const CeedInt i          = indx[1];
432     const CeedInt elem       = indx[0];
433     d_V[i + elem * num_qpts] = q_weight[i];
434   });
435   return CEED_ERROR_SUCCESS;
436 }
437 
438 //------------------------------------------------------------------------------
439 // Basis apply - non-tensor
440 //------------------------------------------------------------------------------
441 static int CeedBasisApplyNonTensor_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
442                                         CeedVector v) {
443   Ceed                     ceed;
444   const CeedInt            is_transpose = t_mode == CEED_TRANSPOSE;
445   const CeedScalar        *d_u;
446   CeedScalar              *d_v;
447   CeedBasisNonTensor_Sycl *impl;
448   Ceed_Sycl               *data;
449 
450   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
451   CeedCallBackend(CeedBasisGetData(basis, &impl));
452   CeedCallBackend(CeedGetData(ceed, &data));
453 
454   // Get read/write access to u, v
455   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
456   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
457   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
458 
459   // Clear v for transpose operation
460   if (is_transpose) {
461     CeedSize length;
462     CeedCallBackend(CeedVectorGetLength(v, &length));
463     // Order queue
464     sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
465     data->sycl_queue.fill<CeedScalar>(d_v, 0, length, {e});
466   }
467 
468   // Apply basis operation
469   switch (eval_mode) {
470     case CEED_EVAL_INTERP: {
471       CeedCallBackend(CeedBasisApplyNonTensorInterp_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v));
472     } break;
473     case CEED_EVAL_GRAD: {
474       CeedCallBackend(CeedBasisApplyNonTensorGrad_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v));
475     } break;
476     case CEED_EVAL_WEIGHT: {
477       CeedCallBackend(CeedBasisApplyNonTensorWeight_Sycl(data->sycl_queue, num_elem, impl, d_v));
478     } break;
479     case CEED_EVAL_NONE: /* handled separately below */
480       break;
481     // LCOV_EXCL_START
482     case CEED_EVAL_DIV:
483     case CEED_EVAL_CURL:
484       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
485       // LCOV_EXCL_STOP
486   }
487 
488   // Restore vectors, cover CEED_EVAL_NONE
489   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
490   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
491   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
492 
493   return CEED_ERROR_SUCCESS;
494 }
495 
496 //------------------------------------------------------------------------------
497 // Destroy tensor basis
498 //------------------------------------------------------------------------------
499 static int CeedBasisDestroy_Sycl(CeedBasis basis) {
500   Ceed ceed;
501   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
502   CeedBasis_Sycl *impl;
503   CeedCallBackend(CeedBasisGetData(basis, &impl));
504   Ceed_Sycl *data;
505   CeedCallBackend(CeedGetData(ceed, &data));
506 
507   // Wait for all work to finish before freeing memory
508   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
509 
510   CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
511   CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context));
512   CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context));
513 
514   CeedCallBackend(CeedFree(&impl));
515   return CEED_ERROR_SUCCESS;
516 }
517 
518 //------------------------------------------------------------------------------
519 // Destroy non-tensor basis
520 //------------------------------------------------------------------------------
521 static int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) {
522   Ceed ceed;
523   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
524   CeedBasisNonTensor_Sycl *impl;
525   CeedCallBackend(CeedBasisGetData(basis, &impl));
526   Ceed_Sycl *data;
527   CeedCallBackend(CeedGetData(ceed, &data));
528 
529   // Wait for all work to finish before freeing memory
530   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
531 
532   CeedCallSycl(ceed, sycl::free(impl->d_q_weight, data->sycl_context));
533   CeedCallSycl(ceed, sycl::free(impl->d_interp, data->sycl_context));
534   CeedCallSycl(ceed, sycl::free(impl->d_grad, data->sycl_context));
535 
536   CeedCallBackend(CeedFree(&impl));
537   return CEED_ERROR_SUCCESS;
538 }
539 
540 //------------------------------------------------------------------------------
541 // Create tensor
542 //------------------------------------------------------------------------------
543 int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
544                                  const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
545   Ceed ceed;
546   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
547   CeedBasis_Sycl *impl;
548   CeedCallBackend(CeedCalloc(1, &impl));
549   Ceed_Sycl *data;
550   CeedCallBackend(CeedGetData(ceed, &data));
551 
552   CeedInt num_comp;
553   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
554 
555   const CeedInt num_nodes = CeedIntPow(P_1d, dim);
556   const CeedInt num_qpts  = CeedIntPow(Q_1d, dim);
557 
558   impl->dim       = dim;
559   impl->P_1d      = P_1d;
560   impl->Q_1d      = Q_1d;
561   impl->num_comp  = num_comp;
562   impl->num_nodes = num_nodes;
563   impl->num_qpts  = num_qpts;
564   impl->buf_len   = num_comp * CeedIntMax(num_nodes, num_qpts);
565   impl->op_len    = Q_1d * P_1d;
566 
567   // Order queue
568   sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
569 
570   CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
571   sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, {e});
572 
573   const CeedInt interp_length = Q_1d * P_1d;
574   CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
575   sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, {e});
576 
577   CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
578   sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, {e});
579 
580   CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
581 
582   std::vector<sycl::kernel_id> kernel_ids = {sycl::get_kernel_id<CeedBasisSyclInterp<1>>(), sycl::get_kernel_id<CeedBasisSyclInterp<0>>(),
583                                              sycl::get_kernel_id<CeedBasisSyclGrad<1>>(), sycl::get_kernel_id<CeedBasisSyclGrad<0>>()};
584 
585   sycl::kernel_bundle<sycl::bundle_state::input> input_bundle = sycl::get_kernel_bundle<sycl::bundle_state::input>(data->sycl_context, kernel_ids);
586   input_bundle.set_specialization_constant<BASIS_DIM_ID>(dim);
587   input_bundle.set_specialization_constant<BASIS_NUM_COMP_ID>(num_comp);
588   input_bundle.set_specialization_constant<BASIS_Q_1D_ID>(Q_1d);
589   input_bundle.set_specialization_constant<BASIS_P_1D_ID>(P_1d);
590 
591   CeedCallSycl(ceed, impl->sycl_module = new SyclModule_t(sycl::build(input_bundle)));
592 
593   CeedCallBackend(CeedBasisSetData(basis, impl));
594 
595   // Register backend functions
596   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApply_Sycl));
597   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl));
598   return CEED_ERROR_SUCCESS;
599 }
600 
601 //------------------------------------------------------------------------------
602 // Create non-tensor
603 //------------------------------------------------------------------------------
604 int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
605                            const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
606   Ceed ceed;
607   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
608   CeedBasisNonTensor_Sycl *impl;
609   CeedCallBackend(CeedCalloc(1, &impl));
610   Ceed_Sycl *data;
611   CeedCallBackend(CeedGetData(ceed, &data));
612 
613   CeedInt num_comp;
614   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
615 
616   impl->dim       = dim;
617   impl->num_comp  = num_comp;
618   impl->num_nodes = num_nodes;
619   impl->num_qpts  = num_qpts;
620 
621   // Order queue
622   sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
623 
624   CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
625   sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, {e});
626 
627   const CeedInt interp_length = num_qpts * num_nodes;
628   CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
629   sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, {e});
630 
631   const CeedInt grad_length = num_qpts * num_nodes * dim;
632   CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context));
633   sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, {e});
634 
635   CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
636 
637   CeedCallBackend(CeedBasisSetData(basis, impl));
638 
639   // Register backend functions
640   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Sycl));
641   CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Sycl));
642   return CEED_ERROR_SUCCESS;
643 }
644 
645 //------------------------------------------------------------------------------
646