xref: /libCEED/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp (revision 9ba83ac0e4b1fca39d6fa6737a318a9f0cbc172d)
1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/backend.h>
9 #include <ceed/ceed.h>
10 #include <ceed/jit-tools.h>
11 
12 #include <sycl/sycl.hpp>
13 #include <vector>
14 
15 #include "../sycl/ceed-sycl-compile.hpp"
16 #include "ceed-sycl-ref.hpp"
17 
// Forward declarations of SYCL kernel name tags.
// The tensor kernels are templated on the transpose flag (0/1) so the
// transpose and non-transpose variants are distinct kernels in the bundle.
template <int>
class CeedBasisSyclInterp;
template <int>
class CeedBasisSyclGrad;
class CeedBasisSyclWeight;

// Kernel name tags for the non-tensor basis kernels
class CeedBasisSyclInterpNT;
class CeedBasisSyclGradNT;
class CeedBasisSyclWeightNT;

using SpecID = sycl::specialization_id<CeedInt>;

// Specialization constants for the basis sizes; values are set on the kernel
// bundle and baked in at sycl::build time in CeedBasisCreateTensorH1_Sycl
static constexpr SpecID BASIS_DIM_ID;
static constexpr SpecID BASIS_NUM_COMP_ID;
static constexpr SpecID BASIS_P_1D_ID;
static constexpr SpecID BASIS_Q_1D_ID;
34 
35 //------------------------------------------------------------------------------
36 // Interpolation kernel - tensor
37 //------------------------------------------------------------------------------
38 template <int is_transpose>
39 static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
40                                      const CeedScalar *u, CeedScalar *v) {
41   const CeedInt     buf_len   = impl->buf_len;
42   const CeedInt     op_len    = impl->op_len;
43   const CeedScalar *interp_1d = impl->d_interp_1d;
44 
45   const sycl::device &sycl_device         = sycl_queue.get_device();
46   const CeedInt       max_work_group_size = 32;
47   const CeedInt       work_group_size     = CeedIntMin(impl->num_qpts, max_work_group_size);
48   sycl::range<1>      local_range(work_group_size);
49   sycl::range<1>      global_range(num_elem * work_group_size);
50   sycl::nd_range<1>   kernel_range(global_range, local_range);
51 
52   std::vector<sycl::event> e;
53 
54   if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
55 
56   sycl_queue.submit([&](sycl::handler &cgh) {
57     cgh.depends_on(e);
58     cgh.use_kernel_bundle(sycl_module);
59 
60     sycl::local_accessor<CeedScalar> s_mem(op_len + 2 * buf_len, cgh);
61 
62     cgh.parallel_for<CeedBasisSyclInterp<is_transpose>>(kernel_range, [=](sycl::nd_item<1> work_item, sycl::kernel_handler kh) {
63       //-------------------------------------------------------------->
64       // Retrieve spec constant values
65       const CeedInt dim      = kh.get_specialization_constant<BASIS_DIM_ID>();
66       const CeedInt num_comp = kh.get_specialization_constant<BASIS_NUM_COMP_ID>();
67       const CeedInt P_1d     = kh.get_specialization_constant<BASIS_P_1D_ID>();
68       const CeedInt Q_1d     = kh.get_specialization_constant<BASIS_Q_1D_ID>();
69       //-------------------------------------------------------------->
70       const CeedInt num_nodes     = CeedIntPow(P_1d, dim);
71       const CeedInt num_qpts      = CeedIntPow(Q_1d, dim);
72       const CeedInt P             = is_transpose ? Q_1d : P_1d;
73       const CeedInt Q             = is_transpose ? P_1d : Q_1d;
74       const CeedInt stride_0      = is_transpose ? 1 : P_1d;
75       const CeedInt stride_1      = is_transpose ? P_1d : 1;
76       const CeedInt u_stride      = is_transpose ? num_qpts : num_nodes;
77       const CeedInt v_stride      = is_transpose ? num_nodes : num_qpts;
78       const CeedInt u_comp_stride = num_elem * u_stride;
79       const CeedInt v_comp_stride = num_elem * v_stride;
80       const CeedInt u_size        = u_stride;
81 
82       sycl::group   work_group = work_item.get_group();
83       const CeedInt i          = work_item.get_local_linear_id();
84       const CeedInt group_size = work_group.get_local_linear_range();
85       const CeedInt elem       = work_group.get_group_linear_id();
86 
87       CeedScalar *s_interp_1d = s_mem.get_multi_ptr<sycl::access::decorated::yes>().get();
88       CeedScalar *s_buffer_1  = s_interp_1d + Q * P;
89       CeedScalar *s_buffer_2  = s_buffer_1 + buf_len;
90 
91       for (CeedInt k = i; k < P * Q; k += group_size) {
92         s_interp_1d[k] = interp_1d[k];
93       }
94 
95       // Apply basis element by element
96       for (CeedInt comp = 0; comp < num_comp; comp++) {
97         const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
98         CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
99 
100         for (CeedInt k = i; k < u_size; k += group_size) {
101           s_buffer_1[k] = cur_u[k];
102         }
103 
104         CeedInt pre  = u_size;
105         CeedInt post = 1;
106 
107         for (CeedInt d = 0; d < dim; d++) {
108           // Use older version of sycl workgroup barrier for performance reasons
109           // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed
110           // sycl::group_barrier(work_group);
111           work_item.barrier(sycl::access::fence_space::local_space);
112 
113           pre /= P;
114           const CeedScalar *in  = d % 2 ? s_buffer_2 : s_buffer_1;
115           CeedScalar       *out = d == dim - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
116 
117           // Contract along middle index
118           const CeedInt writeLen = pre * post * Q;
119           for (CeedInt k = i; k < writeLen; k += group_size) {
120             const CeedInt c = k % post;
121             const CeedInt j = (k / post) % Q;
122             const CeedInt a = k / (post * Q);
123 
124             CeedScalar vk = 0;
125             for (CeedInt b = 0; b < P; b++) {
126               vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
127             }
128             out[k] = vk;
129           }
130           post *= Q;
131         }
132       }
133     });
134   });
135   return CEED_ERROR_SUCCESS;
136 }
137 
138 //------------------------------------------------------------------------------
139 // Gradient kernel - tensor
140 //------------------------------------------------------------------------------
141 template <int is_transpose>
142 static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
143                                    const CeedScalar *u, CeedScalar *v) {
144   const CeedInt     buf_len   = impl->buf_len;
145   const CeedInt     op_len    = impl->op_len;
146   const CeedScalar *interp_1d = impl->d_interp_1d;
147   const CeedScalar *grad_1d   = impl->d_grad_1d;
148 
149   const sycl::device &sycl_device     = sycl_queue.get_device();
150   const CeedInt       work_group_size = 32;
151   sycl::range<1>      local_range(work_group_size);
152   sycl::range<1>      global_range(num_elem * work_group_size);
153   sycl::nd_range<1>   kernel_range(global_range, local_range);
154 
155   std::vector<sycl::event> e;
156 
157   if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
158 
159   sycl_queue.submit([&](sycl::handler &cgh) {
160     cgh.depends_on(e);
161     cgh.use_kernel_bundle(sycl_module);
162 
163     sycl::local_accessor<CeedScalar> s_mem(2 * (op_len + buf_len), cgh);
164 
165     cgh.parallel_for<CeedBasisSyclGrad<is_transpose>>(kernel_range, [=](sycl::nd_item<1> work_item, sycl::kernel_handler kh) {
166       //-------------------------------------------------------------->
167       // Retrieve spec constant values
168       const CeedInt dim      = kh.get_specialization_constant<BASIS_DIM_ID>();
169       const CeedInt num_comp = kh.get_specialization_constant<BASIS_NUM_COMP_ID>();
170       const CeedInt P_1d     = kh.get_specialization_constant<BASIS_P_1D_ID>();
171       const CeedInt Q_1d     = kh.get_specialization_constant<BASIS_Q_1D_ID>();
172       //-------------------------------------------------------------->
173       const CeedInt num_nodes     = CeedIntPow(P_1d, dim);
174       const CeedInt num_qpts      = CeedIntPow(Q_1d, dim);
175       const CeedInt P             = is_transpose ? Q_1d : P_1d;
176       const CeedInt Q             = is_transpose ? P_1d : Q_1d;
177       const CeedInt stride_0      = is_transpose ? 1 : P_1d;
178       const CeedInt stride_1      = is_transpose ? P_1d : 1;
179       const CeedInt u_stride      = is_transpose ? num_qpts : num_nodes;
180       const CeedInt v_stride      = is_transpose ? num_nodes : num_qpts;
181       const CeedInt u_comp_stride = num_elem * u_stride;
182       const CeedInt v_comp_stride = num_elem * v_stride;
183       const CeedInt u_dim_stride  = is_transpose ? num_elem * num_qpts * num_comp : 0;
184       const CeedInt v_dim_stride  = is_transpose ? 0 : num_elem * num_qpts * num_comp;
185       sycl::group   work_group    = work_item.get_group();
186       const CeedInt i             = work_item.get_local_linear_id();
187       const CeedInt group_size    = work_group.get_local_linear_range();
188       const CeedInt elem          = work_group.get_group_linear_id();
189 
190       CeedScalar *s_interp_1d = s_mem.get_multi_ptr<sycl::access::decorated::yes>().get();
191       CeedScalar *s_grad_1d   = s_interp_1d + P * Q;
192       CeedScalar *s_buffer_1  = s_grad_1d + P * Q;
193       CeedScalar *s_buffer_2  = s_buffer_1 + buf_len;
194 
195       for (CeedInt k = i; k < P * Q; k += group_size) {
196         s_interp_1d[k] = interp_1d[k];
197         s_grad_1d[k]   = grad_1d[k];
198       }
199 
200       // Apply basis element by element
201       for (CeedInt comp = 0; comp < num_comp; comp++) {
202         for (CeedInt dim_1 = 0; dim_1 < dim; dim_1++) {
203           CeedInt           pre   = is_transpose ? num_qpts : num_nodes;
204           CeedInt           post  = 1;
205           const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
206           CeedScalar       *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
207 
208           for (CeedInt dim_2 = 0; dim_2 < dim; dim_2++) {
209             // Use older version of sycl workgroup barrier for performance reasons
210             // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed
211             // sycl::group_barrier(work_group);
212             work_item.barrier(sycl::access::fence_space::local_space);
213 
214             pre /= P;
215             const CeedScalar *op  = dim_1 == dim_2 ? s_grad_1d : s_interp_1d;
216             const CeedScalar *in  = (dim_2 == 0 ? cur_u : (dim_2 % 2 ? s_buffer_2 : s_buffer_1));
217             CeedScalar       *out = dim_2 == dim - 1 ? cur_v : (dim_2 % 2 ? s_buffer_1 : s_buffer_2);
218 
219             // Contract along middle index
220             const CeedInt writeLen = pre * post * Q;
221             for (CeedInt k = i; k < writeLen; k += group_size) {
222               const CeedInt c = k % post;
223               const CeedInt j = (k / post) % Q;
224               const CeedInt a = k / (post * Q);
225 
226               CeedScalar v_k = 0;
227               for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
228 
229               if (is_transpose && dim_2 == dim - 1) out[k] += v_k;
230               else out[k] = v_k;
231             }
232 
233             post *= Q;
234           }
235         }
236       }
237     });
238   });
239   return CEED_ERROR_SUCCESS;
240 }
241 
242 //------------------------------------------------------------------------------
243 // Weight kernel - tensor
244 //------------------------------------------------------------------------------
245 static int CeedBasisApplyWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, const CeedBasis_Sycl *impl, CeedScalar *w) {
246   const CeedInt     dim         = impl->dim;
247   const CeedInt     Q_1d        = impl->Q_1d;
248   const CeedScalar *q_weight_1d = impl->d_q_weight_1d;
249 
250   const CeedInt  num_quad_x = Q_1d;
251   const CeedInt  num_quad_y = (dim > 1) ? Q_1d : 1;
252   const CeedInt  num_quad_z = (dim > 2) ? Q_1d : 1;
253   sycl::range<3> kernel_range(num_elem * num_quad_z, num_quad_y, num_quad_x);
254 
255   std::vector<sycl::event> e;
256 
257   if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
258 
259   sycl_queue.parallel_for<CeedBasisSyclWeight>(kernel_range, e, [=](sycl::item<3> work_item) {
260     if (dim == 1) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]];
261     if (dim == 2) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]];
262     if (dim == 3) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]] * q_weight_1d[work_item[0] % Q_1d];
263   });
264   return CEED_ERROR_SUCCESS;
265 }
266 
267 //------------------------------------------------------------------------------
268 // Basis apply - tensor
269 //------------------------------------------------------------------------------
// Apply a tensor-product basis: dispatch to the interp/grad/weight kernels
// according to eval_mode, handling device array access for u and v.
//   num_elem  - number of elements
//   t_mode    - CEED_NOTRANSPOSE or CEED_TRANSPOSE
//   eval_mode - CEED_EVAL_{NONE,INTERP,GRAD,WEIGHT}; DIV/CURL unsupported
//   u, v      - input/output vectors (u may be CEED_VECTOR_NONE for WEIGHT)
static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
                               CeedVector v) {
  Ceed              ceed;
  const CeedInt     is_transpose = t_mode == CEED_TRANSPOSE;
  const CeedScalar *d_u;
  CeedScalar       *d_v;
  Ceed_Sycl        *data;
  CeedBasis_Sycl   *impl;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedGetData(ceed, &data));
  CeedCallBackend(CeedBasisGetData(basis, &impl));

  // Get read/write access to u, v; only CEED_EVAL_WEIGHT may omit u
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

  // Clear v for transpose operation (the transpose grad kernel accumulates
  // into v; an out-of-order queue needs a barrier to order the fill)
  if (is_transpose) {
    CeedSize length;
    CeedCallBackend(CeedVectorGetLength(v, &length));
    std::vector<sycl::event> e;

    if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
    data->sycl_queue.fill<CeedScalar>(d_v, 0, length, e);
  }

  // Basis action: select kernel by eval mode and transpose flag
  switch (eval_mode) {
    case CEED_EVAL_INTERP:
      if (is_transpose) {
        CeedCallBackend(CeedBasisApplyInterp_Sycl<true>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
      } else {
        CeedCallBackend(CeedBasisApplyInterp_Sycl<false>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
      }
      break;
    case CEED_EVAL_GRAD:
      if (is_transpose) {
        CeedCallBackend(CeedBasisApplyGrad_Sycl<true>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
      } else {
        CeedCallBackend(CeedBasisApplyGrad_Sycl<false>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
      }
      break;
    case CEED_EVAL_WEIGHT:
      // q_weight_1d is only allocated if weights were provided at creation
      CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[eval_mode]);
      CeedCallBackend(CeedBasisApplyWeight_Sycl(data->sycl_queue, num_elem, impl, d_v));
      break;
    case CEED_EVAL_NONE: /* handled separately below */
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
      // LCOV_EXCL_STOP
  }

  // Restore vectors; CEED_EVAL_NONE copies the input array straight into v
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
334 
335 //------------------------------------------------------------------------------
336 // Interpolation kernel - non-tensor
337 //------------------------------------------------------------------------------
338 static int CeedBasisApplyNonTensorInterp_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, CeedInt is_transpose, const CeedBasisNonTensor_Sycl *impl,
339                                               const CeedScalar *d_U, CeedScalar *d_V) {
340   const CeedInt     num_comp      = impl->num_comp;
341   const CeedInt     P             = is_transpose ? impl->num_qpts : impl->num_nodes;
342   const CeedInt     Q             = is_transpose ? impl->num_nodes : impl->num_qpts;
343   const CeedInt     stride_0      = is_transpose ? 1 : impl->num_nodes;
344   const CeedInt     stride_1      = is_transpose ? impl->num_nodes : 1;
345   const CeedInt     u_stride      = P;
346   const CeedInt     v_stride      = Q;
347   const CeedInt     u_comp_stride = u_stride * num_elem;
348   const CeedInt     v_comp_stride = v_stride * num_elem;
349   const CeedInt     u_size        = P;
350   const CeedInt     v_size        = Q;
351   const CeedScalar *d_B           = impl->d_interp;
352 
353   sycl::range<2> kernel_range(num_elem, v_size);
354 
355   std::vector<sycl::event> e;
356 
357   if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
358 
359   sycl_queue.parallel_for<CeedBasisSyclInterpNT>(kernel_range, e, [=](sycl::id<2> indx) {
360     const CeedInt i    = indx[1];
361     const CeedInt elem = indx[0];
362 
363     for (CeedInt comp = 0; comp < num_comp; comp++) {
364       const CeedScalar *U = d_U + elem * u_stride + comp * u_comp_stride;
365       CeedScalar        V = 0.0;
366 
367       for (CeedInt j = 0; j < u_size; ++j) {
368         V += d_B[i * stride_0 + j * stride_1] * U[j];
369       }
370       d_V[i + elem * v_stride + comp * v_comp_stride] = V;
371     }
372   });
373   return CEED_ERROR_SUCCESS;
374 }
375 
376 //------------------------------------------------------------------------------
377 // Gradient kernel - non-tensor
378 //------------------------------------------------------------------------------
// Apply the non-tensor gradient operator. Forward: one input direction fans
// out to dim output directions; transpose: dim input directions accumulate
// into one output. One work-item computes one output entry per component.
static int CeedBasisApplyNonTensorGrad_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, CeedInt is_transpose, const CeedBasisNonTensor_Sycl *impl,
                                            const CeedScalar *d_U, CeedScalar *d_V) {
  const CeedInt     num_comp      = impl->num_comp;
  // Transpose swaps input/output lengths and reads the operator transposed
  const CeedInt     P             = is_transpose ? impl->num_qpts : impl->num_nodes;
  const CeedInt     Q             = is_transpose ? impl->num_nodes : impl->num_qpts;
  const CeedInt     stride_0      = is_transpose ? 1 : impl->num_nodes;
  const CeedInt     stride_1      = is_transpose ? impl->num_nodes : 1;
  const CeedInt     g_dim_stride  = P * Q;
  const CeedInt     u_stride      = P;
  const CeedInt     v_stride      = Q;
  const CeedInt     u_comp_stride = u_stride * num_elem;
  const CeedInt     v_comp_stride = v_stride * num_elem;
  const CeedInt     u_dim_stride  = u_comp_stride * num_comp;
  const CeedInt     v_dim_stride  = v_comp_stride * num_comp;
  const CeedInt     u_size        = P;
  const CeedInt     v_size        = Q;
  // Loop extents: transpose sums over dim input directions into one output;
  // forward reads one input direction and writes dim output directions
  const CeedInt     in_dim        = is_transpose ? impl->dim : 1;
  const CeedInt     out_dim       = is_transpose ? 1 : impl->dim;
  const CeedScalar *d_G           = impl->d_grad;

  sycl::range<2> kernel_range(num_elem, v_size);

  // Order queue
  std::vector<sycl::event> e;

  if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};

  sycl_queue.parallel_for<CeedBasisSyclGradNT>(kernel_range, e, [=](sycl::id<2> indx) {
    const CeedInt i    = indx[1];
    const CeedInt elem = indx[0];

    for (CeedInt comp = 0; comp < num_comp; comp++) {
      // Accumulators for up to 3 output directions (dim <= 3)
      CeedScalar V[3] = {0.0, 0.0, 0.0};

      for (CeedInt d1 = 0; d1 < in_dim; ++d1) {
        const CeedScalar *U = d_U + elem * u_stride + comp * u_comp_stride + d1 * u_dim_stride;
        const CeedScalar *G = d_G + i * stride_0 + d1 * g_dim_stride;

        for (CeedInt j = 0; j < u_size; ++j) {
          const CeedScalar Uj = U[j];

          for (CeedInt d0 = 0; d0 < out_dim; ++d0) {
            V[d0] += G[j * stride_1 + d0 * g_dim_stride] * Uj;
          }
        }
      }
      for (CeedInt d0 = 0; d0 < out_dim; ++d0) {
        d_V[i + elem * v_stride + comp * v_comp_stride + d0 * v_dim_stride] = V[d0];
      }
    }
  });
  return CEED_ERROR_SUCCESS;
}
431 
432 //------------------------------------------------------------------------------
433 // Weight kernel - non-tensor
434 //------------------------------------------------------------------------------
435 static int CeedBasisApplyNonTensorWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, const CeedBasisNonTensor_Sycl *impl, CeedScalar *d_V) {
436   const CeedInt     num_qpts = impl->num_qpts;
437   const CeedScalar *q_weight = impl->d_q_weight;
438 
439   sycl::range<2> kernel_range(num_elem, num_qpts);
440 
441   std::vector<sycl::event> e;
442 
443   if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
444 
445   sycl_queue.parallel_for<CeedBasisSyclWeightNT>(kernel_range, e, [=](sycl::id<2> indx) {
446     const CeedInt i          = indx[1];
447     const CeedInt elem       = indx[0];
448     d_V[i + elem * num_qpts] = q_weight[i];
449   });
450   return CEED_ERROR_SUCCESS;
451 }
452 
453 //------------------------------------------------------------------------------
454 // Basis apply - non-tensor
455 //------------------------------------------------------------------------------
456 static int CeedBasisApplyNonTensor_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
457                                         CeedVector v) {
458   Ceed                     ceed;
459   const CeedInt            is_transpose = t_mode == CEED_TRANSPOSE;
460   const CeedScalar        *d_u;
461   CeedScalar              *d_v;
462   CeedBasisNonTensor_Sycl *impl;
463   Ceed_Sycl               *data;
464 
465   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
466   CeedCallBackend(CeedBasisGetData(basis, &impl));
467   CeedCallBackend(CeedGetData(ceed, &data));
468 
469   // Get read/write access to u, v
470   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
471   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
472   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
473 
474   // Clear v for transpose operation
475   if (is_transpose) {
476     CeedSize length;
477     CeedCallBackend(CeedVectorGetLength(v, &length));
478     // Order queue
479     sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
480     data->sycl_queue.fill<CeedScalar>(d_v, 0, length, {e});
481   }
482 
483   // Apply basis operation
484   switch (eval_mode) {
485     case CEED_EVAL_INTERP:
486       CeedCallBackend(CeedBasisApplyNonTensorInterp_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v));
487       break;
488     case CEED_EVAL_GRAD:
489       CeedCallBackend(CeedBasisApplyNonTensorGrad_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v));
490       break;
491     case CEED_EVAL_WEIGHT:
492       CeedCheck(impl->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
493       CeedCallBackend(CeedBasisApplyNonTensorWeight_Sycl(data->sycl_queue, num_elem, impl, d_v));
494       break;
495     case CEED_EVAL_NONE: /* handled separately below */
496       break;
497     // LCOV_EXCL_START
498     case CEED_EVAL_DIV:
499     case CEED_EVAL_CURL:
500       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
501       // LCOV_EXCL_STOP
502   }
503 
504   // Restore vectors, cover CEED_EVAL_NONE
505   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
506   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
507   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
508   CeedCallBackend(CeedDestroy(&ceed));
509   return CEED_ERROR_SUCCESS;
510 }
511 
512 //------------------------------------------------------------------------------
513 // Destroy tensor basis
514 //------------------------------------------------------------------------------
515 static int CeedBasisDestroy_Sycl(CeedBasis basis) {
516   Ceed ceed;
517   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
518   CeedBasis_Sycl *impl;
519   CeedCallBackend(CeedBasisGetData(basis, &impl));
520   Ceed_Sycl *data;
521   CeedCallBackend(CeedGetData(ceed, &data));
522 
523   // Wait for all work to finish before freeing memory
524   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
525 
526   if (impl->d_q_weight_1d) CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
527   CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context));
528   CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context));
529 
530   CeedCallBackend(CeedFree(&impl));
531   CeedCallBackend(CeedDestroy(&ceed));
532   return CEED_ERROR_SUCCESS;
533 }
534 
535 //------------------------------------------------------------------------------
536 // Destroy non-tensor basis
537 //------------------------------------------------------------------------------
538 static int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) {
539   Ceed ceed;
540   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
541   CeedBasisNonTensor_Sycl *impl;
542   CeedCallBackend(CeedBasisGetData(basis, &impl));
543   Ceed_Sycl *data;
544   CeedCallBackend(CeedGetData(ceed, &data));
545 
546   // Wait for all work to finish before freeing memory
547   CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
548 
549   if (impl->d_q_weight) CeedCallSycl(ceed, sycl::free(impl->d_q_weight, data->sycl_context));
550   CeedCallSycl(ceed, sycl::free(impl->d_interp, data->sycl_context));
551   CeedCallSycl(ceed, sycl::free(impl->d_grad, data->sycl_context));
552 
553   CeedCallBackend(CeedFree(&impl));
554   CeedCallBackend(CeedDestroy(&ceed));
555   return CEED_ERROR_SUCCESS;
556 }
557 
558 //------------------------------------------------------------------------------
559 // Create tensor
560 //------------------------------------------------------------------------------
// Create a tensor-product H1 basis: copy the 1D operators to the device and
// build a kernel bundle with the basis sizes baked in as specialization
// constants.
//   dim, P_1d, Q_1d - topological dimension and 1D node/quadrature counts
//   interp_1d, grad_1d - host 1D operator matrices (Q_1d x P_1d)
//   q_ref_1d  - 1D reference coordinates; not copied to the device here
//   q_weight_1d - 1D quadrature weights; may be NULL (WEIGHT then unsupported)
int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
  Ceed            ceed;
  CeedBasis_Sycl *impl;
  Ceed_Sycl      *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &impl));
  CeedCallBackend(CeedGetData(ceed, &data));

  CeedInt num_comp;
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));

  const CeedInt num_nodes = CeedIntPow(P_1d, dim);
  const CeedInt num_qpts  = CeedIntPow(Q_1d, dim);

  // Cache sizes used by the apply kernels; buf_len is the per-element local
  // scratch buffer length, op_len the 1D operator length
  impl->dim       = dim;
  impl->P_1d      = P_1d;
  impl->Q_1d      = Q_1d;
  impl->num_comp  = num_comp;
  impl->num_nodes = num_nodes;
  impl->num_qpts  = num_qpts;
  impl->buf_len   = num_comp * CeedIntMax(num_nodes, num_qpts);
  impl->op_len    = Q_1d * P_1d;

  // Order queue before the host-to-device copies
  std::vector<sycl::event> e;

  if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};

  std::vector<sycl::event> copy_events;
  // Weights are optional; d_q_weight_1d stays NULL (calloc) when absent
  if (q_weight_1d) {
    CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
    sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);
    copy_events.push_back(copy_weight);
  }

  const CeedInt interp_length = Q_1d * P_1d;
  CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, e);
  copy_events.push_back(copy_interp);

  // grad_1d has the same Q_1d x P_1d shape as interp_1d
  CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, e);
  copy_events.push_back(copy_grad);

  // Block until all copies complete (host arrays may go out of scope after return)
  CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));

  // Build an executable bundle containing all four tensor kernels with the
  // basis sizes fixed via specialization constants
  std::vector<sycl::kernel_id> kernel_ids = {sycl::get_kernel_id<CeedBasisSyclInterp<1>>(), sycl::get_kernel_id<CeedBasisSyclInterp<0>>(),
                                             sycl::get_kernel_id<CeedBasisSyclGrad<1>>(), sycl::get_kernel_id<CeedBasisSyclGrad<0>>()};

  sycl::kernel_bundle<sycl::bundle_state::input> input_bundle = sycl::get_kernel_bundle<sycl::bundle_state::input>(data->sycl_context, kernel_ids);
  input_bundle.set_specialization_constant<BASIS_DIM_ID>(dim);
  input_bundle.set_specialization_constant<BASIS_NUM_COMP_ID>(num_comp);
  input_bundle.set_specialization_constant<BASIS_Q_1D_ID>(Q_1d);
  input_bundle.set_specialization_constant<BASIS_P_1D_ID>(P_1d);

  // Heap-allocated; released in CeedBasisDestroy_Sycl
  CeedCallSycl(ceed, impl->sycl_module = new SyclModule_t(sycl::build(input_bundle)));

  CeedCallBackend(CeedBasisSetData(basis, impl));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApply_Sycl));
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl));
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
627 
628 //------------------------------------------------------------------------------
629 // Create non-tensor
630 //------------------------------------------------------------------------------
// Create a non-tensor H1 basis: copy the dense operators to the device and
// register the non-tensor apply/destroy backend functions.
//   topo      - element topology (not stored by this backend)
//   interp    - host interpolation matrix (num_qpts x num_nodes)
//   grad      - host gradient matrices (dim x num_qpts x num_nodes)
//   q_ref     - reference coordinates; not copied to the device here
//   q_weight  - quadrature weights; may be NULL (WEIGHT then unsupported)
int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                           const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
  Ceed                     ceed;
  CeedBasisNonTensor_Sycl *impl;
  Ceed_Sycl               *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &impl));
  CeedCallBackend(CeedGetData(ceed, &data));

  CeedInt num_comp;
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));

  // Cache sizes used by the apply kernels
  impl->dim       = dim;
  impl->num_comp  = num_comp;
  impl->num_nodes = num_nodes;
  impl->num_qpts  = num_qpts;

  // Order queue before the host-to-device copies
  std::vector<sycl::event> e;

  if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};

  std::vector<sycl::event> copy_events;
  // Weights are optional; d_q_weight stays NULL (calloc) when absent
  if (q_weight) {
    CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
    sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, e);
    copy_events.push_back(copy_weight);
  }

  const CeedInt interp_length = num_qpts * num_nodes;
  CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, e);
  copy_events.push_back(copy_interp);

  // One gradient matrix per dimension
  const CeedInt grad_length = num_qpts * num_nodes * dim;
  CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context));
  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, e);
  copy_events.push_back(copy_grad);

  // Block until all copies complete (host arrays may go out of scope after return)
  CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));

  CeedCallBackend(CeedBasisSetData(basis, impl));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Sycl));
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Sycl));
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
680 
681 //------------------------------------------------------------------------------
682