backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp

// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed
*bd882c8aSJames Wright
*bd882c8aSJames Wright#include <ceed/backend.h>
*bd882c8aSJames Wright#include <ceed/ceed.h>
*bd882c8aSJames Wright#include <ceed/jit-tools.h>
*bd882c8aSJames Wright
*bd882c8aSJames Wright#include <sycl/sycl.hpp>
*bd882c8aSJames Wright#include <vector>
*bd882c8aSJames Wright
*bd882c8aSJames Wright#include "../sycl/ceed-sycl-compile.hpp"
*bd882c8aSJames Wright#include "ceed-sycl-ref.hpp"
*bd882c8aSJames Wright
*bd882c8aSJames Wrighttemplate <int>
*bd882c8aSJames Wrightclass CeedBasisSyclInterp;
*bd882c8aSJames Wrighttemplate <int>
*bd882c8aSJames Wrightclass CeedBasisSyclGrad;
*bd882c8aSJames Wrightclass CeedBasisSyclWeight;
*bd882c8aSJames Wright
*bd882c8aSJames Wrightclass CeedBasisSyclInterpNT;
*bd882c8aSJames Wrightclass CeedBasisSyclGradNT;
*bd882c8aSJames Wrightclass CeedBasisSyclWeightNT;
*bd882c8aSJames Wright
*bd882c8aSJames Wrightusing SpecID = sycl::specialization_id<CeedInt>;
*bd882c8aSJames Wright
*bd882c8aSJames Wrightstatic constexpr SpecID BASIS_DIM_ID;
*bd882c8aSJames Wrightstatic constexpr SpecID BASIS_NUM_COMP_ID;
*bd882c8aSJames Wrightstatic constexpr SpecID BASIS_P_1D_ID;
*bd882c8aSJames Wrightstatic constexpr SpecID BASIS_Q_1D_ID;
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// Interpolation kernel - tensor
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrighttemplate <int transpose>
*bd882c8aSJames Wrightstatic int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
*bd882c8aSJames Wright                                     const CeedScalar *u, CeedScalar *v) {
*bd882c8aSJames Wright  const CeedInt     buf_len   = impl->buf_len;
*bd882c8aSJames Wright  const CeedInt     op_len    = impl->op_len;
*bd882c8aSJames Wright  const CeedScalar *interp_1d = impl->d_interp_1d;
*bd882c8aSJames Wright
*bd882c8aSJames Wright  const sycl::device &sycl_device         = sycl_queue.get_device();
*bd882c8aSJames Wright  const CeedInt       max_work_group_size = 32;
*bd882c8aSJames Wright  const CeedInt       work_group_size     = CeedIntMin(impl->num_qpts, max_work_group_size);
*bd882c8aSJames Wright  sycl::range<1>      local_range(work_group_size);
*bd882c8aSJames Wright  sycl::range<1>      global_range(num_elem * work_group_size);
*bd882c8aSJames Wright  sycl::nd_range<1>   kernel_range(global_range, local_range);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Order queue
*bd882c8aSJames Wright  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
*bd882c8aSJames Wright  sycl_queue.submit([&](sycl::handler &cgh) {
*bd882c8aSJames Wright    cgh.depends_on({e});
*bd882c8aSJames Wright    cgh.use_kernel_bundle(sycl_module);
*bd882c8aSJames Wright
*bd882c8aSJames Wright    sycl::local_accessor<CeedScalar> s_mem(op_len + 2 * buf_len, cgh);
*bd882c8aSJames Wright
*bd882c8aSJames Wright    cgh.parallel_for<CeedBasisSyclInterp<transpose>>(kernel_range, [=](sycl::nd_item<1> work_item, sycl::kernel_handler kh) {
*bd882c8aSJames Wright      //-------------------------------------------------------------->
*bd882c8aSJames Wright      // Retrieve spec constant values
*bd882c8aSJames Wright      const CeedInt dim      = kh.get_specialization_constant<BASIS_DIM_ID>();
*bd882c8aSJames Wright      const CeedInt num_comp = kh.get_specialization_constant<BASIS_NUM_COMP_ID>();
*bd882c8aSJames Wright      const CeedInt P_1d     = kh.get_specialization_constant<BASIS_P_1D_ID>();
*bd882c8aSJames Wright      const CeedInt Q_1d     = kh.get_specialization_constant<BASIS_Q_1D_ID>();
*bd882c8aSJames Wright      //-------------------------------------------------------------->
*bd882c8aSJames Wright      const CeedInt num_nodes     = CeedIntPow(P_1d, dim);
*bd882c8aSJames Wright      const CeedInt num_qpts      = CeedIntPow(Q_1d, dim);
*bd882c8aSJames Wright      const CeedInt P             = transpose ? Q_1d : P_1d;
*bd882c8aSJames Wright      const CeedInt Q             = transpose ? P_1d : Q_1d;
*bd882c8aSJames Wright      const CeedInt stride_0      = transpose ? 1 : P_1d;
*bd882c8aSJames Wright      const CeedInt stride_1      = transpose ? P_1d : 1;
*bd882c8aSJames Wright      const CeedInt u_stride      = transpose ? num_qpts : num_nodes;
*bd882c8aSJames Wright      const CeedInt v_stride      = transpose ? num_nodes : num_qpts;
*bd882c8aSJames Wright      const CeedInt u_comp_stride = num_elem * u_stride;
*bd882c8aSJames Wright      const CeedInt v_comp_stride = num_elem * v_stride;
*bd882c8aSJames Wright      const CeedInt u_size        = u_stride;
*bd882c8aSJames Wright
*bd882c8aSJames Wright      sycl::group   work_group = work_item.get_group();
*bd882c8aSJames Wright      const CeedInt i          = work_item.get_local_linear_id();
*bd882c8aSJames Wright      const CeedInt group_size = work_group.get_local_linear_range();
*bd882c8aSJames Wright      const CeedInt elem       = work_group.get_group_linear_id();
*bd882c8aSJames Wright
*bd882c8aSJames Wright      CeedScalar *s_interp_1d = s_mem.get_pointer();
*bd882c8aSJames Wright      CeedScalar *s_buffer_1  = s_interp_1d + Q * P;
*bd882c8aSJames Wright      CeedScalar *s_buffer_2  = s_buffer_1 + buf_len;
*bd882c8aSJames Wright
*bd882c8aSJames Wright      for (CeedInt k = i; k < P * Q; k += group_size) {
*bd882c8aSJames Wright        s_interp_1d[k] = interp_1d[k];
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright
*bd882c8aSJames Wright      // Apply basis element by element
*bd882c8aSJames Wright      for (CeedInt comp = 0; comp < num_comp; comp++) {
*bd882c8aSJames Wright        const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
*bd882c8aSJames Wright        CeedScalar       *cur_v = v + elem * v_stride + comp * v_comp_stride;
*bd882c8aSJames Wright
*bd882c8aSJames Wright        for (CeedInt k = i; k < u_size; k += group_size) {
*bd882c8aSJames Wright          s_buffer_1[k] = cur_u[k];
*bd882c8aSJames Wright        }
*bd882c8aSJames Wright
*bd882c8aSJames Wright        CeedInt pre  = u_size;
*bd882c8aSJames Wright        CeedInt post = 1;
*bd882c8aSJames Wright
*bd882c8aSJames Wright        for (CeedInt d = 0; d < dim; d++) {
*bd882c8aSJames Wright          // Use older version of sycl workgroup barrier for performance reasons
*bd882c8aSJames Wright          // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed
*bd882c8aSJames Wright          // sycl::group_barrier(work_group);
*bd882c8aSJames Wright          work_item.barrier(sycl::access::fence_space::local_space);
*bd882c8aSJames Wright
*bd882c8aSJames Wright          pre /= P;
*bd882c8aSJames Wright          const CeedScalar *in  = d % 2 ? s_buffer_2 : s_buffer_1;
*bd882c8aSJames Wright          CeedScalar       *out = d == dim - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
*bd882c8aSJames Wright
*bd882c8aSJames Wright          // Contract along middle index
*bd882c8aSJames Wright          const CeedInt writeLen = pre * post * Q;
*bd882c8aSJames Wright          for (CeedInt k = i; k < writeLen; k += group_size) {
*bd882c8aSJames Wright            const CeedInt c = k % post;
*bd882c8aSJames Wright            const CeedInt j = (k / post) % Q;
*bd882c8aSJames Wright            const CeedInt a = k / (post * Q);
*bd882c8aSJames Wright
*bd882c8aSJames Wright            CeedScalar vk = 0;
*bd882c8aSJames Wright            for (CeedInt b = 0; b < P; b++) {
*bd882c8aSJames Wright              vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
*bd882c8aSJames Wright            }
*bd882c8aSJames Wright            out[k] = vk;
*bd882c8aSJames Wright          }
*bd882c8aSJames Wright          post *= Q;
*bd882c8aSJames Wright        }
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    });
*bd882c8aSJames Wright  });
*bd882c8aSJames Wright  return CEED_ERROR_SUCCESS;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// Gradient kernel - tensor
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrighttemplate <int transpose>
*bd882c8aSJames Wrightstatic int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
*bd882c8aSJames Wright                                   const CeedScalar *u, CeedScalar *v) {
*bd882c8aSJames Wright  const CeedInt     buf_len   = impl->buf_len;
*bd882c8aSJames Wright  const CeedInt     op_len    = impl->op_len;
*bd882c8aSJames Wright  const CeedScalar *interp_1d = impl->d_interp_1d;
*bd882c8aSJames Wright  const CeedScalar *grad_1d   = impl->d_grad_1d;
*bd882c8aSJames Wright
*bd882c8aSJames Wright  const sycl::device &sycl_device     = sycl_queue.get_device();
*bd882c8aSJames Wright  const CeedInt       work_group_size = 32;
*bd882c8aSJames Wright  sycl::range<1>      local_range(work_group_size);
*bd882c8aSJames Wright  sycl::range<1>      global_range(num_elem * work_group_size);
*bd882c8aSJames Wright  sycl::nd_range<1>   kernel_range(global_range, local_range);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Order queue
*bd882c8aSJames Wright  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
*bd882c8aSJames Wright  sycl_queue.submit([&](sycl::handler &cgh) {
*bd882c8aSJames Wright    cgh.depends_on({e});
*bd882c8aSJames Wright    cgh.use_kernel_bundle(sycl_module);
*bd882c8aSJames Wright
*bd882c8aSJames Wright    sycl::local_accessor<CeedScalar> s_mem(2 * (op_len + buf_len), cgh);
*bd882c8aSJames Wright
*bd882c8aSJames Wright    cgh.parallel_for<CeedBasisSyclGrad<transpose>>(kernel_range, [=](sycl::nd_item<1> work_item, sycl::kernel_handler kh) {
*bd882c8aSJames Wright      //-------------------------------------------------------------->
*bd882c8aSJames Wright      // Retrieve spec constant values
*bd882c8aSJames Wright      const CeedInt dim      = kh.get_specialization_constant<BASIS_DIM_ID>();
*bd882c8aSJames Wright      const CeedInt num_comp = kh.get_specialization_constant<BASIS_NUM_COMP_ID>();
*bd882c8aSJames Wright      const CeedInt P_1d     = kh.get_specialization_constant<BASIS_P_1D_ID>();
*bd882c8aSJames Wright      const CeedInt Q_1d     = kh.get_specialization_constant<BASIS_Q_1D_ID>();
*bd882c8aSJames Wright      //-------------------------------------------------------------->
*bd882c8aSJames Wright      const CeedInt num_nodes     = CeedIntPow(P_1d, dim);
*bd882c8aSJames Wright      const CeedInt num_qpts      = CeedIntPow(Q_1d, dim);
*bd882c8aSJames Wright      const CeedInt P             = transpose ? Q_1d : P_1d;
*bd882c8aSJames Wright      const CeedInt Q             = transpose ? P_1d : Q_1d;
*bd882c8aSJames Wright      const CeedInt stride_0      = transpose ? 1 : P_1d;
*bd882c8aSJames Wright      const CeedInt stride_1      = transpose ? P_1d : 1;
*bd882c8aSJames Wright      const CeedInt u_stride      = transpose ? num_qpts : num_nodes;
*bd882c8aSJames Wright      const CeedInt v_stride      = transpose ? num_nodes : num_qpts;
*bd882c8aSJames Wright      const CeedInt u_comp_stride = num_elem * u_stride;
*bd882c8aSJames Wright      const CeedInt v_comp_stride = num_elem * v_stride;
*bd882c8aSJames Wright      const CeedInt u_dim_stride  = transpose ? num_elem * num_qpts * num_comp : 0;
*bd882c8aSJames Wright      const CeedInt v_dim_stride  = transpose ? 0 : num_elem * num_qpts * num_comp;
*bd882c8aSJames Wright
*bd882c8aSJames Wright      sycl::group   work_group = work_item.get_group();
*bd882c8aSJames Wright      const CeedInt i          = work_item.get_local_linear_id();
*bd882c8aSJames Wright      const CeedInt group_size = work_group.get_local_linear_range();
*bd882c8aSJames Wright      const CeedInt elem       = work_group.get_group_linear_id();
*bd882c8aSJames Wright
*bd882c8aSJames Wright      CeedScalar *s_interp_1d = s_mem.get_pointer();
*bd882c8aSJames Wright      CeedScalar *s_grad_1d   = s_interp_1d + P * Q;
*bd882c8aSJames Wright      CeedScalar *s_buffer_1  = s_grad_1d + P * Q;
*bd882c8aSJames Wright      CeedScalar *s_buffer_2  = s_buffer_1 + buf_len;
*bd882c8aSJames Wright
*bd882c8aSJames Wright      for (CeedInt k = i; k < P * Q; k += group_size) {
*bd882c8aSJames Wright        s_interp_1d[k] = interp_1d[k];
*bd882c8aSJames Wright        s_grad_1d[k]   = grad_1d[k];
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright
*bd882c8aSJames Wright      // Apply basis element by element
*bd882c8aSJames Wright      for (CeedInt comp = 0; comp < num_comp; comp++) {
*bd882c8aSJames Wright        for (CeedInt dim_1 = 0; dim_1 < dim; dim_1++) {
*bd882c8aSJames Wright          CeedInt           pre   = transpose ? num_qpts : num_nodes;
*bd882c8aSJames Wright          CeedInt           post  = 1;
*bd882c8aSJames Wright          const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
*bd882c8aSJames Wright          CeedScalar       *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
*bd882c8aSJames Wright
*bd882c8aSJames Wright          for (CeedInt dim_2 = 0; dim_2 < dim; dim_2++) {
*bd882c8aSJames Wright            // Use older version of sycl workgroup barrier for performance reasons
*bd882c8aSJames Wright            // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed
*bd882c8aSJames Wright            // sycl::group_barrier(work_group);
*bd882c8aSJames Wright            work_item.barrier(sycl::access::fence_space::local_space);
*bd882c8aSJames Wright
*bd882c8aSJames Wright            pre /= P;
*bd882c8aSJames Wright            const CeedScalar *op  = dim_1 == dim_2 ? s_grad_1d : s_interp_1d;
*bd882c8aSJames Wright            const CeedScalar *in  = (dim_2 == 0 ? cur_u : (dim_2 % 2 ? s_buffer_2 : s_buffer_1));
*bd882c8aSJames Wright            CeedScalar       *out = dim_2 == dim - 1 ? cur_v : (dim_2 % 2 ? s_buffer_1 : s_buffer_2);
*bd882c8aSJames Wright
*bd882c8aSJames Wright            // Contract along middle index
*bd882c8aSJames Wright            const CeedInt writeLen = pre * post * Q;
*bd882c8aSJames Wright            for (CeedInt k = i; k < writeLen; k += group_size) {
*bd882c8aSJames Wright              const CeedInt c = k % post;
*bd882c8aSJames Wright              const CeedInt j = (k / post) % Q;
*bd882c8aSJames Wright              const CeedInt a = k / (post * Q);
*bd882c8aSJames Wright
*bd882c8aSJames Wright              CeedScalar v_k = 0;
*bd882c8aSJames Wright              for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
*bd882c8aSJames Wright
*bd882c8aSJames Wright              if (transpose && dim_2 == dim - 1) out[k] += v_k;
*bd882c8aSJames Wright              else out[k] = v_k;
*bd882c8aSJames Wright            }
*bd882c8aSJames Wright
*bd882c8aSJames Wright            post *= Q;
*bd882c8aSJames Wright          }
*bd882c8aSJames Wright        }
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    });
*bd882c8aSJames Wright  });
*bd882c8aSJames Wright  return CEED_ERROR_SUCCESS;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// Weight kernel - tensor
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightstatic int CeedBasisApplyWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, const CeedBasis_Sycl *impl, CeedScalar *w) {
*bd882c8aSJames Wright  const CeedInt     dim         = impl->dim;
*bd882c8aSJames Wright  const CeedInt     Q_1d        = impl->Q_1d;
*bd882c8aSJames Wright  const CeedScalar *q_weight_1d = impl->d_q_weight_1d;
*bd882c8aSJames Wright
*bd882c8aSJames Wright  const CeedInt  num_quad_x = Q_1d;
*bd882c8aSJames Wright  const CeedInt  num_quad_y = (dim > 1) ? Q_1d : 1;
*bd882c8aSJames Wright  const CeedInt  num_quad_z = (dim > 2) ? Q_1d : 1;
*bd882c8aSJames Wright  sycl::range<3> kernel_range(num_elem * num_quad_z, num_quad_y, num_quad_x);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Order queue
*bd882c8aSJames Wright  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
*bd882c8aSJames Wright  sycl_queue.parallel_for<CeedBasisSyclWeight>(kernel_range, {e}, [=](sycl::item<3> work_item) {
*bd882c8aSJames Wright    if (dim == 1) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]];
*bd882c8aSJames Wright    if (dim == 2) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]];
*bd882c8aSJames Wright    if (dim == 3) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]] * q_weight_1d[work_item[0] % Q_1d];
*bd882c8aSJames Wright  });
*bd882c8aSJames Wright  return CEED_ERROR_SUCCESS;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// Basis apply - tensor
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightstatic int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
*bd882c8aSJames Wright                               CeedVector v) {
*bd882c8aSJames Wright  Ceed ceed;
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
*bd882c8aSJames Wright  Ceed_Sycl *data;
*bd882c8aSJames Wright  CeedCallBackend(CeedGetData(ceed, &data));
*bd882c8aSJames Wright  CeedBasis_Sycl *impl;
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisGetData(basis, &impl));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  const CeedInt transpose = t_mode == CEED_TRANSPOSE;
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Read vectors
*bd882c8aSJames Wright  const CeedScalar *d_u;
*bd882c8aSJames Wright  CeedScalar       *d_v;
*bd882c8aSJames Wright  if (eval_mode != CEED_EVAL_WEIGHT) {
*bd882c8aSJames Wright    CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Clear v for transpose operation
*bd882c8aSJames Wright  if (t_mode == CEED_TRANSPOSE) {
*bd882c8aSJames Wright    CeedSize length;
*bd882c8aSJames Wright    CeedCallBackend(CeedVectorGetLength(v, &length));
*bd882c8aSJames Wright    // Order queue
*bd882c8aSJames Wright    sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
*bd882c8aSJames Wright    data->sycl_queue.fill<CeedScalar>(d_v, 0, length, {e});
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Basis action
*bd882c8aSJames Wright  switch (eval_mode) {
*bd882c8aSJames Wright    case CEED_EVAL_INTERP: {
*bd882c8aSJames Wright      if (transpose) {
*bd882c8aSJames Wright        CeedCallBackend(CeedBasisApplyInterp_Sycl<CEED_TRANSPOSE>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
*bd882c8aSJames Wright      } else {
*bd882c8aSJames Wright        CeedCallBackend(CeedBasisApplyInterp_Sycl<CEED_NOTRANSPOSE>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    } break;
*bd882c8aSJames Wright    case CEED_EVAL_GRAD: {
*bd882c8aSJames Wright      if (transpose) {
*bd882c8aSJames Wright        CeedCallBackend(CeedBasisApplyGrad_Sycl<1>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
*bd882c8aSJames Wright      } else {
*bd882c8aSJames Wright        CeedCallBackend(CeedBasisApplyGrad_Sycl<0>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    } break;
*bd882c8aSJames Wright    case CEED_EVAL_WEIGHT: {
*bd882c8aSJames Wright      CeedCallBackend(CeedBasisApplyWeight_Sycl(data->sycl_queue, num_elem, impl, d_v));
*bd882c8aSJames Wright    } break;
*bd882c8aSJames Wright    // LCOV_EXCL_START
*bd882c8aSJames Wright    // Evaluate the divergence to/from the quadrature points
*bd882c8aSJames Wright    case CEED_EVAL_DIV:
*bd882c8aSJames Wright      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
*bd882c8aSJames Wright    // Evaluate the curl to/from the quadrature points
*bd882c8aSJames Wright    case CEED_EVAL_CURL:
*bd882c8aSJames Wright      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
*bd882c8aSJames Wright    // Take no action, BasisApply should not have been called
*bd882c8aSJames Wright    case CEED_EVAL_NONE:
*bd882c8aSJames Wright      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
*bd882c8aSJames Wright      // LCOV_EXCL_STOP
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Restore vectors
*bd882c8aSJames Wright  if (eval_mode != CEED_EVAL_WEIGHT) {
*bd882c8aSJames Wright    CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
*bd882c8aSJames Wright  return CEED_ERROR_SUCCESS;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// Interpolation kernel - non-tensor
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightstatic int CeedBasisApplyNonTensorInterp_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, CeedInt transpose, const CeedBasisNonTensor_Sycl *impl,
*bd882c8aSJames Wright                                              const CeedScalar *d_U, CeedScalar *d_V) {
*bd882c8aSJames Wright  const CeedInt     num_comp      = impl->num_comp;
*bd882c8aSJames Wright  const CeedInt     P             = transpose ? impl->num_qpts : impl->num_nodes;
*bd882c8aSJames Wright  const CeedInt     Q             = transpose ? impl->num_nodes : impl->num_qpts;
*bd882c8aSJames Wright  const CeedInt     stride_0      = transpose ? 1 : impl->num_nodes;
*bd882c8aSJames Wright  const CeedInt     stride_1      = transpose ? impl->num_nodes : 1;
*bd882c8aSJames Wright  const CeedInt     u_stride      = P;
*bd882c8aSJames Wright  const CeedInt     v_stride      = Q;
*bd882c8aSJames Wright  const CeedInt     u_comp_stride = u_stride * num_elem;
*bd882c8aSJames Wright  const CeedInt     v_comp_stride = v_stride * num_elem;
*bd882c8aSJames Wright  const CeedInt     u_size        = P;
*bd882c8aSJames Wright  const CeedInt     v_size        = Q;
*bd882c8aSJames Wright  const CeedScalar *d_B           = impl->d_interp;
*bd882c8aSJames Wright
*bd882c8aSJames Wright  sycl::range<2> kernel_range(num_elem, v_size);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Order queue
*bd882c8aSJames Wright  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
*bd882c8aSJames Wright  sycl_queue.parallel_for<CeedBasisSyclInterpNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
*bd882c8aSJames Wright    const CeedInt i    = indx[1];
*bd882c8aSJames Wright    const CeedInt elem = indx[0];
*bd882c8aSJames Wright
*bd882c8aSJames Wright    for (CeedInt comp = 0; comp < num_comp; comp++) {
*bd882c8aSJames Wright      const CeedScalar *U = d_U + elem * u_stride + comp * u_comp_stride;
*bd882c8aSJames Wright      CeedScalar        V = 0.0;
*bd882c8aSJames Wright
*bd882c8aSJames Wright      for (CeedInt j = 0; j < u_size; ++j) {
*bd882c8aSJames Wright        V += d_B[i * stride_0 + j * stride_1] * U[j];
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright      d_V[i + elem * v_stride + comp * v_comp_stride] = V;
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright  });
*bd882c8aSJames Wright  return CEED_ERROR_SUCCESS;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// Gradient kernel - non-tensor
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightstatic int CeedBasisApplyNonTensorGrad_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, CeedInt transpose, const CeedBasisNonTensor_Sycl *impl,
*bd882c8aSJames Wright                                            const CeedScalar *d_U, CeedScalar *d_V) {
*bd882c8aSJames Wright  const CeedInt     num_comp      = impl->num_comp;
*bd882c8aSJames Wright  const CeedInt     P             = transpose ? impl->num_qpts : impl->num_nodes;
*bd882c8aSJames Wright  const CeedInt     Q             = transpose ? impl->num_nodes : impl->num_qpts;
*bd882c8aSJames Wright  const CeedInt     stride_0      = transpose ? 1 : impl->num_nodes;
*bd882c8aSJames Wright  const CeedInt     stride_1      = transpose ? impl->num_nodes : 1;
*bd882c8aSJames Wright  const CeedInt     g_dim_stride  = P * Q;
*bd882c8aSJames Wright  const CeedInt     u_stride      = P;
*bd882c8aSJames Wright  const CeedInt     v_stride      = Q;
*bd882c8aSJames Wright  const CeedInt     u_comp_stride = u_stride * num_elem;
*bd882c8aSJames Wright  const CeedInt     v_comp_stride = v_stride * num_elem;
*bd882c8aSJames Wright  const CeedInt     u_dim_stride  = u_comp_stride * num_comp;
*bd882c8aSJames Wright  const CeedInt     v_dim_stride  = v_comp_stride * num_comp;
*bd882c8aSJames Wright  const CeedInt     u_size        = P;
*bd882c8aSJames Wright  const CeedInt     v_size        = Q;
*bd882c8aSJames Wright  const CeedInt     in_dim        = transpose ? impl->dim : 1;
*bd882c8aSJames Wright  const CeedInt     out_dim       = transpose ? 1 : impl->dim;
*bd882c8aSJames Wright  const CeedScalar *d_G           = impl->d_grad;
*bd882c8aSJames Wright
*bd882c8aSJames Wright  sycl::range<2> kernel_range(num_elem, v_size);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Order queue
*bd882c8aSJames Wright  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
*bd882c8aSJames Wright  sycl_queue.parallel_for<CeedBasisSyclGradNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
*bd882c8aSJames Wright    const CeedInt i    = indx[1];
*bd882c8aSJames Wright    const CeedInt elem = indx[0];
*bd882c8aSJames Wright
*bd882c8aSJames Wright    for (CeedInt comp = 0; comp < num_comp; comp++) {
*bd882c8aSJames Wright      CeedScalar V[3] = {0.0, 0.0, 0.0};
*bd882c8aSJames Wright
*bd882c8aSJames Wright      for (CeedInt d1 = 0; d1 < in_dim; ++d1) {
*bd882c8aSJames Wright        const CeedScalar *U = d_U + elem * u_stride + comp * u_comp_stride + d1 * u_dim_stride;
*bd882c8aSJames Wright        const CeedScalar *G = d_G + i * stride_0 + d1 * g_dim_stride;
*bd882c8aSJames Wright
*bd882c8aSJames Wright        for (CeedInt j = 0; j < u_size; ++j) {
*bd882c8aSJames Wright          const CeedScalar Uj = U[j];
*bd882c8aSJames Wright
*bd882c8aSJames Wright          for (CeedInt d0 = 0; d0 < out_dim; ++d0) {
*bd882c8aSJames Wright            V[d0] += G[j * stride_1 + d0 * g_dim_stride] * Uj;
*bd882c8aSJames Wright          }
*bd882c8aSJames Wright        }
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright      for (CeedInt d0 = 0; d0 < out_dim; ++d0) {
*bd882c8aSJames Wright        d_V[i + elem * v_stride + comp * v_comp_stride + d0 * v_dim_stride] = V[d0];
*bd882c8aSJames Wright      }
*bd882c8aSJames Wright    }
*bd882c8aSJames Wright  });
*bd882c8aSJames Wright  return CEED_ERROR_SUCCESS;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// Weight kernel - non-tensor
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightstatic int CeedBasisApplyNonTensorWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, const CeedBasisNonTensor_Sycl *impl, CeedScalar *d_V) {
*bd882c8aSJames Wright  const CeedInt     num_qpts = impl->num_qpts;
*bd882c8aSJames Wright  const CeedScalar *q_weight = impl->d_q_weight;
*bd882c8aSJames Wright
*bd882c8aSJames Wright  sycl::range<2> kernel_range(num_elem, num_qpts);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Order queue
*bd882c8aSJames Wright  sycl::event e = sycl_queue.ext_oneapi_submit_barrier();
*bd882c8aSJames Wright  sycl_queue.parallel_for<CeedBasisSyclWeightNT>(kernel_range, {e}, [=](sycl::id<2> indx) {
*bd882c8aSJames Wright    const CeedInt i          = indx[1];
*bd882c8aSJames Wright    const CeedInt elem       = indx[0];
*bd882c8aSJames Wright    d_V[i + elem * num_qpts] = q_weight[i];
*bd882c8aSJames Wright  });
*bd882c8aSJames Wright  return CEED_ERROR_SUCCESS;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// Basis apply - non-tensor
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightstatic int CeedBasisApplyNonTensor_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
*bd882c8aSJames Wright                                        CeedVector v) {
*bd882c8aSJames Wright  Ceed ceed;
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
*bd882c8aSJames Wright  CeedBasisNonTensor_Sycl *impl;
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisGetData(basis, &impl));
*bd882c8aSJames Wright  Ceed_Sycl *data;
*bd882c8aSJames Wright  CeedCallBackend(CeedGetData(ceed, &data));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  const CeedInt transpose = t_mode == CEED_TRANSPOSE;
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Read vectors
*bd882c8aSJames Wright  const CeedScalar *d_u;
*bd882c8aSJames Wright  CeedScalar       *d_v;
*bd882c8aSJames Wright  if (eval_mode != CEED_EVAL_WEIGHT) {
*bd882c8aSJames Wright    CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Clear v for transpose operation
*bd882c8aSJames Wright  if (transpose) {
*bd882c8aSJames Wright    CeedSize length;
*bd882c8aSJames Wright    CeedCallBackend(CeedVectorGetLength(v, &length));
*bd882c8aSJames Wright    // Order queue
*bd882c8aSJames Wright    sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
*bd882c8aSJames Wright    data->sycl_queue.fill<CeedScalar>(d_v, 0, length, {e});
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Apply basis operation
*bd882c8aSJames Wright  switch (eval_mode) {
*bd882c8aSJames Wright    case CEED_EVAL_INTERP: {
*bd882c8aSJames Wright      CeedCallBackend(CeedBasisApplyNonTensorInterp_Sycl(data->sycl_queue, num_elem, transpose, impl, d_u, d_v));
*bd882c8aSJames Wright    } break;
*bd882c8aSJames Wright    case CEED_EVAL_GRAD: {
*bd882c8aSJames Wright      CeedCallBackend(CeedBasisApplyNonTensorGrad_Sycl(data->sycl_queue, num_elem, transpose, impl, d_u, d_v));
*bd882c8aSJames Wright    } break;
*bd882c8aSJames Wright    case CEED_EVAL_WEIGHT: {
*bd882c8aSJames Wright      CeedCallBackend(CeedBasisApplyNonTensorWeight_Sycl(data->sycl_queue, num_elem, impl, d_v));
*bd882c8aSJames Wright    } break;
*bd882c8aSJames Wright    // LCOV_EXCL_START
*bd882c8aSJames Wright    // Evaluate the divergence to/from the quadrature points
*bd882c8aSJames Wright    case CEED_EVAL_DIV:
*bd882c8aSJames Wright      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
*bd882c8aSJames Wright    // Evaluate the curl to/from the quadrature points
*bd882c8aSJames Wright    case CEED_EVAL_CURL:
*bd882c8aSJames Wright      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
*bd882c8aSJames Wright    // Take no action, BasisApply should not have been called
*bd882c8aSJames Wright    case CEED_EVAL_NONE:
*bd882c8aSJames Wright      return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
*bd882c8aSJames Wright      // LCOV_EXCL_STOP
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Restore vectors
*bd882c8aSJames Wright  if (eval_mode != CEED_EVAL_WEIGHT) {
*bd882c8aSJames Wright    CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
*bd882c8aSJames Wright  }
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
*bd882c8aSJames Wright  return CEED_ERROR_SUCCESS;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// Destroy tensor basis
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightstatic int CeedBasisDestroy_Sycl(CeedBasis basis) {
*bd882c8aSJames Wright  Ceed ceed;
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
*bd882c8aSJames Wright  CeedBasis_Sycl *impl;
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisGetData(basis, &impl));
*bd882c8aSJames Wright  Ceed_Sycl *data;
*bd882c8aSJames Wright  CeedCallBackend(CeedGetData(ceed, &data));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Wait for all work to finish before freeing memory
*bd882c8aSJames Wright  CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
*bd882c8aSJames Wright  CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context));
*bd882c8aSJames Wright  CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallBackend(CeedFree(&impl));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  return CEED_ERROR_SUCCESS;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// Destroy non-tensor basis
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightstatic int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) {
*bd882c8aSJames Wright  Ceed ceed;
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
*bd882c8aSJames Wright  CeedBasisNonTensor_Sycl *impl;
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisGetData(basis, &impl));
*bd882c8aSJames Wright  Ceed_Sycl *data;
*bd882c8aSJames Wright  CeedCallBackend(CeedGetData(ceed, &data));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Wait for all work to finish before freeing memory
*bd882c8aSJames Wright  CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallSycl(ceed, sycl::free(impl->d_q_weight, data->sycl_context));
*bd882c8aSJames Wright  CeedCallSycl(ceed, sycl::free(impl->d_interp, data->sycl_context));
*bd882c8aSJames Wright  CeedCallSycl(ceed, sycl::free(impl->d_grad, data->sycl_context));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallBackend(CeedFree(&impl));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  return CEED_ERROR_SUCCESS;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// Create tensor
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightint CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
*bd882c8aSJames Wright                                 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
*bd882c8aSJames Wright  Ceed ceed;
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
*bd882c8aSJames Wright  CeedBasis_Sycl *impl;
*bd882c8aSJames Wright  CeedCallBackend(CeedCalloc(1, &impl));
*bd882c8aSJames Wright  Ceed_Sycl *data;
*bd882c8aSJames Wright  CeedCallBackend(CeedGetData(ceed, &data));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedInt num_comp;
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  const CeedInt num_nodes = CeedIntPow(P_1d, dim);
*bd882c8aSJames Wright  const CeedInt num_qpts  = CeedIntPow(Q_1d, dim);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  impl->dim       = dim;
*bd882c8aSJames Wright  impl->P_1d      = P_1d;
*bd882c8aSJames Wright  impl->Q_1d      = Q_1d;
*bd882c8aSJames Wright  impl->num_comp  = num_comp;
*bd882c8aSJames Wright  impl->num_nodes = num_nodes;
*bd882c8aSJames Wright  impl->num_qpts  = num_qpts;
*bd882c8aSJames Wright  impl->buf_len   = num_comp * CeedIntMax(num_nodes, num_qpts);
*bd882c8aSJames Wright  impl->op_len    = Q_1d * P_1d;
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Order queue
*bd882c8aSJames Wright  sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
*bd882c8aSJames Wright  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, {e});
*bd882c8aSJames Wright
*bd882c8aSJames Wright  const CeedInt interp_length = Q_1d * P_1d;
*bd882c8aSJames Wright  CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
*bd882c8aSJames Wright  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, {e});
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
*bd882c8aSJames Wright  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, {e});
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  std::vector<sycl::kernel_id> kernel_ids = {sycl::get_kernel_id<CeedBasisSyclInterp<1>>(), sycl::get_kernel_id<CeedBasisSyclInterp<0>>(),
*bd882c8aSJames Wright                                             sycl::get_kernel_id<CeedBasisSyclGrad<1>>(), sycl::get_kernel_id<CeedBasisSyclGrad<0>>()};
*bd882c8aSJames Wright
*bd882c8aSJames Wright  sycl::kernel_bundle<sycl::bundle_state::input> input_bundle = sycl::get_kernel_bundle<sycl::bundle_state::input>(data->sycl_context, kernel_ids);
*bd882c8aSJames Wright  input_bundle.set_specialization_constant<BASIS_DIM_ID>(dim);
*bd882c8aSJames Wright  input_bundle.set_specialization_constant<BASIS_NUM_COMP_ID>(num_comp);
*bd882c8aSJames Wright  input_bundle.set_specialization_constant<BASIS_Q_1D_ID>(Q_1d);
*bd882c8aSJames Wright  input_bundle.set_specialization_constant<BASIS_P_1D_ID>(P_1d);
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallSycl(ceed, impl->sycl_module = new SyclModule_t(sycl::build(input_bundle)));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisSetData(basis, impl));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Register backend functions
*bd882c8aSJames Wright  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApply_Sycl));
*bd882c8aSJames Wright  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl));
*bd882c8aSJames Wright  return CEED_ERROR_SUCCESS;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wright// Create non-tensor
*bd882c8aSJames Wright//------------------------------------------------------------------------------
*bd882c8aSJames Wrightint CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
*bd882c8aSJames Wright                           const CeedScalar *qref, const CeedScalar *q_weight, CeedBasis basis) {
*bd882c8aSJames Wright  Ceed ceed;
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
*bd882c8aSJames Wright  CeedBasisNonTensor_Sycl *impl;
*bd882c8aSJames Wright  CeedCallBackend(CeedCalloc(1, &impl));
*bd882c8aSJames Wright  Ceed_Sycl *data;
*bd882c8aSJames Wright  CeedCallBackend(CeedGetData(ceed, &data));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedInt num_comp;
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  impl->dim       = dim;
*bd882c8aSJames Wright  impl->num_comp  = num_comp;
*bd882c8aSJames Wright  impl->num_nodes = num_nodes;
*bd882c8aSJames Wright  impl->num_qpts  = num_qpts;
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Order queue
*bd882c8aSJames Wright  sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
*bd882c8aSJames Wright  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, {e});
*bd882c8aSJames Wright
*bd882c8aSJames Wright  const CeedInt interp_length = num_qpts * num_nodes;
*bd882c8aSJames Wright  CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
*bd882c8aSJames Wright  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, {e});
*bd882c8aSJames Wright
*bd882c8aSJames Wright  const CeedInt grad_length = num_qpts * num_nodes * dim;
*bd882c8aSJames Wright  CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context));
*bd882c8aSJames Wright  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, {e});
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  CeedCallBackend(CeedBasisSetData(basis, impl));
*bd882c8aSJames Wright
*bd882c8aSJames Wright  // Register backend functions
*bd882c8aSJames Wright  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Sycl));
*bd882c8aSJames Wright  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Sycl));
*bd882c8aSJames Wright  return CEED_ERROR_SUCCESS;
*bd882c8aSJames Wright}
*bd882c8aSJames Wright//------------------------------------------------------------------------------