1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed/backend.h> 9 #include <ceed/ceed.h> 10 #include <ceed/jit-tools.h> 11 12 #include <sycl/sycl.hpp> 13 #include <vector> 14 15 #include "../sycl/ceed-sycl-compile.hpp" 16 #include "ceed-sycl-ref.hpp" 17 18 template <int> 19 class CeedBasisSyclInterp; 20 template <int> 21 class CeedBasisSyclGrad; 22 class CeedBasisSyclWeight; 23 24 class CeedBasisSyclInterpNT; 25 class CeedBasisSyclGradNT; 26 class CeedBasisSyclWeightNT; 27 28 using SpecID = sycl::specialization_id<CeedInt>; 29 30 static constexpr SpecID BASIS_DIM_ID; 31 static constexpr SpecID BASIS_NUM_COMP_ID; 32 static constexpr SpecID BASIS_P_1D_ID; 33 static constexpr SpecID BASIS_Q_1D_ID; 34 35 //------------------------------------------------------------------------------ 36 // Interpolation kernel - tensor 37 //------------------------------------------------------------------------------ 38 template <int is_transpose> 39 static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, 40 const CeedScalar *u, CeedScalar *v) { 41 const CeedInt buf_len = impl->buf_len; 42 const CeedInt op_len = impl->op_len; 43 const CeedScalar *interp_1d = impl->d_interp_1d; 44 45 const sycl::device &sycl_device = sycl_queue.get_device(); 46 const CeedInt max_work_group_size = 32; 47 const CeedInt work_group_size = CeedIntMin(impl->num_qpts, max_work_group_size); 48 sycl::range<1> local_range(work_group_size); 49 sycl::range<1> global_range(num_elem * work_group_size); 50 sycl::nd_range<1> kernel_range(global_range, local_range); 51 52 std::vector<sycl::event> e; 53 54 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()}; 55 56 sycl_queue.submit([&](sycl::handler &cgh) { 57 cgh.depends_on(e); 58 cgh.use_kernel_bundle(sycl_module); 59 60 sycl::local_accessor<CeedScalar> s_mem(op_len + 2 * buf_len, cgh); 61 62 cgh.parallel_for<CeedBasisSyclInterp<is_transpose>>(kernel_range, [=](sycl::nd_item<1> work_item, sycl::kernel_handler kh) { 63 //--------------------------------------------------------------> 64 // Retrieve spec constant values 65 const CeedInt dim = kh.get_specialization_constant<BASIS_DIM_ID>(); 66 const CeedInt num_comp = kh.get_specialization_constant<BASIS_NUM_COMP_ID>(); 67 const CeedInt P_1d = kh.get_specialization_constant<BASIS_P_1D_ID>(); 68 const CeedInt Q_1d = kh.get_specialization_constant<BASIS_Q_1D_ID>(); 69 //--------------------------------------------------------------> 70 const CeedInt num_nodes = CeedIntPow(P_1d, dim); 71 const CeedInt num_qpts = CeedIntPow(Q_1d, dim); 72 const CeedInt P = is_transpose ? Q_1d : P_1d; 73 const CeedInt Q = is_transpose ? P_1d : Q_1d; 74 const CeedInt stride_0 = is_transpose ? 1 : P_1d; 75 const CeedInt stride_1 = is_transpose ? P_1d : 1; 76 const CeedInt u_stride = is_transpose ? num_qpts : num_nodes; 77 const CeedInt v_stride = is_transpose ? num_nodes : num_qpts; 78 const CeedInt u_comp_stride = num_elem * u_stride; 79 const CeedInt v_comp_stride = num_elem * v_stride; 80 const CeedInt u_size = u_stride; 81 82 sycl::group work_group = work_item.get_group(); 83 const CeedInt i = work_item.get_local_linear_id(); 84 const CeedInt group_size = work_group.get_local_linear_range(); 85 const CeedInt elem = work_group.get_group_linear_id(); 86 87 CeedScalar *s_interp_1d = s_mem.get_multi_ptr<sycl::access::decorated::yes>().get(); 88 CeedScalar *s_buffer_1 = s_interp_1d + Q * P; 89 CeedScalar *s_buffer_2 = s_buffer_1 + buf_len; 90 91 for (CeedInt k = i; k < P * Q; k += group_size) { 92 s_interp_1d[k] = interp_1d[k]; 93 } 94 95 // Apply basis element by element 96 for (CeedInt comp = 0; comp < num_comp; comp++) { 97 const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 98 CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 99 100 for (CeedInt k = i; k < u_size; k += group_size) { 101 s_buffer_1[k] = cur_u[k]; 102 } 103 104 CeedInt pre = u_size; 105 CeedInt post = 1; 106 107 for (CeedInt d = 0; d < dim; d++) { 108 // Use older version of sycl workgroup barrier for performance reasons 109 // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed 110 // sycl::group_barrier(work_group); 111 work_item.barrier(sycl::access::fence_space::local_space); 112 113 pre /= P; 114 const CeedScalar *in = d % 2 ? s_buffer_2 : s_buffer_1; 115 CeedScalar *out = d == dim - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 116 117 // Contract along middle index 118 const CeedInt writeLen = pre * post * Q; 119 for (CeedInt k = i; k < writeLen; k += group_size) { 120 const CeedInt c = k % post; 121 const CeedInt j = (k / post) % Q; 122 const CeedInt a = k / (post * Q); 123 124 CeedScalar vk = 0; 125 for (CeedInt b = 0; b < P; b++) { 126 vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; 127 } 128 out[k] = vk; 129 } 130 post *= Q; 131 } 132 } 133 }); 134 }); 135 return CEED_ERROR_SUCCESS; 136 } 137 138 //------------------------------------------------------------------------------ 139 // Gradient kernel - tensor 140 //------------------------------------------------------------------------------ 141 template <int is_transpose> 142 static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, 143 const CeedScalar *u, CeedScalar *v) { 144 const CeedInt buf_len = impl->buf_len; 145 const CeedInt op_len = impl->op_len; 146 const CeedScalar *interp_1d = impl->d_interp_1d; 147 const CeedScalar *grad_1d = impl->d_grad_1d; 148 149 const sycl::device &sycl_device = sycl_queue.get_device(); 150 const CeedInt work_group_size = 32; 151 sycl::range<1> local_range(work_group_size); 152 sycl::range<1> global_range(num_elem * work_group_size); 153 sycl::nd_range<1> kernel_range(global_range, local_range); 154 155 std::vector<sycl::event> e; 156 157 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()}; 158 159 sycl_queue.submit([&](sycl::handler &cgh) { 160 cgh.depends_on(e); 161 cgh.use_kernel_bundle(sycl_module); 162 163 sycl::local_accessor<CeedScalar> s_mem(2 * (op_len + buf_len), cgh); 164 165 cgh.parallel_for<CeedBasisSyclGrad<is_transpose>>(kernel_range, [=](sycl::nd_item<1> work_item, sycl::kernel_handler kh) { 166 //--------------------------------------------------------------> 167 // Retrieve spec constant values 168 const CeedInt dim = kh.get_specialization_constant<BASIS_DIM_ID>(); 169 const CeedInt num_comp = kh.get_specialization_constant<BASIS_NUM_COMP_ID>(); 170 const CeedInt P_1d = kh.get_specialization_constant<BASIS_P_1D_ID>(); 171 const CeedInt Q_1d = kh.get_specialization_constant<BASIS_Q_1D_ID>(); 172 //--------------------------------------------------------------> 173 const CeedInt num_nodes = CeedIntPow(P_1d, dim); 174 const CeedInt num_qpts = CeedIntPow(Q_1d, dim); 175 const CeedInt P = is_transpose ? Q_1d : P_1d; 176 const CeedInt Q = is_transpose ? P_1d : Q_1d; 177 const CeedInt stride_0 = is_transpose ? 1 : P_1d; 178 const CeedInt stride_1 = is_transpose ? P_1d : 1; 179 const CeedInt u_stride = is_transpose ? num_qpts : num_nodes; 180 const CeedInt v_stride = is_transpose ? num_nodes : num_qpts; 181 const CeedInt u_comp_stride = num_elem * u_stride; 182 const CeedInt v_comp_stride = num_elem * v_stride; 183 const CeedInt u_dim_stride = is_transpose ? num_elem * num_qpts * num_comp : 0; 184 const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * num_qpts * num_comp; 185 sycl::group work_group = work_item.get_group(); 186 const CeedInt i = work_item.get_local_linear_id(); 187 const CeedInt group_size = work_group.get_local_linear_range(); 188 const CeedInt elem = work_group.get_group_linear_id(); 189 190 CeedScalar *s_interp_1d = s_mem.get_multi_ptr<sycl::access::decorated::yes>().get(); 191 CeedScalar *s_grad_1d = s_interp_1d + P * Q; 192 CeedScalar *s_buffer_1 = s_grad_1d + P * Q; 193 CeedScalar *s_buffer_2 = s_buffer_1 + buf_len; 194 195 for (CeedInt k = i; k < P * Q; k += group_size) { 196 s_interp_1d[k] = interp_1d[k]; 197 s_grad_1d[k] = grad_1d[k]; 198 } 199 200 // Apply basis element by element 201 for (CeedInt comp = 0; comp < num_comp; comp++) { 202 for (CeedInt dim_1 = 0; dim_1 < dim; dim_1++) { 203 CeedInt pre = is_transpose ? num_qpts : num_nodes; 204 CeedInt post = 1; 205 const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; 206 CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; 207 208 for (CeedInt dim_2 = 0; dim_2 < dim; dim_2++) { 209 // Use older version of sycl workgroup barrier for performance reasons 210 // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed 211 // sycl::group_barrier(work_group); 212 work_item.barrier(sycl::access::fence_space::local_space); 213 214 pre /= P; 215 const CeedScalar *op = dim_1 == dim_2 ? s_grad_1d : s_interp_1d; 216 const CeedScalar *in = (dim_2 == 0 ? cur_u : (dim_2 % 2 ? s_buffer_2 : s_buffer_1)); 217 CeedScalar *out = dim_2 == dim - 1 ? cur_v : (dim_2 % 2 ? s_buffer_1 : s_buffer_2); 218 219 // Contract along middle index 220 const CeedInt writeLen = pre * post * Q; 221 for (CeedInt k = i; k < writeLen; k += group_size) { 222 const CeedInt c = k % post; 223 const CeedInt j = (k / post) % Q; 224 const CeedInt a = k / (post * Q); 225 226 CeedScalar v_k = 0; 227 for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; 228 229 if (is_transpose && dim_2 == dim - 1) out[k] += v_k; 230 else out[k] = v_k; 231 } 232 233 post *= Q; 234 } 235 } 236 } 237 }); 238 }); 239 return CEED_ERROR_SUCCESS; 240 } 241 242 //------------------------------------------------------------------------------ 243 // Weight kernel - tensor 244 //------------------------------------------------------------------------------ 245 static int CeedBasisApplyWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, const CeedBasis_Sycl *impl, CeedScalar *w) { 246 const CeedInt dim = impl->dim; 247 const CeedInt Q_1d = impl->Q_1d; 248 const CeedScalar *q_weight_1d = impl->d_q_weight_1d; 249 250 const CeedInt num_quad_x = Q_1d; 251 const CeedInt num_quad_y = (dim > 1) ? Q_1d : 1; 252 const CeedInt num_quad_z = (dim > 2) ? Q_1d : 1; 253 sycl::range<3> kernel_range(num_elem * num_quad_z, num_quad_y, num_quad_x); 254 255 std::vector<sycl::event> e; 256 257 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()}; 258 259 sycl_queue.parallel_for<CeedBasisSyclWeight>(kernel_range, e, [=](sycl::item<3> work_item) { 260 if (dim == 1) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]]; 261 if (dim == 2) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]]; 262 if (dim == 3) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]] * q_weight_1d[work_item[0] % Q_1d]; 263 }); 264 return CEED_ERROR_SUCCESS; 265 } 266 267 //------------------------------------------------------------------------------ 268 // Basis apply - tensor 269 //------------------------------------------------------------------------------ 270 static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 271 CeedVector v) { 272 Ceed ceed; 273 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 274 const CeedScalar *d_u; 275 CeedScalar *d_v; 276 Ceed_Sycl *data; 277 CeedBasis_Sycl *impl; 278 279 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 280 CeedCallBackend(CeedGetData(ceed, &data)); 281 CeedCallBackend(CeedBasisGetData(basis, &impl)); 282 283 // Get read/write access to u, v 284 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 285 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 286 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 287 288 // Clear v for transpose operation 289 if (is_transpose) { 290 CeedSize length; 291 CeedCallBackend(CeedVectorGetLength(v, &length)); 292 std::vector<sycl::event> e; 293 294 if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()}; 295 data->sycl_queue.fill<CeedScalar>(d_v, 0, length, e); 296 } 297 298 // Basis action 299 switch (eval_mode) { 300 case CEED_EVAL_INTERP: 301 if (is_transpose) { 302 CeedCallBackend(CeedBasisApplyInterp_Sycl<true>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); 303 } else { 304 CeedCallBackend(CeedBasisApplyInterp_Sycl<false>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); 305 } 306 break; 307 case CEED_EVAL_GRAD: 308 if (is_transpose) { 309 CeedCallBackend(CeedBasisApplyGrad_Sycl<true>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); 310 } else { 311 CeedCallBackend(CeedBasisApplyGrad_Sycl<false>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); 312 } 313 break; 314 case CEED_EVAL_WEIGHT: 315 CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[eval_mode]); 316 CeedCallBackend(CeedBasisApplyWeight_Sycl(data->sycl_queue, num_elem, impl, d_v)); 317 break; 318 case CEED_EVAL_NONE: /* handled separately below */ 319 break; 320 // LCOV_EXCL_START 321 case CEED_EVAL_DIV: 322 case CEED_EVAL_CURL: 323 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 324 // LCOV_EXCL_STOP 325 } 326 327 // Restore vectors, cover CEED_EVAL_NONE 328 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 329 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 330 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 331 CeedCallBackend(CeedDestroy(&ceed)); 332 return CEED_ERROR_SUCCESS; 333 } 334 335 //------------------------------------------------------------------------------ 336 // Interpolation kernel - non-tensor 337 //------------------------------------------------------------------------------ 338 static int CeedBasisApplyNonTensorInterp_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, CeedInt is_transpose, const CeedBasisNonTensor_Sycl *impl, 339 const CeedScalar *d_U, CeedScalar *d_V) { 340 const CeedInt num_comp = impl->num_comp; 341 const CeedInt P = is_transpose ? impl->num_qpts : impl->num_nodes; 342 const CeedInt Q = is_transpose ? impl->num_nodes : impl->num_qpts; 343 const CeedInt stride_0 = is_transpose ? 1 : impl->num_nodes; 344 const CeedInt stride_1 = is_transpose ? impl->num_nodes : 1; 345 const CeedInt u_stride = P; 346 const CeedInt v_stride = Q; 347 const CeedInt u_comp_stride = u_stride * num_elem; 348 const CeedInt v_comp_stride = v_stride * num_elem; 349 const CeedInt u_size = P; 350 const CeedInt v_size = Q; 351 const CeedScalar *d_B = impl->d_interp; 352 353 sycl::range<2> kernel_range(num_elem, v_size); 354 355 std::vector<sycl::event> e; 356 357 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()}; 358 359 sycl_queue.parallel_for<CeedBasisSyclInterpNT>(kernel_range, e, [=](sycl::id<2> indx) { 360 const CeedInt i = indx[1]; 361 const CeedInt elem = indx[0]; 362 363 for (CeedInt comp = 0; comp < num_comp; comp++) { 364 const CeedScalar *U = d_U + elem * u_stride + comp * u_comp_stride; 365 CeedScalar V = 0.0; 366 367 for (CeedInt j = 0; j < u_size; ++j) { 368 V += d_B[i * stride_0 + j * stride_1] * U[j]; 369 } 370 d_V[i + elem * v_stride + comp * v_comp_stride] = V; 371 } 372 }); 373 return CEED_ERROR_SUCCESS; 374 } 375 376 //------------------------------------------------------------------------------ 377 // Gradient kernel - non-tensor 378 //------------------------------------------------------------------------------ 379 static int CeedBasisApplyNonTensorGrad_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, CeedInt is_transpose, const CeedBasisNonTensor_Sycl *impl, 380 const CeedScalar *d_U, CeedScalar *d_V) { 381 const CeedInt num_comp = impl->num_comp; 382 const CeedInt P = is_transpose ? impl->num_qpts : impl->num_nodes; 383 const CeedInt Q = is_transpose ? impl->num_nodes : impl->num_qpts; 384 const CeedInt stride_0 = is_transpose ? 1 : impl->num_nodes; 385 const CeedInt stride_1 = is_transpose ? impl->num_nodes : 1; 386 const CeedInt g_dim_stride = P * Q; 387 const CeedInt u_stride = P; 388 const CeedInt v_stride = Q; 389 const CeedInt u_comp_stride = u_stride * num_elem; 390 const CeedInt v_comp_stride = v_stride * num_elem; 391 const CeedInt u_dim_stride = u_comp_stride * num_comp; 392 const CeedInt v_dim_stride = v_comp_stride * num_comp; 393 const CeedInt u_size = P; 394 const CeedInt v_size = Q; 395 const CeedInt in_dim = is_transpose ? impl->dim : 1; 396 const CeedInt out_dim = is_transpose ? 1 : impl->dim; 397 const CeedScalar *d_G = impl->d_grad; 398 399 sycl::range<2> kernel_range(num_elem, v_size); 400 401 std::vector<sycl::event> e; 402 403 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()}; 404 405 sycl_queue.parallel_for<CeedBasisSyclGradNT>(kernel_range, e, [=](sycl::id<2> indx) { 406 const CeedInt i = indx[1]; 407 const CeedInt elem = indx[0]; 408 409 for (CeedInt comp = 0; comp < num_comp; comp++) { 410 CeedScalar V[3] = {0.0, 0.0, 0.0}; 411 412 for (CeedInt d1 = 0; d1 < in_dim; ++d1) { 413 const CeedScalar *U = d_U + elem * u_stride + comp * u_comp_stride + d1 * u_dim_stride; 414 const CeedScalar *G = d_G + i * stride_0 + d1 * g_dim_stride; 415 416 for (CeedInt j = 0; j < u_size; ++j) { 417 const CeedScalar Uj = U[j]; 418 419 for (CeedInt d0 = 0; d0 < out_dim; ++d0) { 420 V[d0] += G[j * stride_1 + d0 * g_dim_stride] * Uj; 421 } 422 } 423 } 424 for (CeedInt d0 = 0; d0 < out_dim; ++d0) { 425 d_V[i + elem * v_stride + comp * v_comp_stride + d0 * v_dim_stride] = V[d0]; 426 } 427 } 428 }); 429 return CEED_ERROR_SUCCESS; 430 } 431 432 //------------------------------------------------------------------------------ 433 // Weight kernel - non-tensor 434 //------------------------------------------------------------------------------ 435 static int CeedBasisApplyNonTensorWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, const CeedBasisNonTensor_Sycl *impl, CeedScalar *d_V) { 436 const CeedInt num_qpts = impl->num_qpts; 437 const CeedScalar *q_weight = impl->d_q_weight; 438 439 sycl::range<2> kernel_range(num_elem, num_qpts); 440 441 std::vector<sycl::event> e; 442 443 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()}; 444 445 sycl_queue.parallel_for<CeedBasisSyclWeightNT>(kernel_range, e, [=](sycl::id<2> indx) { 446 const CeedInt i = indx[1]; 447 const CeedInt elem = indx[0]; 448 d_V[i + elem * num_qpts] = q_weight[i]; 449 }); 450 return CEED_ERROR_SUCCESS; 451 } 452 453 //------------------------------------------------------------------------------ 454 // Basis apply - non-tensor 455 //------------------------------------------------------------------------------ 456 static int CeedBasisApplyNonTensor_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 457 CeedVector v) { 458 Ceed ceed; 459 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 460 const CeedScalar *d_u; 461 CeedScalar *d_v; 462 CeedBasisNonTensor_Sycl *impl; 463 Ceed_Sycl *data; 464 465 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 466 CeedCallBackend(CeedBasisGetData(basis, &impl)); 467 CeedCallBackend(CeedGetData(ceed, &data)); 468 469 // Get read/write access to u, v 470 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 471 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 472 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 473 474 // Clear v for transpose operation 475 if (is_transpose) { 476 CeedSize length; 477 CeedCallBackend(CeedVectorGetLength(v, &length)); 478 // Order queue 479 sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier(); 480 data->sycl_queue.fill<CeedScalar>(d_v, 0, length, {e}); 481 } 482 483 // Apply basis operation 484 switch (eval_mode) { 485 case CEED_EVAL_INTERP: 486 CeedCallBackend(CeedBasisApplyNonTensorInterp_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v)); 487 break; 488 case CEED_EVAL_GRAD: 489 CeedCallBackend(CeedBasisApplyNonTensorGrad_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v)); 490 break; 491 case CEED_EVAL_WEIGHT: 492 CeedCheck(impl->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 493 CeedCallBackend(CeedBasisApplyNonTensorWeight_Sycl(data->sycl_queue, num_elem, impl, d_v)); 494 break; 495 case CEED_EVAL_NONE: /* handled separately below */ 496 break; 497 // LCOV_EXCL_START 498 case CEED_EVAL_DIV: 499 case CEED_EVAL_CURL: 500 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 501 // LCOV_EXCL_STOP 502 } 503 504 // Restore vectors, cover CEED_EVAL_NONE 505 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 506 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 507 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 508 CeedCallBackend(CeedDestroy(&ceed)); 509 return CEED_ERROR_SUCCESS; 510 } 511 512 //------------------------------------------------------------------------------ 513 // Destroy tensor basis 514 //------------------------------------------------------------------------------ 515 static int CeedBasisDestroy_Sycl(CeedBasis basis) { 516 Ceed ceed; 517 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 518 CeedBasis_Sycl *impl; 519 CeedCallBackend(CeedBasisGetData(basis, &impl)); 520 Ceed_Sycl *data; 521 CeedCallBackend(CeedGetData(ceed, &data)); 522 523 // Wait for all work to finish before freeing memory 524 CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); 525 526 if (impl->d_q_weight_1d) CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context)); 527 CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context)); 528 CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context)); 529 530 CeedCallBackend(CeedFree(&impl)); 531 CeedCallBackend(CeedDestroy(&ceed)); 532 return CEED_ERROR_SUCCESS; 533 } 534 535 //------------------------------------------------------------------------------ 536 // Destroy non-tensor basis 537 //------------------------------------------------------------------------------ 538 static int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) { 539 Ceed ceed; 540 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 541 CeedBasisNonTensor_Sycl *impl; 542 CeedCallBackend(CeedBasisGetData(basis, &impl)); 543 Ceed_Sycl *data; 544 CeedCallBackend(CeedGetData(ceed, &data)); 545 546 // Wait for all work to finish before freeing memory 547 CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); 548 549 if (impl->d_q_weight) CeedCallSycl(ceed, sycl::free(impl->d_q_weight, data->sycl_context)); 550 CeedCallSycl(ceed, sycl::free(impl->d_interp, data->sycl_context)); 551 CeedCallSycl(ceed, sycl::free(impl->d_grad, data->sycl_context)); 552 553 CeedCallBackend(CeedFree(&impl)); 554 CeedCallBackend(CeedDestroy(&ceed)); 555 return CEED_ERROR_SUCCESS; 556 } 557 558 //------------------------------------------------------------------------------ 559 // Create tensor 560 //------------------------------------------------------------------------------ 561 int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 562 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 563 Ceed ceed; 564 CeedBasis_Sycl *impl; 565 Ceed_Sycl *data; 566 567 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 568 CeedCallBackend(CeedCalloc(1, &impl)); 569 CeedCallBackend(CeedGetData(ceed, &data)); 570 571 CeedInt num_comp; 572 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 573 574 const CeedInt num_nodes = CeedIntPow(P_1d, dim); 575 const CeedInt num_qpts = CeedIntPow(Q_1d, dim); 576 577 impl->dim = dim; 578 impl->P_1d = P_1d; 579 impl->Q_1d = Q_1d; 580 impl->num_comp = num_comp; 581 impl->num_nodes = num_nodes; 582 impl->num_qpts = num_qpts; 583 impl->buf_len = num_comp * CeedIntMax(num_nodes, num_qpts); 584 impl->op_len = Q_1d * P_1d; 585 586 std::vector<sycl::event> e; 587 588 if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()}; 589 590 std::vector<sycl::event> copy_events; 591 if (q_weight_1d) { 592 CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context)); 593 sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e); 594 copy_events.push_back(copy_weight); 595 } 596 597 const CeedInt interp_length = Q_1d * P_1d; 598 CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context)); 599 sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, e); 600 copy_events.push_back(copy_interp); 601 602 CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context)); 603 sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, e); 604 copy_events.push_back(copy_grad); 605 606 CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events)); 607 608 std::vector<sycl::kernel_id> kernel_ids = {sycl::get_kernel_id<CeedBasisSyclInterp<1>>(), sycl::get_kernel_id<CeedBasisSyclInterp<0>>(), 609 sycl::get_kernel_id<CeedBasisSyclGrad<1>>(), sycl::get_kernel_id<CeedBasisSyclGrad<0>>()}; 610 611 sycl::kernel_bundle<sycl::bundle_state::input> input_bundle = sycl::get_kernel_bundle<sycl::bundle_state::input>(data->sycl_context, kernel_ids); 612 input_bundle.set_specialization_constant<BASIS_DIM_ID>(dim); 613 input_bundle.set_specialization_constant<BASIS_NUM_COMP_ID>(num_comp); 614 input_bundle.set_specialization_constant<BASIS_Q_1D_ID>(Q_1d); 615 input_bundle.set_specialization_constant<BASIS_P_1D_ID>(P_1d); 616 617 CeedCallSycl(ceed, impl->sycl_module = new SyclModule_t(sycl::build(input_bundle))); 618 619 CeedCallBackend(CeedBasisSetData(basis, impl)); 620 621 // Register backend functions 622 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApply_Sycl)); 623 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl)); 624 CeedCallBackend(CeedDestroy(&ceed)); 625 return CEED_ERROR_SUCCESS; 626 } 627 628 //------------------------------------------------------------------------------ 629 // Create non-tensor 630 //------------------------------------------------------------------------------ 631 int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, 632 const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 633 Ceed ceed; 634 CeedBasisNonTensor_Sycl *impl; 635 Ceed_Sycl *data; 636 637 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 638 CeedCallBackend(CeedCalloc(1, &impl)); 639 CeedCallBackend(CeedGetData(ceed, &data)); 640 641 CeedInt num_comp; 642 CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 643 644 impl->dim = dim; 645 impl->num_comp = num_comp; 646 impl->num_nodes = num_nodes; 647 impl->num_qpts = num_qpts; 648 649 std::vector<sycl::event> e; 650 651 if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()}; 652 653 std::vector<sycl::event> copy_events; 654 if (q_weight) { 655 CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context)); 656 sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, e); 657 copy_events.push_back(copy_weight); 658 } 659 660 const CeedInt interp_length = num_qpts * num_nodes; 661 CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context)); 662 sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, e); 663 copy_events.push_back(copy_interp); 664 665 const CeedInt grad_length = num_qpts * num_nodes * dim; 666 CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context)); 667 sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, e); 668 copy_events.push_back(copy_grad); 669 670 CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events)); 671 672 CeedCallBackend(CeedBasisSetData(basis, impl)); 673 674 // Register backend functions 675 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Sycl)); 676 CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Sycl)); 677 CeedCallBackend(CeedDestroy(&ceed)); 678 return CEED_ERROR_SUCCESS; 679 } 680 681 //------------------------------------------------------------------------------ 682