1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed/backend.h> 9 #include <ceed/ceed.h> 10 #include <ceed/jit-tools.h> 11 12 #include <sycl/sycl.hpp> 13 #include <vector> 14 15 #include "../sycl/ceed-sycl-compile.hpp" 16 #include "ceed-sycl-ref.hpp" 17 18 template <int> 19 class CeedBasisSyclInterp; 20 template <int> 21 class CeedBasisSyclGrad; 22 class CeedBasisSyclWeight; 23 24 class CeedBasisSyclInterpNT; 25 class CeedBasisSyclGradNT; 26 class CeedBasisSyclWeightNT; 27 28 using SpecID = sycl::specialization_id<CeedInt>; 29 30 static constexpr SpecID BASIS_DIM_ID; 31 static constexpr SpecID BASIS_NUM_COMP_ID; 32 static constexpr SpecID BASIS_P_1D_ID; 33 static constexpr SpecID BASIS_Q_1D_ID; 34 35 //------------------------------------------------------------------------------ 36 // Interpolation kernel - tensor 37 //------------------------------------------------------------------------------ 38 template <int is_transpose> 39 static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, 40 const CeedScalar *u, CeedScalar *v) { 41 const CeedInt buf_len = impl->buf_len; 42 const CeedInt op_len = impl->op_len; 43 const CeedScalar *interp_1d = impl->d_interp_1d; 44 45 const sycl::device &sycl_device = sycl_queue.get_device(); 46 const CeedInt max_work_group_size = 32; 47 const CeedInt work_group_size = CeedIntMin(impl->num_qpts, max_work_group_size); 48 sycl::range<1> local_range(work_group_size); 49 sycl::range<1> global_range(num_elem * work_group_size); 50 sycl::nd_range<1> kernel_range(global_range, local_range); 51 52 // Order queue 53 sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); 54 sycl_queue.submit([&](sycl::handler 
&cgh) { 55 cgh.depends_on({e}); 56 cgh.use_kernel_bundle(sycl_module); 57 58 sycl::local_accessor<CeedScalar> s_mem(op_len + 2 * buf_len, cgh); 59 60 cgh.parallel_for<CeedBasisSyclInterp<is_transpose>>(kernel_range, [=](sycl::nd_item<1> work_item, sycl::kernel_handler kh) { 61 //--------------------------------------------------------------> 62 // Retrieve spec constant values 63 const CeedInt dim = kh.get_specialization_constant<BASIS_DIM_ID>(); 64 const CeedInt num_comp = kh.get_specialization_constant<BASIS_NUM_COMP_ID>(); 65 const CeedInt P_1d = kh.get_specialization_constant<BASIS_P_1D_ID>(); 66 const CeedInt Q_1d = kh.get_specialization_constant<BASIS_Q_1D_ID>(); 67 //--------------------------------------------------------------> 68 const CeedInt num_nodes = CeedIntPow(P_1d, dim); 69 const CeedInt num_qpts = CeedIntPow(Q_1d, dim); 70 const CeedInt P = is_transpose ? Q_1d : P_1d; 71 const CeedInt Q = is_transpose ? P_1d : Q_1d; 72 const CeedInt stride_0 = is_transpose ? 1 : P_1d; 73 const CeedInt stride_1 = is_transpose ? P_1d : 1; 74 const CeedInt u_stride = is_transpose ? num_qpts : num_nodes; 75 const CeedInt v_stride = is_transpose ? 
num_nodes : num_qpts; 76 const CeedInt u_comp_stride = num_elem * u_stride; 77 const CeedInt v_comp_stride = num_elem * v_stride; 78 const CeedInt u_size = u_stride; 79 80 sycl::group work_group = work_item.get_group(); 81 const CeedInt i = work_item.get_local_linear_id(); 82 const CeedInt group_size = work_group.get_local_linear_range(); 83 const CeedInt elem = work_group.get_group_linear_id(); 84 85 CeedScalar *s_interp_1d = s_mem.get_multi_ptr<sycl::access::decorated::yes>().get(); 86 CeedScalar *s_buffer_1 = s_interp_1d + Q * P; 87 CeedScalar *s_buffer_2 = s_buffer_1 + buf_len; 88 89 for (CeedInt k = i; k < P * Q; k += group_size) { 90 s_interp_1d[k] = interp_1d[k]; 91 } 92 93 // Apply basis element by element 94 for (CeedInt comp = 0; comp < num_comp; comp++) { 95 const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 96 CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 97 98 for (CeedInt k = i; k < u_size; k += group_size) { 99 s_buffer_1[k] = cur_u[k]; 100 } 101 102 CeedInt pre = u_size; 103 CeedInt post = 1; 104 105 for (CeedInt d = 0; d < dim; d++) { 106 // Use older version of sycl workgroup barrier for performance reasons 107 // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed 108 // sycl::group_barrier(work_group); 109 work_item.barrier(sycl::access::fence_space::local_space); 110 111 pre /= P; 112 const CeedScalar *in = d % 2 ? s_buffer_2 : s_buffer_1; 113 CeedScalar *out = d == dim - 1 ? cur_v : (d % 2 ? 
s_buffer_1 : s_buffer_2); 114 115 // Contract along middle index 116 const CeedInt writeLen = pre * post * Q; 117 for (CeedInt k = i; k < writeLen; k += group_size) { 118 const CeedInt c = k % post; 119 const CeedInt j = (k / post) % Q; 120 const CeedInt a = k / (post * Q); 121 122 CeedScalar vk = 0; 123 for (CeedInt b = 0; b < P; b++) { 124 vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; 125 } 126 out[k] = vk; 127 } 128 post *= Q; 129 } 130 } 131 }); 132 }); 133 return CEED_ERROR_SUCCESS; 134 } 135 136 //------------------------------------------------------------------------------ 137 // Gradient kernel - tensor 138 //------------------------------------------------------------------------------ 139 template <int is_transpose> 140 static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl, 141 const CeedScalar *u, CeedScalar *v) { 142 const CeedInt buf_len = impl->buf_len; 143 const CeedInt op_len = impl->op_len; 144 const CeedScalar *interp_1d = impl->d_interp_1d; 145 const CeedScalar *grad_1d = impl->d_grad_1d; 146 147 const sycl::device &sycl_device = sycl_queue.get_device(); 148 const CeedInt work_group_size = 32; 149 sycl::range<1> local_range(work_group_size); 150 sycl::range<1> global_range(num_elem * work_group_size); 151 sycl::nd_range<1> kernel_range(global_range, local_range); 152 153 // Order queue 154 sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); 155 sycl_queue.submit([&](sycl::handler &cgh) { 156 cgh.depends_on({e}); 157 cgh.use_kernel_bundle(sycl_module); 158 159 sycl::local_accessor<CeedScalar> s_mem(2 * (op_len + buf_len), cgh); 160 161 cgh.parallel_for<CeedBasisSyclGrad<is_transpose>>(kernel_range, [=](sycl::nd_item<1> work_item, sycl::kernel_handler kh) { 162 //--------------------------------------------------------------> 163 // Retrieve spec constant values 164 const CeedInt dim = 
kh.get_specialization_constant<BASIS_DIM_ID>(); 165 const CeedInt num_comp = kh.get_specialization_constant<BASIS_NUM_COMP_ID>(); 166 const CeedInt P_1d = kh.get_specialization_constant<BASIS_P_1D_ID>(); 167 const CeedInt Q_1d = kh.get_specialization_constant<BASIS_Q_1D_ID>(); 168 //--------------------------------------------------------------> 169 const CeedInt num_nodes = CeedIntPow(P_1d, dim); 170 const CeedInt num_qpts = CeedIntPow(Q_1d, dim); 171 const CeedInt P = is_transpose ? Q_1d : P_1d; 172 const CeedInt Q = is_transpose ? P_1d : Q_1d; 173 const CeedInt stride_0 = is_transpose ? 1 : P_1d; 174 const CeedInt stride_1 = is_transpose ? P_1d : 1; 175 const CeedInt u_stride = is_transpose ? num_qpts : num_nodes; 176 const CeedInt v_stride = is_transpose ? num_nodes : num_qpts; 177 const CeedInt u_comp_stride = num_elem * u_stride; 178 const CeedInt v_comp_stride = num_elem * v_stride; 179 const CeedInt u_dim_stride = is_transpose ? num_elem * num_qpts * num_comp : 0; 180 const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * num_qpts * num_comp; 181 sycl::group work_group = work_item.get_group(); 182 const CeedInt i = work_item.get_local_linear_id(); 183 const CeedInt group_size = work_group.get_local_linear_range(); 184 const CeedInt elem = work_group.get_group_linear_id(); 185 186 CeedScalar *s_interp_1d = s_mem.get_multi_ptr<sycl::access::decorated::yes>().get(); 187 CeedScalar *s_grad_1d = s_interp_1d + P * Q; 188 CeedScalar *s_buffer_1 = s_grad_1d + P * Q; 189 CeedScalar *s_buffer_2 = s_buffer_1 + buf_len; 190 191 for (CeedInt k = i; k < P * Q; k += group_size) { 192 s_interp_1d[k] = interp_1d[k]; 193 s_grad_1d[k] = grad_1d[k]; 194 } 195 196 // Apply basis element by element 197 for (CeedInt comp = 0; comp < num_comp; comp++) { 198 for (CeedInt dim_1 = 0; dim_1 < dim; dim_1++) { 199 CeedInt pre = is_transpose ? 
num_qpts : num_nodes; 200 CeedInt post = 1; 201 const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; 202 CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; 203 204 for (CeedInt dim_2 = 0; dim_2 < dim; dim_2++) { 205 // Use older version of sycl workgroup barrier for performance reasons 206 // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed 207 // sycl::group_barrier(work_group); 208 work_item.barrier(sycl::access::fence_space::local_space); 209 210 pre /= P; 211 const CeedScalar *op = dim_1 == dim_2 ? s_grad_1d : s_interp_1d; 212 const CeedScalar *in = (dim_2 == 0 ? cur_u : (dim_2 % 2 ? s_buffer_2 : s_buffer_1)); 213 CeedScalar *out = dim_2 == dim - 1 ? cur_v : (dim_2 % 2 ? s_buffer_1 : s_buffer_2); 214 215 // Contract along middle index 216 const CeedInt writeLen = pre * post * Q; 217 for (CeedInt k = i; k < writeLen; k += group_size) { 218 const CeedInt c = k % post; 219 const CeedInt j = (k / post) % Q; 220 const CeedInt a = k / (post * Q); 221 222 CeedScalar v_k = 0; 223 for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; 224 225 if (is_transpose && dim_2 == dim - 1) out[k] += v_k; 226 else out[k] = v_k; 227 } 228 229 post *= Q; 230 } 231 } 232 } 233 }); 234 }); 235 return CEED_ERROR_SUCCESS; 236 } 237 238 //------------------------------------------------------------------------------ 239 // Weight kernel - tensor 240 //------------------------------------------------------------------------------ 241 static int CeedBasisApplyWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, const CeedBasis_Sycl *impl, CeedScalar *w) { 242 const CeedInt dim = impl->dim; 243 const CeedInt Q_1d = impl->Q_1d; 244 const CeedScalar *q_weight_1d = impl->d_q_weight_1d; 245 246 const CeedInt num_quad_x = Q_1d; 247 const CeedInt num_quad_y = (dim > 1) ? Q_1d : 1; 248 const CeedInt num_quad_z = (dim > 2) ? 
Q_1d : 1; 249 sycl::range<3> kernel_range(num_elem * num_quad_z, num_quad_y, num_quad_x); 250 251 // Order queue 252 sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); 253 sycl_queue.parallel_for<CeedBasisSyclWeight>(kernel_range, {e}, [=](sycl::item<3> work_item) { 254 if (dim == 1) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]]; 255 if (dim == 2) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]]; 256 if (dim == 3) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]] * q_weight_1d[work_item[0] % Q_1d]; 257 }); 258 return CEED_ERROR_SUCCESS; 259 } 260 261 //------------------------------------------------------------------------------ 262 // Basis apply - tensor 263 //------------------------------------------------------------------------------ 264 static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 265 CeedVector v) { 266 Ceed ceed; 267 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 268 const CeedScalar *d_u; 269 CeedScalar *d_v; 270 Ceed_Sycl *data; 271 CeedBasis_Sycl *impl; 272 273 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 274 CeedCallBackend(CeedGetData(ceed, &data)); 275 CeedCallBackend(CeedBasisGetData(basis, &impl)); 276 277 // Get read/write access to u, v 278 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 279 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 280 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 281 282 // Clear v for transpose operation 283 if (is_transpose) { 284 CeedSize length; 285 CeedCallBackend(CeedVectorGetLength(v, &length)); 286 // Order queue 287 sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier(); 288 data->sycl_queue.fill<CeedScalar>(d_v, 0, length, {e}); 289 } 290 291 // Basis action 292 switch 
(eval_mode) { 293 case CEED_EVAL_INTERP: { 294 if (is_transpose) { 295 CeedCallBackend(CeedBasisApplyInterp_Sycl<CEED_TRANSPOSE>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); 296 } else { 297 CeedCallBackend(CeedBasisApplyInterp_Sycl<CEED_NOTRANSPOSE>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); 298 } 299 } break; 300 case CEED_EVAL_GRAD: { 301 if (is_transpose) { 302 CeedCallBackend(CeedBasisApplyGrad_Sycl<1>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); 303 } else { 304 CeedCallBackend(CeedBasisApplyGrad_Sycl<0>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v)); 305 } 306 } break; 307 case CEED_EVAL_WEIGHT: { 308 CeedCallBackend(CeedBasisApplyWeight_Sycl(data->sycl_queue, num_elem, impl, d_v)); 309 } break; 310 case CEED_EVAL_NONE: /* handled separately below */ 311 break; 312 // LCOV_EXCL_START 313 case CEED_EVAL_DIV: 314 case CEED_EVAL_CURL: 315 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 316 // LCOV_EXCL_STOP 317 } 318 319 // Restore vectors, cover CEED_EVAL_NONE 320 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 321 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 322 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 323 return CEED_ERROR_SUCCESS; 324 } 325 326 //------------------------------------------------------------------------------ 327 // Interpolation kernel - non-tensor 328 //------------------------------------------------------------------------------ 329 static int CeedBasisApplyNonTensorInterp_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, CeedInt is_transpose, const CeedBasisNonTensor_Sycl *impl, 330 const CeedScalar *d_U, CeedScalar *d_V) { 331 const CeedInt num_comp = impl->num_comp; 332 const CeedInt P = is_transpose ? impl->num_qpts : impl->num_nodes; 333 const CeedInt Q = is_transpose ? 
impl->num_nodes : impl->num_qpts; 334 const CeedInt stride_0 = is_transpose ? 1 : impl->num_nodes; 335 const CeedInt stride_1 = is_transpose ? impl->num_nodes : 1; 336 const CeedInt u_stride = P; 337 const CeedInt v_stride = Q; 338 const CeedInt u_comp_stride = u_stride * num_elem; 339 const CeedInt v_comp_stride = v_stride * num_elem; 340 const CeedInt u_size = P; 341 const CeedInt v_size = Q; 342 const CeedScalar *d_B = impl->d_interp; 343 344 sycl::range<2> kernel_range(num_elem, v_size); 345 346 // Order queue 347 sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); 348 sycl_queue.parallel_for<CeedBasisSyclInterpNT>(kernel_range, {e}, [=](sycl::id<2> indx) { 349 const CeedInt i = indx[1]; 350 const CeedInt elem = indx[0]; 351 352 for (CeedInt comp = 0; comp < num_comp; comp++) { 353 const CeedScalar *U = d_U + elem * u_stride + comp * u_comp_stride; 354 CeedScalar V = 0.0; 355 356 for (CeedInt j = 0; j < u_size; ++j) { 357 V += d_B[i * stride_0 + j * stride_1] * U[j]; 358 } 359 d_V[i + elem * v_stride + comp * v_comp_stride] = V; 360 } 361 }); 362 return CEED_ERROR_SUCCESS; 363 } 364 365 //------------------------------------------------------------------------------ 366 // Gradient kernel - non-tensor 367 //------------------------------------------------------------------------------ 368 static int CeedBasisApplyNonTensorGrad_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, CeedInt is_transpose, const CeedBasisNonTensor_Sycl *impl, 369 const CeedScalar *d_U, CeedScalar *d_V) { 370 const CeedInt num_comp = impl->num_comp; 371 const CeedInt P = is_transpose ? impl->num_qpts : impl->num_nodes; 372 const CeedInt Q = is_transpose ? impl->num_nodes : impl->num_qpts; 373 const CeedInt stride_0 = is_transpose ? 1 : impl->num_nodes; 374 const CeedInt stride_1 = is_transpose ? 
impl->num_nodes : 1; 375 const CeedInt g_dim_stride = P * Q; 376 const CeedInt u_stride = P; 377 const CeedInt v_stride = Q; 378 const CeedInt u_comp_stride = u_stride * num_elem; 379 const CeedInt v_comp_stride = v_stride * num_elem; 380 const CeedInt u_dim_stride = u_comp_stride * num_comp; 381 const CeedInt v_dim_stride = v_comp_stride * num_comp; 382 const CeedInt u_size = P; 383 const CeedInt v_size = Q; 384 const CeedInt in_dim = is_transpose ? impl->dim : 1; 385 const CeedInt out_dim = is_transpose ? 1 : impl->dim; 386 const CeedScalar *d_G = impl->d_grad; 387 388 sycl::range<2> kernel_range(num_elem, v_size); 389 390 // Order queue 391 sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); 392 sycl_queue.parallel_for<CeedBasisSyclGradNT>(kernel_range, {e}, [=](sycl::id<2> indx) { 393 const CeedInt i = indx[1]; 394 const CeedInt elem = indx[0]; 395 396 for (CeedInt comp = 0; comp < num_comp; comp++) { 397 CeedScalar V[3] = {0.0, 0.0, 0.0}; 398 399 for (CeedInt d1 = 0; d1 < in_dim; ++d1) { 400 const CeedScalar *U = d_U + elem * u_stride + comp * u_comp_stride + d1 * u_dim_stride; 401 const CeedScalar *G = d_G + i * stride_0 + d1 * g_dim_stride; 402 403 for (CeedInt j = 0; j < u_size; ++j) { 404 const CeedScalar Uj = U[j]; 405 406 for (CeedInt d0 = 0; d0 < out_dim; ++d0) { 407 V[d0] += G[j * stride_1 + d0 * g_dim_stride] * Uj; 408 } 409 } 410 } 411 for (CeedInt d0 = 0; d0 < out_dim; ++d0) { 412 d_V[i + elem * v_stride + comp * v_comp_stride + d0 * v_dim_stride] = V[d0]; 413 } 414 } 415 }); 416 return CEED_ERROR_SUCCESS; 417 } 418 419 //------------------------------------------------------------------------------ 420 // Weight kernel - non-tensor 421 //------------------------------------------------------------------------------ 422 static int CeedBasisApplyNonTensorWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, const CeedBasisNonTensor_Sycl *impl, CeedScalar *d_V) { 423 const CeedInt num_qpts = impl->num_qpts; 424 const CeedScalar *q_weight = 
impl->d_q_weight; 425 426 sycl::range<2> kernel_range(num_elem, num_qpts); 427 428 // Order queue 429 sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); 430 sycl_queue.parallel_for<CeedBasisSyclWeightNT>(kernel_range, {e}, [=](sycl::id<2> indx) { 431 const CeedInt i = indx[1]; 432 const CeedInt elem = indx[0]; 433 d_V[i + elem * num_qpts] = q_weight[i]; 434 }); 435 return CEED_ERROR_SUCCESS; 436 } 437 438 //------------------------------------------------------------------------------ 439 // Basis apply - non-tensor 440 //------------------------------------------------------------------------------ 441 static int CeedBasisApplyNonTensor_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 442 CeedVector v) { 443 Ceed ceed; 444 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 445 const CeedScalar *d_u; 446 CeedScalar *d_v; 447 CeedBasisNonTensor_Sycl *impl; 448 Ceed_Sycl *data; 449 450 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 451 CeedCallBackend(CeedBasisGetData(basis, &impl)); 452 CeedCallBackend(CeedGetData(ceed, &data)); 453 454 // Get read/write access to u, v 455 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 456 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 457 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 458 459 // Clear v for transpose operation 460 if (is_transpose) { 461 CeedSize length; 462 CeedCallBackend(CeedVectorGetLength(v, &length)); 463 // Order queue 464 sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier(); 465 data->sycl_queue.fill<CeedScalar>(d_v, 0, length, {e}); 466 } 467 468 // Apply basis operation 469 switch (eval_mode) { 470 case CEED_EVAL_INTERP: { 471 CeedCallBackend(CeedBasisApplyNonTensorInterp_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v)); 472 } break; 473 case CEED_EVAL_GRAD: { 474 
CeedCallBackend(CeedBasisApplyNonTensorGrad_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v)); 475 } break; 476 case CEED_EVAL_WEIGHT: { 477 CeedCallBackend(CeedBasisApplyNonTensorWeight_Sycl(data->sycl_queue, num_elem, impl, d_v)); 478 } break; 479 case CEED_EVAL_NONE: /* handled separately below */ 480 break; 481 // LCOV_EXCL_START 482 case CEED_EVAL_DIV: 483 case CEED_EVAL_CURL: 484 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 485 // LCOV_EXCL_STOP 486 } 487 488 // Restore vectors, cover CEED_EVAL_NONE 489 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 490 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 491 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 492 493 return CEED_ERROR_SUCCESS; 494 } 495 496 //------------------------------------------------------------------------------ 497 // Destroy tensor basis 498 //------------------------------------------------------------------------------ 499 static int CeedBasisDestroy_Sycl(CeedBasis basis) { 500 Ceed ceed; 501 CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 502 CeedBasis_Sycl *impl; 503 CeedCallBackend(CeedBasisGetData(basis, &impl)); 504 Ceed_Sycl *data; 505 CeedCallBackend(CeedGetData(ceed, &data)); 506 507 // Wait for all work to finish before freeing memory 508 CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); 509 510 CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context)); 511 CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context)); 512 CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context)); 513 514 CeedCallBackend(CeedFree(&impl)); 515 return CEED_ERROR_SUCCESS; 516 } 517 518 //------------------------------------------------------------------------------ 519 // Destroy non-tensor basis 520 //------------------------------------------------------------------------------ 521 
// Free all device-side resources owned by a non-tensor basis.
static int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) {
  Ceed ceed;
  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedBasisNonTensor_Sycl *impl;
  CeedCallBackend(CeedBasisGetData(basis, &impl));
  Ceed_Sycl *data;
  CeedCallBackend(CeedGetData(ceed, &data));

  // Wait for all work to finish before freeing memory
  CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());

  // Device allocations made in CeedBasisCreateH1_Sycl
  CeedCallSycl(ceed, sycl::free(impl->d_q_weight, data->sycl_context));
  CeedCallSycl(ceed, sycl::free(impl->d_interp, data->sycl_context));
  CeedCallSycl(ceed, sycl::free(impl->d_grad, data->sycl_context));

  CeedCallBackend(CeedFree(&impl));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create tensor
//
// Allocates device copies of the 1D operators, then builds a kernel bundle for
// the four tensor apply kernels with the basis sizes baked in as SYCL
// specialization constants.
//------------------------------------------------------------------------------
int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
  Ceed ceed;
  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedBasis_Sycl *impl;
  CeedCallBackend(CeedCalloc(1, &impl));
  Ceed_Sycl *data;
  CeedCallBackend(CeedGetData(ceed, &data));

  CeedInt num_comp;
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));

  const CeedInt num_nodes = CeedIntPow(P_1d, dim);
  const CeedInt num_qpts = CeedIntPow(Q_1d, dim);

  impl->dim = dim;
  impl->P_1d = P_1d;
  impl->Q_1d = Q_1d;
  impl->num_comp = num_comp;
  impl->num_nodes = num_nodes;
  impl->num_qpts = num_qpts;
  // buf_len sizes the kernels' local ping-pong buffers; op_len sizes the
  // staged 1D operator
  impl->buf_len = num_comp * CeedIntMax(num_nodes, num_qpts);
  impl->op_len = Q_1d * P_1d;

  // Order queue
  sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();

  // Copy 1D quadrature weights, interpolation, and gradient operators to the
  // device; copies run asynchronously and are awaited together below
  CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, {e});

  const CeedInt interp_length = Q_1d * P_1d;
  CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, {e});

  // grad_1d has the same Q_1d x P_1d shape as interp_1d
  CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, {e});

  CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));

  // Both transpose (1) and no-transpose (0) instantiations of each kernel go
  // into one bundle
  std::vector<sycl::kernel_id> kernel_ids = {sycl::get_kernel_id<CeedBasisSyclInterp<1>>(), sycl::get_kernel_id<CeedBasisSyclInterp<0>>(),
                                             sycl::get_kernel_id<CeedBasisSyclGrad<1>>(), sycl::get_kernel_id<CeedBasisSyclGrad<0>>()};

  // Specialization constants must be set before the bundle is built
  sycl::kernel_bundle<sycl::bundle_state::input> input_bundle = sycl::get_kernel_bundle<sycl::bundle_state::input>(data->sycl_context, kernel_ids);
  input_bundle.set_specialization_constant<BASIS_DIM_ID>(dim);
  input_bundle.set_specialization_constant<BASIS_NUM_COMP_ID>(num_comp);
  input_bundle.set_specialization_constant<BASIS_Q_1D_ID>(Q_1d);
  input_bundle.set_specialization_constant<BASIS_P_1D_ID>(P_1d);

  // NOTE(review): allocated with `new`; ensure CeedBasisDestroy_Sycl releases it
  CeedCallSycl(ceed, impl->sycl_module = new SyclModule_t(sycl::build(input_bundle)));

  CeedCallBackend(CeedBasisSetData(basis, impl));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApply_Sycl));
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Create non-tensor
//------------------------------------------------------------------------------
// Allocates device copies of the dense interp/grad operators and quadrature
// weights for a non-tensor H1 basis, then registers the non-tensor apply and
// destroy callbacks. q_ref is not copied to the device here.
int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                           const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
  Ceed ceed;
  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedBasisNonTensor_Sycl *impl;
  CeedCallBackend(CeedCalloc(1, &impl));
  Ceed_Sycl *data;
  CeedCallBackend(CeedGetData(ceed, &data));

  CeedInt num_comp;
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));

  impl->dim = dim;
  impl->num_comp = num_comp;
  impl->num_nodes = num_nodes;
  impl->num_qpts = num_qpts;

  // Order queue
  sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();

  // Asynchronous host-to-device copies; all three are awaited together below
  CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
  sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, {e});

  // Dense interpolation matrix: num_qpts x num_nodes
  const CeedInt interp_length = num_qpts * num_nodes;
  CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, {e});

  // Gradient carries one num_qpts x num_nodes matrix per dimension
  const CeedInt grad_length = num_qpts * num_nodes * dim;
  CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context));
  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, {e});

  CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_weight, copy_interp, copy_grad}));

  CeedCallBackend(CeedBasisSetData(basis, impl));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Sycl));
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Sycl));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------