1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 #include <ceed/backend.h>
9 #include <ceed/ceed.h>
10 #include <ceed/jit-tools.h>
11
12 #include <sycl/sycl.hpp>
13 #include <vector>
14
15 #include "../sycl/ceed-sycl-compile.hpp"
16 #include "ceed-sycl-ref.hpp"
17
// Kernel name tags for the tensor basis kernels; the template parameter
// selects the transpose (1) vs. forward (0) variant of the operator
template <int>
class CeedBasisSyclInterp;
template <int>
class CeedBasisSyclGrad;
class CeedBasisSyclWeight;

// Kernel name tags for the non-tensor basis kernels
class CeedBasisSyclInterpNT;
class CeedBasisSyclGradNT;
class CeedBasisSyclWeightNT;

using SpecID = sycl::specialization_id<CeedInt>;

// Specialization constants baking the basis sizes into the tensor kernels at
// kernel-bundle build time (values set in CeedBasisCreateTensorH1_Sycl)
static constexpr SpecID BASIS_DIM_ID;
static constexpr SpecID BASIS_NUM_COMP_ID;
static constexpr SpecID BASIS_P_1D_ID;
static constexpr SpecID BASIS_Q_1D_ID;
34
35 //------------------------------------------------------------------------------
36 // Interpolation kernel - tensor
37 //------------------------------------------------------------------------------
38 template <int is_transpose>
CeedBasisApplyInterp_Sycl(sycl::queue & sycl_queue,const SyclModule_t & sycl_module,CeedInt num_elem,const CeedBasis_Sycl * impl,const CeedScalar * u,CeedScalar * v)39 static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
40 const CeedScalar *u, CeedScalar *v) {
41 const CeedInt buf_len = impl->buf_len;
42 const CeedInt op_len = impl->op_len;
43 const CeedScalar *interp_1d = impl->d_interp_1d;
44
45 const sycl::device &sycl_device = sycl_queue.get_device();
46 const CeedInt max_work_group_size = 32;
47 const CeedInt work_group_size = CeedIntMin(impl->num_qpts, max_work_group_size);
48 sycl::range<1> local_range(work_group_size);
49 sycl::range<1> global_range(num_elem * work_group_size);
50 sycl::nd_range<1> kernel_range(global_range, local_range);
51
52 std::vector<sycl::event> e;
53
54 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
55
56 sycl_queue.submit([&](sycl::handler &cgh) {
57 cgh.depends_on(e);
58 cgh.use_kernel_bundle(sycl_module);
59
60 sycl::local_accessor<CeedScalar> s_mem(op_len + 2 * buf_len, cgh);
61
62 cgh.parallel_for<CeedBasisSyclInterp<is_transpose>>(kernel_range, [=](sycl::nd_item<1> work_item, sycl::kernel_handler kh) {
63 //-------------------------------------------------------------->
64 // Retrieve spec constant values
65 const CeedInt dim = kh.get_specialization_constant<BASIS_DIM_ID>();
66 const CeedInt num_comp = kh.get_specialization_constant<BASIS_NUM_COMP_ID>();
67 const CeedInt P_1d = kh.get_specialization_constant<BASIS_P_1D_ID>();
68 const CeedInt Q_1d = kh.get_specialization_constant<BASIS_Q_1D_ID>();
69 //-------------------------------------------------------------->
70 const CeedInt num_nodes = CeedIntPow(P_1d, dim);
71 const CeedInt num_qpts = CeedIntPow(Q_1d, dim);
72 const CeedInt P = is_transpose ? Q_1d : P_1d;
73 const CeedInt Q = is_transpose ? P_1d : Q_1d;
74 const CeedInt stride_0 = is_transpose ? 1 : P_1d;
75 const CeedInt stride_1 = is_transpose ? P_1d : 1;
76 const CeedInt u_stride = is_transpose ? num_qpts : num_nodes;
77 const CeedInt v_stride = is_transpose ? num_nodes : num_qpts;
78 const CeedInt u_comp_stride = num_elem * u_stride;
79 const CeedInt v_comp_stride = num_elem * v_stride;
80 const CeedInt u_size = u_stride;
81
82 sycl::group work_group = work_item.get_group();
83 const CeedInt i = work_item.get_local_linear_id();
84 const CeedInt group_size = work_group.get_local_linear_range();
85 const CeedInt elem = work_group.get_group_linear_id();
86
87 CeedScalar *s_interp_1d = s_mem.get_multi_ptr<sycl::access::decorated::yes>().get();
88 CeedScalar *s_buffer_1 = s_interp_1d + Q * P;
89 CeedScalar *s_buffer_2 = s_buffer_1 + buf_len;
90
91 for (CeedInt k = i; k < P * Q; k += group_size) {
92 s_interp_1d[k] = interp_1d[k];
93 }
94
95 // Apply basis element by element
96 for (CeedInt comp = 0; comp < num_comp; comp++) {
97 const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
98 CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
99
100 for (CeedInt k = i; k < u_size; k += group_size) {
101 s_buffer_1[k] = cur_u[k];
102 }
103
104 CeedInt pre = u_size;
105 CeedInt post = 1;
106
107 for (CeedInt d = 0; d < dim; d++) {
108 // Use older version of sycl workgroup barrier for performance reasons
109 // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed
110 // sycl::group_barrier(work_group);
111 work_item.barrier(sycl::access::fence_space::local_space);
112
113 pre /= P;
114 const CeedScalar *in = d % 2 ? s_buffer_2 : s_buffer_1;
115 CeedScalar *out = d == dim - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
116
117 // Contract along middle index
118 const CeedInt writeLen = pre * post * Q;
119 for (CeedInt k = i; k < writeLen; k += group_size) {
120 const CeedInt c = k % post;
121 const CeedInt j = (k / post) % Q;
122 const CeedInt a = k / (post * Q);
123
124 CeedScalar vk = 0;
125 for (CeedInt b = 0; b < P; b++) {
126 vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
127 }
128 out[k] = vk;
129 }
130 post *= Q;
131 }
132 }
133 });
134 });
135 return CEED_ERROR_SUCCESS;
136 }
137
138 //------------------------------------------------------------------------------
139 // Gradient kernel - tensor
140 //------------------------------------------------------------------------------
141 template <int is_transpose>
CeedBasisApplyGrad_Sycl(sycl::queue & sycl_queue,const SyclModule_t & sycl_module,CeedInt num_elem,const CeedBasis_Sycl * impl,const CeedScalar * u,CeedScalar * v)142 static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
143 const CeedScalar *u, CeedScalar *v) {
144 const CeedInt buf_len = impl->buf_len;
145 const CeedInt op_len = impl->op_len;
146 const CeedScalar *interp_1d = impl->d_interp_1d;
147 const CeedScalar *grad_1d = impl->d_grad_1d;
148
149 const sycl::device &sycl_device = sycl_queue.get_device();
150 const CeedInt work_group_size = 32;
151 sycl::range<1> local_range(work_group_size);
152 sycl::range<1> global_range(num_elem * work_group_size);
153 sycl::nd_range<1> kernel_range(global_range, local_range);
154
155 std::vector<sycl::event> e;
156
157 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
158
159 sycl_queue.submit([&](sycl::handler &cgh) {
160 cgh.depends_on(e);
161 cgh.use_kernel_bundle(sycl_module);
162
163 sycl::local_accessor<CeedScalar> s_mem(2 * (op_len + buf_len), cgh);
164
165 cgh.parallel_for<CeedBasisSyclGrad<is_transpose>>(kernel_range, [=](sycl::nd_item<1> work_item, sycl::kernel_handler kh) {
166 //-------------------------------------------------------------->
167 // Retrieve spec constant values
168 const CeedInt dim = kh.get_specialization_constant<BASIS_DIM_ID>();
169 const CeedInt num_comp = kh.get_specialization_constant<BASIS_NUM_COMP_ID>();
170 const CeedInt P_1d = kh.get_specialization_constant<BASIS_P_1D_ID>();
171 const CeedInt Q_1d = kh.get_specialization_constant<BASIS_Q_1D_ID>();
172 //-------------------------------------------------------------->
173 const CeedInt num_nodes = CeedIntPow(P_1d, dim);
174 const CeedInt num_qpts = CeedIntPow(Q_1d, dim);
175 const CeedInt P = is_transpose ? Q_1d : P_1d;
176 const CeedInt Q = is_transpose ? P_1d : Q_1d;
177 const CeedInt stride_0 = is_transpose ? 1 : P_1d;
178 const CeedInt stride_1 = is_transpose ? P_1d : 1;
179 const CeedInt u_stride = is_transpose ? num_qpts : num_nodes;
180 const CeedInt v_stride = is_transpose ? num_nodes : num_qpts;
181 const CeedInt u_comp_stride = num_elem * u_stride;
182 const CeedInt v_comp_stride = num_elem * v_stride;
183 const CeedInt u_dim_stride = is_transpose ? num_elem * num_qpts * num_comp : 0;
184 const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * num_qpts * num_comp;
185 sycl::group work_group = work_item.get_group();
186 const CeedInt i = work_item.get_local_linear_id();
187 const CeedInt group_size = work_group.get_local_linear_range();
188 const CeedInt elem = work_group.get_group_linear_id();
189
190 CeedScalar *s_interp_1d = s_mem.get_multi_ptr<sycl::access::decorated::yes>().get();
191 CeedScalar *s_grad_1d = s_interp_1d + P * Q;
192 CeedScalar *s_buffer_1 = s_grad_1d + P * Q;
193 CeedScalar *s_buffer_2 = s_buffer_1 + buf_len;
194
195 for (CeedInt k = i; k < P * Q; k += group_size) {
196 s_interp_1d[k] = interp_1d[k];
197 s_grad_1d[k] = grad_1d[k];
198 }
199
200 // Apply basis element by element
201 for (CeedInt comp = 0; comp < num_comp; comp++) {
202 for (CeedInt dim_1 = 0; dim_1 < dim; dim_1++) {
203 CeedInt pre = is_transpose ? num_qpts : num_nodes;
204 CeedInt post = 1;
205 const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride;
206 CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride;
207
208 for (CeedInt dim_2 = 0; dim_2 < dim; dim_2++) {
209 // Use older version of sycl workgroup barrier for performance reasons
210 // Can be updated in future to align with SYCL2020 spec if performance bottleneck is removed
211 // sycl::group_barrier(work_group);
212 work_item.barrier(sycl::access::fence_space::local_space);
213
214 pre /= P;
215 const CeedScalar *op = dim_1 == dim_2 ? s_grad_1d : s_interp_1d;
216 const CeedScalar *in = (dim_2 == 0 ? cur_u : (dim_2 % 2 ? s_buffer_2 : s_buffer_1));
217 CeedScalar *out = dim_2 == dim - 1 ? cur_v : (dim_2 % 2 ? s_buffer_1 : s_buffer_2);
218
219 // Contract along middle index
220 const CeedInt writeLen = pre * post * Q;
221 for (CeedInt k = i; k < writeLen; k += group_size) {
222 const CeedInt c = k % post;
223 const CeedInt j = (k / post) % Q;
224 const CeedInt a = k / (post * Q);
225
226 CeedScalar v_k = 0;
227 for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c];
228
229 if (is_transpose && dim_2 == dim - 1) out[k] += v_k;
230 else out[k] = v_k;
231 }
232
233 post *= Q;
234 }
235 }
236 }
237 });
238 });
239 return CEED_ERROR_SUCCESS;
240 }
241
242 //------------------------------------------------------------------------------
243 // Weight kernel - tensor
244 //------------------------------------------------------------------------------
CeedBasisApplyWeight_Sycl(sycl::queue & sycl_queue,CeedInt num_elem,const CeedBasis_Sycl * impl,CeedScalar * w)245 static int CeedBasisApplyWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, const CeedBasis_Sycl *impl, CeedScalar *w) {
246 const CeedInt dim = impl->dim;
247 const CeedInt Q_1d = impl->Q_1d;
248 const CeedScalar *q_weight_1d = impl->d_q_weight_1d;
249
250 const CeedInt num_quad_x = Q_1d;
251 const CeedInt num_quad_y = (dim > 1) ? Q_1d : 1;
252 const CeedInt num_quad_z = (dim > 2) ? Q_1d : 1;
253 sycl::range<3> kernel_range(num_elem * num_quad_z, num_quad_y, num_quad_x);
254
255 std::vector<sycl::event> e;
256
257 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
258
259 sycl_queue.parallel_for<CeedBasisSyclWeight>(kernel_range, e, [=](sycl::item<3> work_item) {
260 if (dim == 1) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]];
261 if (dim == 2) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]];
262 if (dim == 3) w[work_item.get_linear_id()] = q_weight_1d[work_item[2]] * q_weight_1d[work_item[1]] * q_weight_1d[work_item[0] % Q_1d];
263 });
264 return CEED_ERROR_SUCCESS;
265 }
266
267 //------------------------------------------------------------------------------
268 // Basis apply - tensor
269 //------------------------------------------------------------------------------
// Dispatches a tensor basis application to the interp/grad/weight kernels.
// u may be CEED_VECTOR_NONE only for CEED_EVAL_WEIGHT (which takes no input);
// CEED_EVAL_NONE simply aliases the input data into v.
static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
                               CeedVector v) {
  Ceed              ceed;
  const CeedInt     is_transpose = t_mode == CEED_TRANSPOSE;
  const CeedScalar *d_u;
  CeedScalar       *d_v;
  Ceed_Sycl        *data;
  CeedBasis_Sycl   *impl;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedGetData(ceed, &data));
  CeedCallBackend(CeedBasisGetData(basis, &impl));

  // Get read/write access to u, v
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
  CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

  // Clear v for transpose operation (the transpose gradient kernel
  // accumulates into v)
  if (is_transpose) {
    CeedSize length;
    CeedCallBackend(CeedVectorGetLength(v, &length));
    std::vector<sycl::event> e;

    // Only out-of-order queues need a barrier to order the fill against
    // earlier submissions
    if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};
    data->sycl_queue.fill<CeedScalar>(d_v, 0, length, e);
  }

  // Basis action
  switch (eval_mode) {
    case CEED_EVAL_INTERP:
      if (is_transpose) {
        CeedCallBackend(CeedBasisApplyInterp_Sycl<true>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
      } else {
        CeedCallBackend(CeedBasisApplyInterp_Sycl<false>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
      }
      break;
    case CEED_EVAL_GRAD:
      if (is_transpose) {
        CeedCallBackend(CeedBasisApplyGrad_Sycl<true>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
      } else {
        CeedCallBackend(CeedBasisApplyGrad_Sycl<false>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
      }
      break;
    case CEED_EVAL_WEIGHT:
      // Weight data is optional at creation; reject if it was never supplied
      CeedCheck(impl->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weight_1d not set", CeedEvalModes[eval_mode]);
      CeedCallBackend(CeedBasisApplyWeight_Sycl(data->sycl_queue, num_elem, impl, d_v));
      break;
    case CEED_EVAL_NONE: /* handled separately below */
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
    // LCOV_EXCL_STOP
  }

  // Restore vectors, cover CEED_EVAL_NONE
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
334
335 //------------------------------------------------------------------------------
336 // Interpolation kernel - non-tensor
337 //------------------------------------------------------------------------------
CeedBasisApplyNonTensorInterp_Sycl(sycl::queue & sycl_queue,CeedInt num_elem,CeedInt is_transpose,const CeedBasisNonTensor_Sycl * impl,const CeedScalar * d_U,CeedScalar * d_V)338 static int CeedBasisApplyNonTensorInterp_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, CeedInt is_transpose, const CeedBasisNonTensor_Sycl *impl,
339 const CeedScalar *d_U, CeedScalar *d_V) {
340 const CeedInt num_comp = impl->num_comp;
341 const CeedInt P = is_transpose ? impl->num_qpts : impl->num_nodes;
342 const CeedInt Q = is_transpose ? impl->num_nodes : impl->num_qpts;
343 const CeedInt stride_0 = is_transpose ? 1 : impl->num_nodes;
344 const CeedInt stride_1 = is_transpose ? impl->num_nodes : 1;
345 const CeedInt u_stride = P;
346 const CeedInt v_stride = Q;
347 const CeedInt u_comp_stride = u_stride * num_elem;
348 const CeedInt v_comp_stride = v_stride * num_elem;
349 const CeedInt u_size = P;
350 const CeedInt v_size = Q;
351 const CeedScalar *d_B = impl->d_interp;
352
353 sycl::range<2> kernel_range(num_elem, v_size);
354
355 std::vector<sycl::event> e;
356
357 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
358
359 sycl_queue.parallel_for<CeedBasisSyclInterpNT>(kernel_range, e, [=](sycl::id<2> indx) {
360 const CeedInt i = indx[1];
361 const CeedInt elem = indx[0];
362
363 for (CeedInt comp = 0; comp < num_comp; comp++) {
364 const CeedScalar *U = d_U + elem * u_stride + comp * u_comp_stride;
365 CeedScalar V = 0.0;
366
367 for (CeedInt j = 0; j < u_size; ++j) {
368 V += d_B[i * stride_0 + j * stride_1] * U[j];
369 }
370 d_V[i + elem * v_stride + comp * v_comp_stride] = V;
371 }
372 });
373 return CEED_ERROR_SUCCESS;
374 }
375
376 //------------------------------------------------------------------------------
377 // Gradient kernel - non-tensor
378 //------------------------------------------------------------------------------
CeedBasisApplyNonTensorGrad_Sycl(sycl::queue & sycl_queue,CeedInt num_elem,CeedInt is_transpose,const CeedBasisNonTensor_Sycl * impl,const CeedScalar * d_U,CeedScalar * d_V)379 static int CeedBasisApplyNonTensorGrad_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, CeedInt is_transpose, const CeedBasisNonTensor_Sycl *impl,
380 const CeedScalar *d_U, CeedScalar *d_V) {
381 const CeedInt num_comp = impl->num_comp;
382 const CeedInt P = is_transpose ? impl->num_qpts : impl->num_nodes;
383 const CeedInt Q = is_transpose ? impl->num_nodes : impl->num_qpts;
384 const CeedInt stride_0 = is_transpose ? 1 : impl->num_nodes;
385 const CeedInt stride_1 = is_transpose ? impl->num_nodes : 1;
386 const CeedInt g_dim_stride = P * Q;
387 const CeedInt u_stride = P;
388 const CeedInt v_stride = Q;
389 const CeedInt u_comp_stride = u_stride * num_elem;
390 const CeedInt v_comp_stride = v_stride * num_elem;
391 const CeedInt u_dim_stride = u_comp_stride * num_comp;
392 const CeedInt v_dim_stride = v_comp_stride * num_comp;
393 const CeedInt u_size = P;
394 const CeedInt v_size = Q;
395 const CeedInt in_dim = is_transpose ? impl->dim : 1;
396 const CeedInt out_dim = is_transpose ? 1 : impl->dim;
397 const CeedScalar *d_G = impl->d_grad;
398
399 sycl::range<2> kernel_range(num_elem, v_size);
400
401 std::vector<sycl::event> e;
402
403 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
404
405 sycl_queue.parallel_for<CeedBasisSyclGradNT>(kernel_range, e, [=](sycl::id<2> indx) {
406 const CeedInt i = indx[1];
407 const CeedInt elem = indx[0];
408
409 for (CeedInt comp = 0; comp < num_comp; comp++) {
410 CeedScalar V[3] = {0.0, 0.0, 0.0};
411
412 for (CeedInt d1 = 0; d1 < in_dim; ++d1) {
413 const CeedScalar *U = d_U + elem * u_stride + comp * u_comp_stride + d1 * u_dim_stride;
414 const CeedScalar *G = d_G + i * stride_0 + d1 * g_dim_stride;
415
416 for (CeedInt j = 0; j < u_size; ++j) {
417 const CeedScalar Uj = U[j];
418
419 for (CeedInt d0 = 0; d0 < out_dim; ++d0) {
420 V[d0] += G[j * stride_1 + d0 * g_dim_stride] * Uj;
421 }
422 }
423 }
424 for (CeedInt d0 = 0; d0 < out_dim; ++d0) {
425 d_V[i + elem * v_stride + comp * v_comp_stride + d0 * v_dim_stride] = V[d0];
426 }
427 }
428 });
429 return CEED_ERROR_SUCCESS;
430 }
431
432 //------------------------------------------------------------------------------
433 // Weight kernel - non-tensor
434 //------------------------------------------------------------------------------
CeedBasisApplyNonTensorWeight_Sycl(sycl::queue & sycl_queue,CeedInt num_elem,const CeedBasisNonTensor_Sycl * impl,CeedScalar * d_V)435 static int CeedBasisApplyNonTensorWeight_Sycl(sycl::queue &sycl_queue, CeedInt num_elem, const CeedBasisNonTensor_Sycl *impl, CeedScalar *d_V) {
436 const CeedInt num_qpts = impl->num_qpts;
437 const CeedScalar *q_weight = impl->d_q_weight;
438
439 sycl::range<2> kernel_range(num_elem, num_qpts);
440
441 std::vector<sycl::event> e;
442
443 if (!sycl_queue.is_in_order()) e = {sycl_queue.ext_oneapi_submit_barrier()};
444
445 sycl_queue.parallel_for<CeedBasisSyclWeightNT>(kernel_range, e, [=](sycl::id<2> indx) {
446 const CeedInt i = indx[1];
447 const CeedInt elem = indx[0];
448 d_V[i + elem * num_qpts] = q_weight[i];
449 });
450 return CEED_ERROR_SUCCESS;
451 }
452
453 //------------------------------------------------------------------------------
454 // Basis apply - non-tensor
455 //------------------------------------------------------------------------------
CeedBasisApplyNonTensor_Sycl(CeedBasis basis,const CeedInt num_elem,CeedTransposeMode t_mode,CeedEvalMode eval_mode,CeedVector u,CeedVector v)456 static int CeedBasisApplyNonTensor_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
457 CeedVector v) {
458 Ceed ceed;
459 const CeedInt is_transpose = t_mode == CEED_TRANSPOSE;
460 const CeedScalar *d_u;
461 CeedScalar *d_v;
462 CeedBasisNonTensor_Sycl *impl;
463 Ceed_Sycl *data;
464
465 CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
466 CeedCallBackend(CeedBasisGetData(basis, &impl));
467 CeedCallBackend(CeedGetData(ceed, &data));
468
469 // Get read/write access to u, v
470 if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
471 else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
472 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
473
474 // Clear v for transpose operation
475 if (is_transpose) {
476 CeedSize length;
477 CeedCallBackend(CeedVectorGetLength(v, &length));
478 // Order queue
479 sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier();
480 data->sycl_queue.fill<CeedScalar>(d_v, 0, length, {e});
481 }
482
483 // Apply basis operation
484 switch (eval_mode) {
485 case CEED_EVAL_INTERP:
486 CeedCallBackend(CeedBasisApplyNonTensorInterp_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v));
487 break;
488 case CEED_EVAL_GRAD:
489 CeedCallBackend(CeedBasisApplyNonTensorGrad_Sycl(data->sycl_queue, num_elem, is_transpose, impl, d_u, d_v));
490 break;
491 case CEED_EVAL_WEIGHT:
492 CeedCheck(impl->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
493 CeedCallBackend(CeedBasisApplyNonTensorWeight_Sycl(data->sycl_queue, num_elem, impl, d_v));
494 break;
495 case CEED_EVAL_NONE: /* handled separately below */
496 break;
497 // LCOV_EXCL_START
498 case CEED_EVAL_DIV:
499 case CEED_EVAL_CURL:
500 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
501 // LCOV_EXCL_STOP
502 }
503
504 // Restore vectors, cover CEED_EVAL_NONE
505 CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
506 if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
507 if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
508 CeedCallBackend(CeedDestroy(&ceed));
509 return CEED_ERROR_SUCCESS;
510 }
511
512 //------------------------------------------------------------------------------
513 // Destroy tensor basis
514 //------------------------------------------------------------------------------
CeedBasisDestroy_Sycl(CeedBasis basis)515 static int CeedBasisDestroy_Sycl(CeedBasis basis) {
516 Ceed ceed;
517 CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
518 CeedBasis_Sycl *impl;
519 CeedCallBackend(CeedBasisGetData(basis, &impl));
520 Ceed_Sycl *data;
521 CeedCallBackend(CeedGetData(ceed, &data));
522
523 // Wait for all work to finish before freeing memory
524 CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
525
526 if (impl->d_q_weight_1d) CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context));
527 CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context));
528 CeedCallSycl(ceed, sycl::free(impl->d_grad_1d, data->sycl_context));
529
530 CeedCallBackend(CeedFree(&impl));
531 CeedCallBackend(CeedDestroy(&ceed));
532 return CEED_ERROR_SUCCESS;
533 }
534
535 //------------------------------------------------------------------------------
536 // Destroy non-tensor basis
537 //------------------------------------------------------------------------------
CeedBasisDestroyNonTensor_Sycl(CeedBasis basis)538 static int CeedBasisDestroyNonTensor_Sycl(CeedBasis basis) {
539 Ceed ceed;
540 CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
541 CeedBasisNonTensor_Sycl *impl;
542 CeedCallBackend(CeedBasisGetData(basis, &impl));
543 Ceed_Sycl *data;
544 CeedCallBackend(CeedGetData(ceed, &data));
545
546 // Wait for all work to finish before freeing memory
547 CeedCallSycl(ceed, data->sycl_queue.wait_and_throw());
548
549 if (impl->d_q_weight) CeedCallSycl(ceed, sycl::free(impl->d_q_weight, data->sycl_context));
550 CeedCallSycl(ceed, sycl::free(impl->d_interp, data->sycl_context));
551 CeedCallSycl(ceed, sycl::free(impl->d_grad, data->sycl_context));
552
553 CeedCallBackend(CeedFree(&impl));
554 CeedCallBackend(CeedDestroy(&ceed));
555 return CEED_ERROR_SUCCESS;
556 }
557
558 //------------------------------------------------------------------------------
559 // Create tensor
560 //------------------------------------------------------------------------------
// Creates the SYCL backend data for a tensor-product H1 basis: copies the 1D
// operators to the device, pre-builds the interp/grad kernels with the basis
// sizes baked in as specialization constants, and registers Apply/Destroy.
// q_ref_1d is accepted for interface compatibility but not stored on device.
int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
  Ceed            ceed;
  CeedBasis_Sycl *impl;
  Ceed_Sycl      *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &impl));
  CeedCallBackend(CeedGetData(ceed, &data));

  CeedInt num_comp;
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));

  const CeedInt num_nodes = CeedIntPow(P_1d, dim);
  const CeedInt num_qpts  = CeedIntPow(Q_1d, dim);

  impl->dim       = dim;
  impl->P_1d      = P_1d;
  impl->Q_1d      = Q_1d;
  impl->num_comp  = num_comp;
  impl->num_nodes = num_nodes;
  impl->num_qpts  = num_qpts;
  // Shared-memory sizing for the apply kernels: contraction buffer length and
  // 1D operator length
  impl->buf_len = num_comp * CeedIntMax(num_nodes, num_qpts);
  impl->op_len  = Q_1d * P_1d;

  std::vector<sycl::event> e;

  // Out-of-order queues need a barrier to order the copies against prior work
  if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};

  std::vector<sycl::event> copy_events;
  // Weights are optional; d_q_weight_1d stays NULL (from CeedCalloc) if absent
  if (q_weight_1d) {
    CeedCallSycl(ceed, impl->d_q_weight_1d = sycl::malloc_device<CeedScalar>(Q_1d, data->sycl_device, data->sycl_context));
    sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight_1d, impl->d_q_weight_1d, Q_1d, e);
    copy_events.push_back(copy_weight);
  }

  const CeedInt interp_length = Q_1d * P_1d;
  CeedCallSycl(ceed, impl->d_interp_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp_1d, impl->d_interp_1d, interp_length, e);
  copy_events.push_back(copy_interp);

  // grad_1d has the same Q_1d x P_1d shape as interp_1d
  CeedCallSycl(ceed, impl->d_grad_1d = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad_1d, impl->d_grad_1d, interp_length, e);
  copy_events.push_back(copy_grad);

  // Block until all copies complete so the host arrays are safe to release
  CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));

  // Build the four tensor kernels (interp/grad, transpose/forward) once, with
  // dim, num_comp, P_1d, and Q_1d fixed via specialization constants
  std::vector<sycl::kernel_id> kernel_ids = {sycl::get_kernel_id<CeedBasisSyclInterp<1>>(), sycl::get_kernel_id<CeedBasisSyclInterp<0>>(),
                                             sycl::get_kernel_id<CeedBasisSyclGrad<1>>(), sycl::get_kernel_id<CeedBasisSyclGrad<0>>()};

  sycl::kernel_bundle<sycl::bundle_state::input> input_bundle = sycl::get_kernel_bundle<sycl::bundle_state::input>(data->sycl_context, kernel_ids);
  input_bundle.set_specialization_constant<BASIS_DIM_ID>(dim);
  input_bundle.set_specialization_constant<BASIS_NUM_COMP_ID>(num_comp);
  input_bundle.set_specialization_constant<BASIS_Q_1D_ID>(Q_1d);
  input_bundle.set_specialization_constant<BASIS_P_1D_ID>(P_1d);

  // NOTE(review): heap-allocated module owned by impl — ensure it is released
  // in CeedBasisDestroy_Sycl
  CeedCallSycl(ceed, impl->sycl_module = new SyclModule_t(sycl::build(input_bundle)));

  CeedCallBackend(CeedBasisSetData(basis, impl));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApply_Sycl));
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Sycl));
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
627
628 //------------------------------------------------------------------------------
629 // Create non-tensor
630 //------------------------------------------------------------------------------
// Creates the SYCL backend data for a non-tensor H1 basis: copies the full
// interp/grad operators (and optional weights) to the device and registers
// the Apply/Destroy backend functions.  q_ref is accepted for interface
// compatibility but not stored on device.
int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
                           const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
  Ceed                     ceed;
  CeedBasisNonTensor_Sycl *impl;
  Ceed_Sycl               *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &impl));
  CeedCallBackend(CeedGetData(ceed, &data));

  CeedInt num_comp;
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));

  impl->dim       = dim;
  impl->num_comp  = num_comp;
  impl->num_nodes = num_nodes;
  impl->num_qpts  = num_qpts;

  std::vector<sycl::event> e;

  // Out-of-order queues need a barrier to order the copies against prior work
  if (!data->sycl_queue.is_in_order()) e = {data->sycl_queue.ext_oneapi_submit_barrier()};

  std::vector<sycl::event> copy_events;
  // Weights are optional; d_q_weight stays NULL (from CeedCalloc) if absent
  if (q_weight) {
    CeedCallSycl(ceed, impl->d_q_weight = sycl::malloc_device<CeedScalar>(num_qpts, data->sycl_device, data->sycl_context));
    sycl::event copy_weight = data->sycl_queue.copy<CeedScalar>(q_weight, impl->d_q_weight, num_qpts, e);
    copy_events.push_back(copy_weight);
  }

  const CeedInt interp_length = num_qpts * num_nodes;
  CeedCallSycl(ceed, impl->d_interp = sycl::malloc_device<CeedScalar>(interp_length, data->sycl_device, data->sycl_context));
  sycl::event copy_interp = data->sycl_queue.copy<CeedScalar>(interp, impl->d_interp, interp_length, e);
  copy_events.push_back(copy_interp);

  // Gradient operator holds one interp-sized block per dimension
  const CeedInt grad_length = num_qpts * num_nodes * dim;
  CeedCallSycl(ceed, impl->d_grad = sycl::malloc_device<CeedScalar>(grad_length, data->sycl_device, data->sycl_context));
  sycl::event copy_grad = data->sycl_queue.copy<CeedScalar>(grad, impl->d_grad, grad_length, e);
  copy_events.push_back(copy_grad);

  // Block until all copies complete so the host arrays are safe to release
  CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events));

  CeedCallBackend(CeedBasisSetData(basis, impl));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Sycl));
  CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Sycl));
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
680
681 //------------------------------------------------------------------------------
682