10e6b6b59SJacob Faibussowitsch #include "sycldevice.hpp"
245a61cdeSJunchao Zhang #include <sycl/sycl.hpp>
3*7a4760caSJunchao Zhang #include <chrono>
4a2158755SJunchao Zhang
5d71ae5a4SJacob Faibussowitsch namespace Petsc
6d71ae5a4SJacob Faibussowitsch {
7a2158755SJunchao Zhang
8d71ae5a4SJacob Faibussowitsch namespace device
9d71ae5a4SJacob Faibussowitsch {
1017f48955SJacob Faibussowitsch
11d71ae5a4SJacob Faibussowitsch namespace sycl
12d71ae5a4SJacob Faibussowitsch {
1317f48955SJacob Faibussowitsch
14d71ae5a4SJacob Faibussowitsch namespace impl
15d71ae5a4SJacob Faibussowitsch {
1617f48955SJacob Faibussowitsch
179371c9d4SSatish Balay class DeviceContext {
18a2158755SJunchao Zhang public:
19ab4ee011SJunchao Zhang struct PetscDeviceContext_SYCL {
200e6b6b59SJacob Faibussowitsch ::sycl::event event;
210e6b6b59SJacob Faibussowitsch ::sycl::event begin; // timer-only
220e6b6b59SJacob Faibussowitsch ::sycl::event end; // timer-only
23a2158755SJunchao Zhang #if PetscDefined(USE_DEBUG)
24ab4ee011SJunchao Zhang PetscBool timerInUse{};
25a2158755SJunchao Zhang #endif
26ab4ee011SJunchao Zhang ::sycl::queue queue;
27*7a4760caSJunchao Zhang
28*7a4760caSJunchao Zhang std::chrono::time_point<std::chrono::steady_clock> timeBegin{};
29a2158755SJunchao Zhang };
30a2158755SJunchao Zhang
31a2158755SJunchao Zhang private:
32a2158755SJunchao Zhang static bool initialized_;
33a2158755SJunchao Zhang
finalize_()34089fb57cSJacob Faibussowitsch static PetscErrorCode finalize_() noexcept
35d71ae5a4SJacob Faibussowitsch {
36a2158755SJunchao Zhang PetscFunctionBegin;
37a2158755SJunchao Zhang initialized_ = false;
383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
39a2158755SJunchao Zhang }
40a2158755SJunchao Zhang
initialize_(PetscInt id,PetscDeviceContext dctx)41ab4ee011SJunchao Zhang static PetscErrorCode initialize_(PetscInt id, PetscDeviceContext dctx) noexcept
42d71ae5a4SJacob Faibussowitsch {
43a2158755SJunchao Zhang PetscFunctionBegin;
449566063dSJacob Faibussowitsch PetscCall(PetscDeviceCheckDeviceCount_Internal(id));
45a2158755SJunchao Zhang if (!initialized_) {
46a2158755SJunchao Zhang initialized_ = true;
479566063dSJacob Faibussowitsch PetscCall(PetscRegisterFinalize(finalize_));
48a2158755SJunchao Zhang }
493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
50a2158755SJunchao Zhang }
51a2158755SJunchao Zhang
52a2158755SJunchao Zhang public:
533f675fcfSPierre Jolivet const struct _DeviceContextOps ops = {destroy, changeStreamType, setUp, query, waitForContext, synchronize, getBlasHandle, getSolverHandle, getStreamHandle, beginTimer, endTimer, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
54a2158755SJunchao Zhang
55a2158755SJunchao Zhang // default constructor
5617f48955SJacob Faibussowitsch DeviceContext() noexcept = default;
57a2158755SJunchao Zhang
58a2158755SJunchao Zhang // All of these functions MUST be static in order to be callable from C, otherwise they
59a2158755SJunchao Zhang // get the implicit 'this' pointer tacked on
destroy(PetscDeviceContext dctx)60089fb57cSJacob Faibussowitsch static PetscErrorCode destroy(PetscDeviceContext dctx) noexcept
61d71ae5a4SJacob Faibussowitsch {
62a2158755SJunchao Zhang PetscFunctionBegin;
63ab4ee011SJunchao Zhang delete static_cast<PetscDeviceContext_SYCL *>(dctx->data);
64a2158755SJunchao Zhang dctx->data = nullptr;
653ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
66f4f49eeaSPierre Jolivet }
6731d47070SJunchao Zhang
setUp(PetscDeviceContext dctx)6831d47070SJunchao Zhang static PetscErrorCode setUp(PetscDeviceContext dctx) noexcept
6931d47070SJunchao Zhang {
70*7a4760caSJunchao Zhang PetscDevice dev;
71*7a4760caSJunchao Zhang PetscInt id;
72*7a4760caSJunchao Zhang
7331d47070SJunchao Zhang PetscFunctionBegin;
7431d47070SJunchao Zhang #if PetscDefined(USE_DEBUG)
7531d47070SJunchao Zhang static_cast<PetscDeviceContext_SYCL *>(dctx->data)->timerInUse = PETSC_FALSE;
7631d47070SJunchao Zhang #endif
77*7a4760caSJunchao Zhang PetscCall(PetscDeviceContextGetDevice(dctx, &dev));
78*7a4760caSJunchao Zhang PetscCall(PetscDeviceGetDeviceId(dev, &id));
79*7a4760caSJunchao Zhang const ::sycl::device &syclDevice = (id == PETSC_SYCL_DEVICE_HOST) ? ::sycl::device(::sycl::cpu_selector_v) : ::sycl::device::get_devices(::sycl::info::device_type::gpu)[id];
80*7a4760caSJunchao Zhang
81*7a4760caSJunchao Zhang static_cast<PetscDeviceContext_SYCL *>(dctx->data)->queue = ::sycl::queue(syclDevice, ::sycl::property::queue::in_order());
8231d47070SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS);
83f4f49eeaSPierre Jolivet }
8431d47070SJunchao Zhang
query(PetscDeviceContext dctx,PetscBool * idle)8531d47070SJunchao Zhang static PetscErrorCode query(PetscDeviceContext dctx, PetscBool *idle) noexcept
8631d47070SJunchao Zhang {
8731d47070SJunchao Zhang PetscFunctionBegin;
8831d47070SJunchao Zhang // available in future, https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_oneapi_queue_empty.asciidoc
8931d47070SJunchao Zhang // *idle = static_cast<PetscDeviceContext_SYCL*>(dctx->data)->queue.empty() ? PETSC_TRUE : PETSC_FALSE;
9031d47070SJunchao Zhang *idle = PETSC_FALSE;
9131d47070SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS);
92f4f49eeaSPierre Jolivet }
9331d47070SJunchao Zhang
synchronize(PetscDeviceContext dctx)9431d47070SJunchao Zhang static PetscErrorCode synchronize(PetscDeviceContext dctx) noexcept
9531d47070SJunchao Zhang {
9631d47070SJunchao Zhang PetscBool idle = PETSC_TRUE;
9731d47070SJunchao Zhang const auto dci = static_cast<PetscDeviceContext_SYCL *>(dctx->data);
9831d47070SJunchao Zhang
9931d47070SJunchao Zhang PetscFunctionBegin;
10031d47070SJunchao Zhang PetscCall(query(dctx, &idle));
10131d47070SJunchao Zhang if (!idle) PetscCallCXX(dci->queue.wait());
10231d47070SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS);
103f4f49eeaSPierre Jolivet }
10431d47070SJunchao Zhang
getStreamHandle(PetscDeviceContext dctx,void ** handle)10597cd0981SJacob Faibussowitsch static PetscErrorCode getStreamHandle(PetscDeviceContext dctx, void **handle) noexcept
10631d47070SJunchao Zhang {
10731d47070SJunchao Zhang PetscFunctionBegin;
10897cd0981SJacob Faibussowitsch *reinterpret_cast<::sycl::queue **>(handle) = &(static_cast<PetscDeviceContext_SYCL *>(dctx->data)->queue);
10931d47070SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS);
110f4f49eeaSPierre Jolivet }
11131d47070SJunchao Zhang
beginTimer(PetscDeviceContext dctx)11231d47070SJunchao Zhang static PetscErrorCode beginTimer(PetscDeviceContext dctx) noexcept
11331d47070SJunchao Zhang {
11431d47070SJunchao Zhang const auto dci = static_cast<PetscDeviceContext_SYCL *>(dctx->data);
11531d47070SJunchao Zhang
11631d47070SJunchao Zhang PetscFunctionBegin;
11731d47070SJunchao Zhang #if PetscDefined(USE_DEBUG)
11831d47070SJunchao Zhang PetscCheck(!dci->timerInUse, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Forgot to call PetscLogGpuTimeEnd()?");
11931d47070SJunchao Zhang dci->timerInUse = PETSC_TRUE;
12031d47070SJunchao Zhang #endif
121*7a4760caSJunchao Zhang // It is not a good approach to time SYCL kernels because the timer starts at the kernel launch time at host,
122*7a4760caSJunchao Zhang // not at the start of execution time on device. SYCL provides this style of kernel timing:
123*7a4760caSJunchao Zhang /*
124*7a4760caSJunchao Zhang sycl::queue q(sycl::default_selector_v, sycl::property::queue::enable_profiling{});
125*7a4760caSJunchao Zhang sycl::event e = q.submit([&](sycl::handler &h) {
126*7a4760caSJunchao Zhang ...
127*7a4760caSJunchao Zhang });
128*7a4760caSJunchao Zhang e.wait();
129*7a4760caSJunchao Zhang auto start_time = e.get_profiling_info<sycl::info::event_profiling::command_start>();
130*7a4760caSJunchao Zhang auto end_time = e.get_profiling_info<sycl::info::event_profiling::command_end>();
131*7a4760caSJunchao Zhang long long kernel_duration_ns = end_time - start_time;
132*7a4760caSJunchao Zhang */
133*7a4760caSJunchao Zhang // It requires 1) enable profiling at the queue's creation time, and 2) store the event returned by kernel launch.
134*7a4760caSJunchao Zhang // But neither we have control of the input queue, nor does PetscDeviceContext support 2), so we just use a
135*7a4760caSJunchao Zhang // host side timer.
136*7a4760caSJunchao Zhang PetscCallCXX(dci->timeBegin = std::chrono::steady_clock::now());
13731d47070SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS);
138f4f49eeaSPierre Jolivet }
13931d47070SJunchao Zhang
endTimer(PetscDeviceContext dctx,PetscLogDouble * elapsed)14031d47070SJunchao Zhang static PetscErrorCode endTimer(PetscDeviceContext dctx, PetscLogDouble *elapsed) noexcept
14131d47070SJunchao Zhang {
14231d47070SJunchao Zhang const auto dci = static_cast<PetscDeviceContext_SYCL *>(dctx->data);
143*7a4760caSJunchao Zhang std::chrono::duration<double> duration;
14431d47070SJunchao Zhang
14531d47070SJunchao Zhang PetscFunctionBegin;
14631d47070SJunchao Zhang #if PetscDefined(USE_DEBUG)
14731d47070SJunchao Zhang PetscCheck(dci->timerInUse, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Forgot to call PetscLogGpuTimeBegin()?");
14831d47070SJunchao Zhang dci->timerInUse = PETSC_FALSE;
14931d47070SJunchao Zhang #endif
15031d47070SJunchao Zhang PetscCallCXX(dci->queue.wait());
151*7a4760caSJunchao Zhang PetscCallCXX(duration = std::chrono::steady_clock::now() - dci->timeBegin);
152*7a4760caSJunchao Zhang PetscCallCXX(*elapsed = duration.count());
15331d47070SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS);
154f4f49eeaSPierre Jolivet }
15531d47070SJunchao Zhang
changeStreamType(PetscDeviceContext,PetscStreamType)156f4f49eeaSPierre Jolivet static PetscErrorCode changeStreamType(PetscDeviceContext, PetscStreamType) noexcept { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); }
waitForContext(PetscDeviceContext,PetscDeviceContext)157f4f49eeaSPierre Jolivet static PetscErrorCode waitForContext(PetscDeviceContext, PetscDeviceContext) noexcept { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); }
getBlasHandle(PetscDeviceContext,void *)158f4f49eeaSPierre Jolivet static PetscErrorCode getBlasHandle(PetscDeviceContext, void *) noexcept { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); }
getSolverHandle(PetscDeviceContext,void *)159f4f49eeaSPierre Jolivet static PetscErrorCode getSolverHandle(PetscDeviceContext, void *) noexcept { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); }
160a2158755SJunchao Zhang };
161a2158755SJunchao Zhang
1620e6b6b59SJacob Faibussowitsch } // namespace impl
16317f48955SJacob Faibussowitsch
1640e6b6b59SJacob Faibussowitsch } // namespace sycl
16517f48955SJacob Faibussowitsch
1660e6b6b59SJacob Faibussowitsch } // namespace device
16717f48955SJacob Faibussowitsch
168a2158755SJunchao Zhang } // namespace Petsc
169a2158755SJunchao Zhang
PetscDeviceContextCreate_SYCL(PetscDeviceContext dctx)170d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscDeviceContextCreate_SYCL(PetscDeviceContext dctx)
171d71ae5a4SJacob Faibussowitsch {
1720e6b6b59SJacob Faibussowitsch using namespace Petsc::device::sycl::impl;
17317f48955SJacob Faibussowitsch
17417f48955SJacob Faibussowitsch static const DeviceContext syclctx;
175a2158755SJunchao Zhang
176a2158755SJunchao Zhang PetscFunctionBegin;
177ab4ee011SJunchao Zhang PetscCallCXX(dctx->data = new DeviceContext::PetscDeviceContext_SYCL());
178aea10558SJacob Faibussowitsch dctx->ops[0] = syclctx.ops;
1793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
180a2158755SJunchao Zhang }
181