1 #include "sycldevice.hpp"
2 #include <sycl/sycl.hpp>
3 #include <chrono>
4
5 namespace Petsc
6 {
7
8 namespace device
9 {
10
11 namespace sycl
12 {
13
14 namespace impl
15 {
16
17 class DeviceContext {
18 public:
19 struct PetscDeviceContext_SYCL {
20 ::sycl::event event;
21 ::sycl::event begin; // timer-only
22 ::sycl::event end; // timer-only
23 #if PetscDefined(USE_DEBUG)
24 PetscBool timerInUse{};
25 #endif
26 ::sycl::queue queue;
27
28 std::chrono::time_point<std::chrono::steady_clock> timeBegin{};
29 };
30
31 private:
32 static bool initialized_;
33
finalize_()34 static PetscErrorCode finalize_() noexcept
35 {
36 PetscFunctionBegin;
37 initialized_ = false;
38 PetscFunctionReturn(PETSC_SUCCESS);
39 }
40
initialize_(PetscInt id,PetscDeviceContext dctx)41 static PetscErrorCode initialize_(PetscInt id, PetscDeviceContext dctx) noexcept
42 {
43 PetscFunctionBegin;
44 PetscCall(PetscDeviceCheckDeviceCount_Internal(id));
45 if (!initialized_) {
46 initialized_ = true;
47 PetscCall(PetscRegisterFinalize(finalize_));
48 }
49 PetscFunctionReturn(PETSC_SUCCESS);
50 }
51
52 public:
53 const struct _DeviceContextOps ops = {destroy, changeStreamType, setUp, query, waitForContext, synchronize, getBlasHandle, getSolverHandle, getStreamHandle, beginTimer, endTimer, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
54
55 // default constructor
56 DeviceContext() noexcept = default;
57
58 // All of these functions MUST be static in order to be callable from C, otherwise they
59 // get the implicit 'this' pointer tacked on
destroy(PetscDeviceContext dctx)60 static PetscErrorCode destroy(PetscDeviceContext dctx) noexcept
61 {
62 PetscFunctionBegin;
63 delete static_cast<PetscDeviceContext_SYCL *>(dctx->data);
64 dctx->data = nullptr;
65 PetscFunctionReturn(PETSC_SUCCESS);
66 }
67
setUp(PetscDeviceContext dctx)68 static PetscErrorCode setUp(PetscDeviceContext dctx) noexcept
69 {
70 PetscDevice dev;
71 PetscInt id;
72
73 PetscFunctionBegin;
74 #if PetscDefined(USE_DEBUG)
75 static_cast<PetscDeviceContext_SYCL *>(dctx->data)->timerInUse = PETSC_FALSE;
76 #endif
77 PetscCall(PetscDeviceContextGetDevice(dctx, &dev));
78 PetscCall(PetscDeviceGetDeviceId(dev, &id));
79 const ::sycl::device &syclDevice = (id == PETSC_SYCL_DEVICE_HOST) ? ::sycl::device(::sycl::cpu_selector_v) : ::sycl::device::get_devices(::sycl::info::device_type::gpu)[id];
80
81 static_cast<PetscDeviceContext_SYCL *>(dctx->data)->queue = ::sycl::queue(syclDevice, ::sycl::property::queue::in_order());
82 PetscFunctionReturn(PETSC_SUCCESS);
83 }
84
query(PetscDeviceContext dctx,PetscBool * idle)85 static PetscErrorCode query(PetscDeviceContext dctx, PetscBool *idle) noexcept
86 {
87 PetscFunctionBegin;
88 // available in future, https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_oneapi_queue_empty.asciidoc
89 // *idle = static_cast<PetscDeviceContext_SYCL*>(dctx->data)->queue.empty() ? PETSC_TRUE : PETSC_FALSE;
90 *idle = PETSC_FALSE;
91 PetscFunctionReturn(PETSC_SUCCESS);
92 }
93
synchronize(PetscDeviceContext dctx)94 static PetscErrorCode synchronize(PetscDeviceContext dctx) noexcept
95 {
96 PetscBool idle = PETSC_TRUE;
97 const auto dci = static_cast<PetscDeviceContext_SYCL *>(dctx->data);
98
99 PetscFunctionBegin;
100 PetscCall(query(dctx, &idle));
101 if (!idle) PetscCallCXX(dci->queue.wait());
102 PetscFunctionReturn(PETSC_SUCCESS);
103 }
104
getStreamHandle(PetscDeviceContext dctx,void ** handle)105 static PetscErrorCode getStreamHandle(PetscDeviceContext dctx, void **handle) noexcept
106 {
107 PetscFunctionBegin;
108 *reinterpret_cast<::sycl::queue **>(handle) = &(static_cast<PetscDeviceContext_SYCL *>(dctx->data)->queue);
109 PetscFunctionReturn(PETSC_SUCCESS);
110 }
111
beginTimer(PetscDeviceContext dctx)112 static PetscErrorCode beginTimer(PetscDeviceContext dctx) noexcept
113 {
114 const auto dci = static_cast<PetscDeviceContext_SYCL *>(dctx->data);
115
116 PetscFunctionBegin;
117 #if PetscDefined(USE_DEBUG)
118 PetscCheck(!dci->timerInUse, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Forgot to call PetscLogGpuTimeEnd()?");
119 dci->timerInUse = PETSC_TRUE;
120 #endif
121 // It is not a good approach to time SYCL kernels because the timer starts at the kernel launch time at host,
122 // not at the start of execution time on device. SYCL provides this style of kernel timing:
123 /*
124 sycl::queue q(sycl::default_selector_v, sycl::property::queue::enable_profiling{});
125 sycl::event e = q.submit([&](sycl::handler &h) {
126 ...
127 });
128 e.wait();
129 auto start_time = e.get_profiling_info<sycl::info::event_profiling::command_start>();
130 auto end_time = e.get_profiling_info<sycl::info::event_profiling::command_end>();
131 long long kernel_duration_ns = end_time - start_time;
132 */
133 // It requires 1) enable profiling at the queue's creation time, and 2) store the event returned by kernel launch.
134 // But neither we have control of the input queue, nor does PetscDeviceContext support 2), so we just use a
135 // host side timer.
136 PetscCallCXX(dci->timeBegin = std::chrono::steady_clock::now());
137 PetscFunctionReturn(PETSC_SUCCESS);
138 }
139
endTimer(PetscDeviceContext dctx,PetscLogDouble * elapsed)140 static PetscErrorCode endTimer(PetscDeviceContext dctx, PetscLogDouble *elapsed) noexcept
141 {
142 const auto dci = static_cast<PetscDeviceContext_SYCL *>(dctx->data);
143 std::chrono::duration<double> duration;
144
145 PetscFunctionBegin;
146 #if PetscDefined(USE_DEBUG)
147 PetscCheck(dci->timerInUse, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Forgot to call PetscLogGpuTimeBegin()?");
148 dci->timerInUse = PETSC_FALSE;
149 #endif
150 PetscCallCXX(dci->queue.wait());
151 PetscCallCXX(duration = std::chrono::steady_clock::now() - dci->timeBegin);
152 PetscCallCXX(*elapsed = duration.count());
153 PetscFunctionReturn(PETSC_SUCCESS);
154 }
155
changeStreamType(PetscDeviceContext,PetscStreamType)156 static PetscErrorCode changeStreamType(PetscDeviceContext, PetscStreamType) noexcept { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); }
waitForContext(PetscDeviceContext,PetscDeviceContext)157 static PetscErrorCode waitForContext(PetscDeviceContext, PetscDeviceContext) noexcept { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); }
getBlasHandle(PetscDeviceContext,void *)158 static PetscErrorCode getBlasHandle(PetscDeviceContext, void *) noexcept { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); }
getSolverHandle(PetscDeviceContext,void *)159 static PetscErrorCode getSolverHandle(PetscDeviceContext, void *) noexcept { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); }
160 };
161
162 } // namespace impl
163
164 } // namespace sycl
165
166 } // namespace device
167
168 } // namespace Petsc
169
PetscDeviceContextCreate_SYCL(PetscDeviceContext dctx)170 PetscErrorCode PetscDeviceContextCreate_SYCL(PetscDeviceContext dctx)
171 {
172 using namespace Petsc::device::sycl::impl;
173
174 static const DeviceContext syclctx;
175
176 PetscFunctionBegin;
177 PetscCallCXX(dctx->data = new DeviceContext::PetscDeviceContext_SYCL());
178 dctx->ops[0] = syclctx.ops;
179 PetscFunctionReturn(PETSC_SUCCESS);
180 }
181