1 #include <petsc/private/cpp/memory.hpp> // make_unique
2
3 #include "cupmdevice.hpp"
4
5 #include <algorithm>
6 #include <csetjmp> // for cuda mpi awareness
7 #include <csignal> // SIGSEGV
8 #include <iterator>
9 #include <type_traits>
10
11 namespace Petsc
12 {
13
14 namespace device
15 {
16
17 namespace cupm
18 {
19
20 // internal "impls" class for CUPMDevice. Each instance represents a single cupm device
// internal "impls" class for CUPMDevice. Each instance represents a single cupm device
template <DeviceType T>
class Device<T>::DeviceInternal {
  const int id_;                  // cupm device ordinal this instance wraps
  bool devInitialized_ = false;   // set by initialize(); guards against re-initialization
  cupmDeviceProp_t dprop_{}; // cudaDeviceProp appears to be an actual struct, i.e. you can't
                             // initialize it with nullptr or NULL (i've tried)

  // runtime probe for GPU-aware MPI: runs an MPI_Allreduce on device memory under a
  // SIGSEGV trap and reports the verdict through the bool out-parameter
  static PetscErrorCode CUPMAwareMPI_(bool *) noexcept;

public:
  // default constructor
  explicit constexpr DeviceInternal(int dev) noexcept : id_(dev) { }

  // gather all relevant information for a particular device, a cupmDeviceProp_t is
  // usually sufficient here
  PetscErrorCode initialize() noexcept;
  PetscErrorCode configure() noexcept;
  PetscErrorCode view(PetscViewer) const noexcept;
  PetscErrorCode getattribute(PetscDeviceAttribute, void *) const noexcept;
  PetscErrorCode shutdown() noexcept;

  // trivial read-only accessors; prop() is only meaningful after configure() has run
  PETSC_NODISCARD auto id() const -> decltype(id_) { return id_; }
  PETSC_NODISCARD auto initialized() const -> decltype(devInitialized_) { return devInitialized_; }
  PETSC_NODISCARD auto prop() const -> const decltype(dprop_) & { return dprop_; }
};
46
47 // the goal here is simply to get the cupm backend to create its context, not to do any type of
48 // modification of it, or create objects (since these may be affected by subsequent
49 // configuration changes)
// the goal here is simply to get the cupm backend to create its context, not to do any type of
// modification of it, or create objects (since these may be affected by subsequent
// configuration changes)
template <DeviceType T>
PetscErrorCode Device<T>::DeviceInternal::initialize() noexcept
{
  PetscFunctionBegin;
  if (initialized()) PetscFunctionReturn(PETSC_SUCCESS);
  devInitialized_ = true;
  // need to do this BEFORE device has been set, although if the user
  // has already done this then we just ignore it
  if (cupmSetDeviceFlags(cupmDeviceMapHost) == cupmErrorSetOnActiveProcess) {
    // reset the error if it was cupmErrorSetOnActiveProcess
    const auto PETSC_UNUSED unused = cupmGetLastError();
  } else PetscCallCUPM(cupmGetLastError());
  // cuda 5.0+ will create a context when cupmSetDevice is called
  if (cupmSetDevice(id()) != cupmErrorDeviceAlreadyInUse) PetscCallCUPM(cupmGetLastError());
  // and in case it doesn't, explicitly call init here
  PetscCallCUPM(cupmInit(0));
#if PetscDefined(HAVE_CUDA)
  // nvmlInit() deprecated in NVML 5.319
  PetscCallNVML(nvmlInit_v2());
#endif
  // where is this variable defined and when is it set? who knows! but it is defined and set
  // at this point. either way, each device must make this check since I guess MPI might not be
  // aware of all of them?
  if (use_gpu_aware_mpi) {
    bool aware;

    // Even if the MPI implementation was configured as GPU-aware, it might still need extra
    // settings to enable it at runtime. So we do the check at runtime with code that works
    // only with a GPU-aware MPI.
    PetscCall(CUPMAwareMPI_(&aware));
    // abort (rather than error) so the message is printed exactly once, before MPI teardown
    if (PetscUnlikely(!aware)) {
      PetscCall((*PetscErrorPrintf)("PETSc is configured with GPU support, but your MPI is not GPU-aware. For better performance, please use a GPU-aware MPI.\n"));
      PetscCall((*PetscErrorPrintf)("If you do not care, add option -use_gpu_aware_mpi 0. To not see the message again, add the option to your .petscrc, OR add it to the env var PETSC_OPTIONS.\n"));
      PetscCall((*PetscErrorPrintf)("For Open MPI, you need to configure it with CUDA, ROCm or GPU-aware UCX (https://docs.open-mpi.org/en/main/tuning-apps/accelerators/index.html)\n"));
      PetscCall((*PetscErrorPrintf)("If you already configured it with GPU-aware UCX, you may need 'mpiexec -n <np> --mca pml ucx' or export 'OMPI_MCA_pml=\"ucx\"' to use it.\n"));
      PetscCall((*PetscErrorPrintf)("For MVAPICH2-GDR, you need to set MV2_USE_CUDA=1 (http://mvapich.cse.ohio-state.edu/userguide/gdr/)\n"));
      PetscCall((*PetscErrorPrintf)("For Cray-MPICH, export MPICH_GPU_SUPPORT_ENABLED=1 (see its 'man mpi'); for MPICH, export MPIR_CVAR_ENABLE_GPU=1\n"));
      PETSCABORT(PETSC_COMM_SELF, PETSC_ERR_LIB);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
91
92 template <DeviceType T>
configure()93 PetscErrorCode Device<T>::DeviceInternal::configure() noexcept
94 {
95 PetscFunctionBegin;
96 PetscAssert(initialized(), PETSC_COMM_SELF, PETSC_ERR_COR, "Device %d being configured before it was initialized", id());
97 // why on EARTH nvidia insists on making otherwise informational states into
98 // fully-fledged error codes is beyond me. Why couldn't a pointer to bool argument have
99 // sufficed?!?!?!
100 if (cupmSetDevice(id_) != cupmErrorDeviceAlreadyInUse) PetscCallCUPM(cupmGetLastError());
101 // need to update the device properties
102 PetscCallCUPM(cupmGetDeviceProperties(&dprop_, id_));
103 PetscDeviceCUPMRuntimeArch = dprop_.major * 10 + dprop_.minor;
104 PetscCall(PetscInfo(nullptr, "Configured device %d\n", id_));
105 PetscFunctionReturn(PETSC_SUCCESS);
106 }
107
// pretty-print the device properties to an ASCII viewer; silently does nothing for other
// viewer types, and prints nothing at all in CI mode
template <DeviceType T>
PetscErrorCode Device<T>::DeviceInternal::view(PetscViewer viewer) const noexcept
{
  PetscBool isascii;

  PetscFunctionBegin;
  PetscAssert(initialized(), PETSC_COMM_SELF, PETSC_ERR_COR, "Device %d being viewed before it was initialized or configured", id());
  // we don't print device-specific info in CI-mode
  if (PetscUnlikely(PetscCIEnabled)) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(viewer), PETSCVIEWERASCII, &isascii));
  if (isascii) {
    MPI_Comm    comm;
    PetscMPIInt rank;
    PetscViewer sviewer;

    // clock rates are not part of cupmDeviceProp_t, so query them as attributes
    int clock, memclock;
    PetscCallCUPM(cupmDeviceGetAttribute(&clock, cupmDevAttrClockRate, id_));
    PetscCallCUPM(cupmDeviceGetAttribute(&memclock, cupmDevAttrMemoryClockRate, id_));

    // use a subviewer so each rank prints its own device info sequentially
    PetscCall(PetscObjectGetComm(PetscObjectCast(viewer), &comm));
    PetscCallMPI(MPI_Comm_rank(comm, &rank));
    PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "[%d] name: %s\n", rank, dprop_.name));
    PetscCall(PetscViewerASCIIPushTab(sviewer));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Compute capability: %d.%d\n", dprop_.major, dprop_.minor));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Multiprocessor Count: %d\n", dprop_.multiProcessorCount));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Maximum Grid Dimensions: %d x %d x %d\n", dprop_.maxGridSize[0], dprop_.maxGridSize[1], dprop_.maxGridSize[2]));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Maximum Block Dimensions: %d x %d x %d\n", dprop_.maxThreadsDim[0], dprop_.maxThreadsDim[1], dprop_.maxThreadsDim[2]));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Maximum Threads Per Block: %d\n", dprop_.maxThreadsPerBlock));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Warp Size: %d\n", dprop_.warpSize));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Total Global Memory (bytes): %zu\n", dprop_.totalGlobalMem));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Total Constant Memory (bytes): %zu\n", dprop_.totalConstMem));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Shared Memory Per Block (bytes): %zu\n", dprop_.sharedMemPerBlock));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Multiprocessor Clock Rate (kHz): %d\n", clock));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Memory Clock Rate (kHz): %d\n", memclock));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Memory Bus Width (bits): %d\n", dprop_.memoryBusWidth));
    // bandwidth = 2 (DDR) * clock (kHz) * bus width (bytes), scaled to GB/s
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Peak Memory Bandwidth (GB/s): %f\n", 2.0 * memclock * (dprop_.memoryBusWidth / 8) / 1.0e6));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Can map host memory: %s\n", dprop_.canMapHostMemory ? "PETSC_TRUE" : "PETSC_FALSE"));
    PetscCall(PetscViewerASCIIPrintf(sviewer, "Can execute multiple kernels concurrently: %s\n", dprop_.concurrentKernels ? "PETSC_TRUE" : "PETSC_FALSE"));
    PetscCall(PetscViewerASCIIPopTab(sviewer));
    PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
152
153 template <DeviceType T>
getattribute(PetscDeviceAttribute attr,void * value) const154 PetscErrorCode Device<T>::DeviceInternal::getattribute(PetscDeviceAttribute attr, void *value) const noexcept
155 {
156 PetscFunctionBegin;
157 PetscAssert(initialized(), PETSC_COMM_SELF, PETSC_ERR_COR, "Device %d was not initialized", id());
158 switch (attr) {
159 case PETSC_DEVICE_ATTR_SIZE_T_SHARED_MEM_PER_BLOCK:
160 *static_cast<std::size_t *>(value) = prop().sharedMemPerBlock;
161 case PETSC_DEVICE_ATTR_MAX:
162 break;
163 }
164 PetscFunctionReturn(PETSC_SUCCESS);
165 }
166
167 template <DeviceType T>
shutdown()168 PetscErrorCode Device<T>::DeviceInternal::shutdown() noexcept
169 {
170 PetscFunctionBegin;
171 if (!initialized()) PetscFunctionReturn(PETSC_SUCCESS);
172 #if PetscDefined(HAVE_CUDA)
173 PetscCallNVML(nvmlShutdown());
174 #endif
175 PetscFunctionReturn(PETSC_SUCCESS);
176 }
177
// state shared with the SIGSEGV handler installed by CUPMAwareMPI_(): a non-GPU-aware MPI
// segfaults when handed a device pointer, and longjmp-ing out of the handler lets us treat
// that as "not aware" instead of crashing
static std::jmp_buf cupmMPIAwareJumpBuffer;
static bool cupmMPIAwareJumpBufferSet;

// godspeed to anyone that attempts to call this function
//
// never meant to be executed (note the unconditional PETSCABORT); it exists solely to
// odr-use the two globals above so they are emitted even in builds where CUPMAwareMPI_()
// is not instantiated, silencing unused-variable warnings
void SilenceVariableIsNotNeededAndWillNotBeEmittedWarning_ThisFunctionShouldNeverBeCalled()
{
  PETSCABORT(MPI_COMM_NULL, (PetscErrorCode)INT_MAX);
  if (cupmMPIAwareJumpBufferSet) (void)cupmMPIAwareJumpBuffer;
}
187
// probe whether MPI can operate directly on device memory: run a self-reduce on a device
// buffer while trapping SIGSEGV via setjmp/longjmp. If MPI touches the device pointer from
// the host it segfaults, we longjmp back here, and *awareness stays false.
template <DeviceType T>
PetscErrorCode Device<T>::DeviceInternal::CUPMAwareMPI_(bool *awareness) noexcept
{
  constexpr int hbuf[]          = {1, 0};
  int          *dbuf            = nullptr;
  // handler must be non-capturing so it converts to the plain function pointer that
  // PetscPushSignalHandler() expects; it talks to this function via the file-scope globals
  const auto cupmSignalHandler = [](int signal, void *ptr) -> PetscErrorCode {
    if ((signal == SIGSEGV) && cupmMPIAwareJumpBufferSet) std::longjmp(cupmMPIAwareJumpBuffer, 1);
    return PetscSignalHandlerDefault(signal, ptr);
  };

  PetscFunctionBegin;
  *awareness = false;
  PetscCallCUPM(cupmMalloc(reinterpret_cast<void **>(&dbuf), sizeof(hbuf)));
  PetscCallCUPM(cupmMemcpy(dbuf, hbuf, sizeof(hbuf), cupmMemcpyHostToDevice));
  // make sure the copy has landed before MPI touches the buffer
  PetscCallCUPM(cupmDeviceSynchronize());
  PetscCall(PetscPushSignalHandler(cupmSignalHandler, nullptr));
  cupmMPIAwareJumpBufferSet = true;
  // direct return from setjmp (0) + MPI success (0) => the reduce worked on device memory;
  // a longjmp from the SIGSEGV handler re-enters setjmp with nonzero and skips the branch
  if (!setjmp(cupmMPIAwareJumpBuffer) && !MPI_Allreduce(dbuf, dbuf + 1, 1, MPI_INT, MPI_SUM, PETSC_COMM_SELF)) *awareness = true;
  cupmMPIAwareJumpBufferSet = false;
  PetscCall(PetscPopSignalHandler());
  PetscCallCUPM(cupmFree(dbuf));
  PetscFunctionReturn(PETSC_SUCCESS);
}
211
212 template <DeviceType T>
finalize_()213 PetscErrorCode Device<T>::finalize_() noexcept
214 {
215 PetscFunctionBegin;
216 if (PetscUnlikely(!initialized_)) PetscFunctionReturn(PETSC_SUCCESS);
217 for (auto &&device : devices_) {
218 if (device) PetscCall(device->shutdown());
219 device.reset();
220 }
221 defaultDevice_ = PETSC_CUPM_DEVICE_NONE; // disabled by default
222 initialized_ = false;
223 PetscFunctionReturn(PETSC_SUCCESS);
224 }
225
226 template <DeviceType T>
CUPM_VISIBLE_DEVICES()227 PETSC_NODISCARD static PETSC_CONSTEXPR_14 const char *CUPM_VISIBLE_DEVICES() noexcept
228 {
229 switch (T) {
230 case DeviceType::CUDA:
231 return "CUDA_VISIBLE_DEVICES";
232 case DeviceType::HIP:
233 return "HIP_VISIBLE_DEVICES";
234 }
235 PetscUnreachable();
236 return "PETSC_ERROR_PLIB";
237 }
238
239 /*
240 The default device ID is
241 MPI -- rank % number_local_devices
242 PyTorch -- getenv("LOCAL_RANK")
243 */
244 template <DeviceType T>
initialize(MPI_Comm comm,PetscInt * defaultDeviceId,PetscBool * defaultView,PetscDeviceInitType * defaultInitType)245 PetscErrorCode Device<T>::initialize(MPI_Comm comm, PetscInt *defaultDeviceId, PetscBool *defaultView, PetscDeviceInitType *defaultInitType) noexcept
246 {
247 auto initId = std::make_pair(*defaultDeviceId, PETSC_FALSE);
248 auto initView = std::make_pair(*defaultView, PETSC_FALSE);
249 auto initType = std::make_pair(*defaultInitType, PETSC_FALSE);
250 int ndev = 0;
251
252 PetscFunctionBegin;
253 if (initialized_) PetscFunctionReturn(PETSC_SUCCESS);
254 initialized_ = true;
255 PetscCall(PetscRegisterFinalize(finalize_));
256 PetscCall(base_type::PetscOptionDeviceAll(comm, initType, initId, initView));
257
258 if (initType.first == PETSC_DEVICE_INIT_NONE) {
259 initId.first = PETSC_CUPM_DEVICE_NONE;
260 } else if (const auto cerr = cupmGetDeviceCount(&ndev)) {
261 auto PETSC_UNUSED ignored = cupmGetLastError();
262
263 PetscCheck((initType.first != PETSC_DEVICE_INIT_EAGER) && !initView.first, comm, PETSC_ERR_USER_INPUT, "Cannot eagerly initialize %s, as doing so results in %s error %d (%s) : %s", cupmName(), cupmName(), static_cast<PetscErrorCode>(cerr), cupmGetErrorName(cerr), cupmGetErrorString(cerr));
264 // we won't be initializing anything anyways
265 initType.first = PETSC_DEVICE_INIT_NONE;
266 // save the error code for later
267 initId.first = -static_cast<decltype(initId.first)>(cerr);
268 }
269
270 // check again for init type, since the device count may have changed it
271 if (initType.first == PETSC_DEVICE_INIT_NONE) {
272 // id < 0 (excluding PETSC_DECIDE) indicates an error has occurred during setup
273 if ((initId.first > 0) || (initId.first == PETSC_DECIDE)) initId.first = PETSC_CUPM_DEVICE_NONE;
274 // initType overrides initView
275 initView.first = PETSC_FALSE;
276 } else {
277 PetscCall(PetscDeviceCheckDeviceCount_Internal(ndev));
278 if (initId.first == PETSC_DECIDE) {
279 if (ndev) {
280 /* TORCHELASTIC_RUN_ID is used as a proxy to determine if the current process was launched with torchrun */
281 char *pytorch_exists = (char *)getenv("TORCHELASTIC_RUN_ID");
282 char *pytorch_rank = (char *)getenv("LOCAL_RANK");
283
284 if (pytorch_exists && pytorch_rank) {
285 char *endptr;
286
287 initId.first = (PetscInt)strtol(pytorch_rank, &endptr, 10);
288 PetscCheck(initId.first < ndev, PETSC_COMM_SELF, PETSC_ERR_LIB, "PyTorch environmental variable LOCAL_RANK %s > number devices %d", pytorch_rank, ndev);
289 } else {
290 PetscMPIInt rank;
291
292 PetscCallMPI(MPI_Comm_rank(comm, &rank));
293 initId.first = rank % ndev;
294 }
295 } else initId.first = 0;
296 }
297 if (initView.first) initType.first = PETSC_DEVICE_INIT_EAGER;
298 }
299
300 static_assert(std::is_same<PetscMPIInt, decltype(defaultDevice_)>::value, "");
301 // initId.first is PetscInt, _defaultDevice is int
302 PetscCall(PetscMPIIntCast(initId.first, &defaultDevice_));
303 // record the results of the initialization
304 *defaultDeviceId = initId.first;
305 *defaultView = initView.first;
306 *defaultInitType = initType.first;
307 PetscFunctionReturn(PETSC_SUCCESS);
308 }
309
310 template <DeviceType T>
init_device_id_(PetscInt * inid) const311 PetscErrorCode Device<T>::init_device_id_(PetscInt *inid) const noexcept
312 {
313 const auto id = *inid == PETSC_DECIDE ? defaultDevice_ : (int)*inid;
314 const auto cerr = static_cast<cupmError_t>(-defaultDevice_);
315
316 PetscFunctionBegin;
317 PetscCheck(defaultDevice_ != PETSC_CUPM_DEVICE_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Trying to retrieve a %s PetscDevice when it has been disabled", cupmName());
318 PetscCheck(defaultDevice_ >= 0, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot lazily initialize PetscDevice: %s error %d (%s) : %s", cupmName(), static_cast<PetscErrorCode>(cerr), cupmGetErrorName(cerr), cupmGetErrorString(cerr));
319 PetscAssert(static_cast<decltype(devices_.size())>(id) < devices_.size(), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Only supports %zu number of devices but trying to get device with id %d", devices_.size(), id);
320
321 if (!devices_[id]) devices_[id] = util::make_unique<DeviceInternal>(id);
322 PetscAssert(id == devices_[id]->id(), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Entry %d contains device with mismatching id %d", id, devices_[id]->id());
323 PetscCall(devices_[id]->initialize());
324 *inid = id;
325 PetscFunctionReturn(PETSC_SUCCESS);
326 }
327
328 template <DeviceType T>
configure_device_(PetscDevice device)329 PetscErrorCode Device<T>::configure_device_(PetscDevice device) noexcept
330 {
331 PetscFunctionBegin;
332 PetscCall(devices_[device->deviceId]->configure());
333 PetscFunctionReturn(PETSC_SUCCESS);
334 }
335
336 template <DeviceType T>
view_device_(PetscDevice device,PetscViewer viewer)337 PetscErrorCode Device<T>::view_device_(PetscDevice device, PetscViewer viewer) noexcept
338 {
339 PetscFunctionBegin;
340 // now this __shouldn't__ reconfigure the device, but there is a petscinfo call to indicate
341 // it is being reconfigured
342 PetscCall(devices_[device->deviceId]->configure());
343 PetscCall(devices_[device->deviceId]->view(viewer));
344 PetscFunctionReturn(PETSC_SUCCESS);
345 }
346
347 template <DeviceType T>
get_attribute_(PetscInt id,PetscDeviceAttribute attr,void * value)348 PetscErrorCode Device<T>::get_attribute_(PetscInt id, PetscDeviceAttribute attr, void *value) noexcept
349 {
350 PetscFunctionBegin;
351 PetscCall(devices_[id]->getattribute(attr, value));
352 PetscFunctionReturn(PETSC_SUCCESS);
353 }
354
// explicitly instantiate the classes for each enabled backend so the template definitions
// above are emitted in this translation unit
#if PetscDefined(HAVE_CUDA)
template class Device<DeviceType::CUDA>;
#endif
#if PetscDefined(HAVE_HIP)
template class Device<DeviceType::HIP>;
#endif
362
363 } // namespace cupm
364
365 } // namespace device
366
367 } // namespace Petsc
368