xref: /petsc/src/sys/objects/device/impls/cupm/cupmdevice.cxx (revision 1d1f2f2a4779ec582f0d66735516ea773846f70c)
1 #include "../../interface/cupmdevice.hpp"
2 #include <algorithm>
3 #include <csetjmp> // for cuda mpi awareness
4 #include <csignal> // SIGSEGV
5 #include <iterator>
6 #include <type_traits>
7 
8 #if PetscDefined(USE_LOG)
9 PETSC_INTERN PetscErrorCode PetscLogInitialize(void);
10 #else
11 #define PetscLogInitialize() 0
12 #endif
13 
14 namespace Petsc {
15 
16 namespace Device {
17 
18 namespace CUPM {
19 
20 // internal "impls" class for CUPMDevice. Each instance represents a single cupm device
21 template <DeviceType T>
22 class Device<T>::DeviceInternal {
23   const int        id_;
24   bool             devInitialized_ = false;
25   cupmDeviceProp_t dprop_; // cudaDeviceProp appears to be an actual struct, i.e. you can't
26                            // initialize it with nullptr or NULL (i've tried)
27 
28   PETSC_CXX_COMPAT_DECL(bool CUPMAwareMPI_());
29 
30 public:
31   // default constructor
32   explicit constexpr DeviceInternal(int dev) noexcept : id_(dev) { }
33 
34   // gather all relevant information for a particular device, a cupmDeviceProp_t is
35   // usually sufficient here
36   PETSC_NODISCARD PetscErrorCode initialize() noexcept;
37   PETSC_NODISCARD PetscErrorCode configure() noexcept;
38   PETSC_NODISCARD PetscErrorCode view(PetscViewer) const noexcept;
39   PETSC_NODISCARD PetscErrorCode finalize() noexcept;
40 
41   PETSC_NODISCARD auto id() const -> decltype(id_) { return id_; }
42   PETSC_NODISCARD auto initialized() const -> decltype(devInitialized_) { return devInitialized_; }
43   PETSC_NODISCARD auto prop() const -> const decltype(dprop_) & { return dprop_; }
44 
45   // factory
46   PETSC_CXX_COMPAT_DECL(std::unique_ptr<DeviceInternal> makeDevice(int i)) { return std::unique_ptr<DeviceInternal>(new DeviceInternal(i)); }
47 };
48 
49 // the goal here is simply to get the cupm backend to create its context, not to do any type of
50 // modification of it, or create objects (since these may be affected by subsequent
51 // configuration changes)
52 template <DeviceType T>
53 PetscErrorCode Device<T>::DeviceInternal::initialize() noexcept {
54   PetscFunctionBegin;
55   if (devInitialized_) PetscFunctionReturn(0);
56   devInitialized_ = true;
57   // need to do this BEFORE device has been set, although if the user
58   // has already done this then we just ignore it
59   if (cupmSetDeviceFlags(cupmDeviceMapHost) == cupmErrorSetOnActiveProcess) {
60     // reset the error if it was cupmErrorSetOnActiveProcess
61     const auto PETSC_UNUSED unused = cupmGetLastError();
62   } else {
63     PetscCallCUPM(cupmGetLastError());
64   }
65   // cuda 5.0+ will create a context when cupmSetDevice is called
66   if (cupmSetDevice(id_) != cupmErrorDeviceAlreadyInUse) PetscCallCUPM(cupmGetLastError());
67   // forces cuda < 5.0 to initialize a context
68   PetscCallCUPM(cupmFree(nullptr));
69   // where is this variable defined and when is it set? who knows! but it is defined and set
70   // at this point. either way, each device must make this check since I guess MPI might not be
71   // aware of all of them?
72   if (use_gpu_aware_mpi) {
73     // For OpenMPI, we could do a compile time check with
74     // "defined(PETSC_HAVE_OMPI_MAJOR_VERSION) && defined(MPIX_CUDA_AWARE_SUPPORT) &&
75     // MPIX_CUDA_AWARE_SUPPORT" to see if it is CUDA-aware. However, recent versions of IBM
76     // Spectrum MPI (e.g., 10.3.1) on Summit meet above conditions, but one has to use jsrun
77     // --smpiargs=-gpu to really enable GPU-aware MPI. So we do the check at runtime with a
78     // code that works only with GPU-aware MPI.
79     if (PetscUnlikely(!CUPMAwareMPI_())) {
80       (*PetscErrorPrintf)("PETSc is configured with GPU support, but your MPI is not GPU-aware. For better performance, please use a GPU-aware MPI.\n");
81       (*PetscErrorPrintf)("If you do not care, add option -use_gpu_aware_mpi 0. To not see the message again, add the option to your .petscrc, OR add it to the env var PETSC_OPTIONS.\n");
82       (*PetscErrorPrintf)("If you do care, for IBM Spectrum MPI on OLCF Summit, you may need jsrun --smpiargs=-gpu.\n");
83       (*PetscErrorPrintf)("For OpenMPI, you need to configure it --with-cuda (https://www.open-mpi.org/faq/?category=buildcuda)\n");
84       (*PetscErrorPrintf)("For MVAPICH2-GDR, you need to set MV2_USE_CUDA=1 (http://mvapich.cse.ohio-state.edu/userguide/gdr/)\n");
85       (*PetscErrorPrintf)("For Cray-MPICH, you need to set MPICH_RDMA_ENABLED_CUDA=1 (https://www.olcf.ornl.gov/tutorials/gpudirect-mpich-enabled-cuda/)\n");
86       PETSCABORT(PETSC_COMM_SELF, PETSC_ERR_LIB);
87     }
88   }
89   PetscFunctionReturn(0);
90 }
91 
92 template <DeviceType T>
93 PetscErrorCode Device<T>::DeviceInternal::configure() noexcept {
94   PetscFunctionBegin;
95   PetscAssert(devInitialized_, PETSC_COMM_SELF, PETSC_ERR_COR, "Device %d being configured before it was initialized", id_);
96   // why on EARTH nvidia insists on making otherwise informational states into
97   // fully-fledged error codes is beyond me. Why couldn't a pointer to bool argument have
98   // sufficed?!?!?!
99   if (cupmSetDevice(id_) != cupmErrorDeviceAlreadyInUse) PetscCallCUPM(cupmGetLastError());
100   // need to update the device properties
101   PetscCallCUPM(cupmGetDeviceProperties(&dprop_, id_));
102   PetscCall(PetscInfo(nullptr, "Configured device %d\n", id_));
103   PetscFunctionReturn(0);
104 }
105 
106 template <DeviceType T>
107 PetscErrorCode Device<T>::DeviceInternal::view(PetscViewer viewer) const noexcept {
108   PetscBool iascii;
109 
110   PetscFunctionBegin;
111   PetscAssert(devInitialized_, PETSC_COMM_SELF, PETSC_ERR_COR, "Device %d being viewed before it was initialized or configured", id_);
112   PetscCall(PetscObjectTypeCompare(PetscObjectCast(viewer), PETSCVIEWERASCII, &iascii));
113   if (iascii) {
114     MPI_Comm    comm;
115     PetscMPIInt rank;
116     PetscViewer sviewer;
117 
118     PetscCall(PetscObjectGetComm(PetscObjectCast(viewer), &comm));
119     PetscCallMPI(MPI_Comm_rank(comm, &rank));
120     PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
121     PetscCall(PetscViewerASCIIPrintf(sviewer, "[%d] device %d: %s\n", rank, id_, dprop_.name));
122     PetscCall(PetscViewerASCIIPushTab(sviewer));
123     PetscCall(PetscViewerASCIIPrintf(sviewer, "Compute capability: %d.%d\n", dprop_.major, dprop_.minor));
124     PetscCall(PetscViewerASCIIPrintf(sviewer, "Multiprocessor Count: %d\n", dprop_.multiProcessorCount));
125     PetscCall(PetscViewerASCIIPrintf(sviewer, "Maximum Grid Dimensions: %d x %d x %d\n", dprop_.maxGridSize[0], dprop_.maxGridSize[1], dprop_.maxGridSize[2]));
126     PetscCall(PetscViewerASCIIPrintf(sviewer, "Maximum Block Dimensions: %d x %d x %d\n", dprop_.maxThreadsDim[0], dprop_.maxThreadsDim[1], dprop_.maxThreadsDim[2]));
127     PetscCall(PetscViewerASCIIPrintf(sviewer, "Maximum Threads Per Block: %d\n", dprop_.maxThreadsPerBlock));
128     PetscCall(PetscViewerASCIIPrintf(sviewer, "Warp Size: %d\n", dprop_.warpSize));
129     PetscCall(PetscViewerASCIIPrintf(sviewer, "Total Global Memory (bytes): %zu\n", dprop_.totalGlobalMem));
130     PetscCall(PetscViewerASCIIPrintf(sviewer, "Total Constant Memory (bytes): %zu\n", dprop_.totalConstMem));
131     PetscCall(PetscViewerASCIIPrintf(sviewer, "Shared Memory Per Block (bytes): %zu\n", dprop_.sharedMemPerBlock));
132     PetscCall(PetscViewerASCIIPrintf(sviewer, "Multiprocessor Clock Rate (KHz): %d\n", dprop_.clockRate));
133     PetscCall(PetscViewerASCIIPrintf(sviewer, "Memory Clock Rate (KHz): %d\n", dprop_.memoryClockRate));
134     PetscCall(PetscViewerASCIIPrintf(sviewer, "Memory Bus Width (bits): %d\n", dprop_.memoryBusWidth));
135     PetscCall(PetscViewerASCIIPrintf(sviewer, "Peak Memory Bandwidth (GB/s): %f\n", 2.0 * dprop_.memoryClockRate * (dprop_.memoryBusWidth / 8) / 1.0e6));
136     PetscCall(PetscViewerASCIIPrintf(sviewer, "Can map host memory: %s\n", dprop_.canMapHostMemory ? "PETSC_TRUE" : "PETSC_FALSE"));
137     PetscCall(PetscViewerASCIIPrintf(sviewer, "Can execute multiple kernels concurrently: %s\n", dprop_.concurrentKernels ? "PETSC_TRUE" : "PETSC_FALSE"));
138     PetscCall(PetscViewerASCIIPopTab(sviewer));
139     PetscCall(PetscViewerFlush(sviewer));
140     PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
141     PetscCall(PetscViewerFlush(viewer));
142   }
143   PetscFunctionReturn(0);
144 }
145 
146 static std::jmp_buf cupmMPIAwareJumpBuffer;
147 static bool         cupmMPIAwareJumpBufferSet;
148 
149 // godspeed to anyone that attempts to call this function
150 void SilenceVariableIsNotNeededAndWillNotBeEmittedWarning_ThisFunctionShouldNeverBeCalled() {
151   PETSCABORT(MPI_COMM_NULL, INT_MAX);
152   if (cupmMPIAwareJumpBufferSet) (void)cupmMPIAwareJumpBuffer;
153 }
154 
155 #define CHKCUPMAWARE(...) \
156   do { \
157     cupmError_t cerr_ = __VA_ARGS__; \
158     if (PetscUnlikely(cerr_ != cupmSuccess)) return false; \
159   } while (0)
160 
161 template <DeviceType T>
162 PETSC_CXX_COMPAT_DEFN(bool Device<T>::DeviceInternal::CUPMAwareMPI_()) {
163   constexpr int  bufSize           = 2;
164   constexpr int  hbuf[bufSize]     = {1, 0};
165   int           *dbuf              = nullptr;
166   constexpr auto bytes             = bufSize * sizeof(*dbuf);
167   auto           awareness         = false;
168   const auto     cupmSignalHandler = [](int signal, void *ptr) -> PetscErrorCode {
169     if ((signal == SIGSEGV) && cupmMPIAwareJumpBufferSet) std::longjmp(cupmMPIAwareJumpBuffer, 1);
170     return PetscSignalHandlerDefault(signal, ptr);
171   };
172 
173   PetscFunctionBegin;
174   CHKCUPMAWARE(cupmMalloc(reinterpret_cast<void **>(&dbuf), bytes));
175   CHKCUPMAWARE(cupmMemcpy(dbuf, hbuf, bytes, cupmMemcpyHostToDevice));
176   PetscCallAbort(PETSC_COMM_SELF, PetscPushSignalHandler(cupmSignalHandler, nullptr));
177   cupmMPIAwareJumpBufferSet = true;
178   if (setjmp(cupmMPIAwareJumpBuffer)) {
179     // if a segv was triggered in the MPI_Allreduce below, it is very likely due to MPI not
180     // being GPU-aware
181     awareness = false;
182     // control flow up until this point:
183     // 1. CUPMDevice<T>::CUPMDeviceInternal::MPICUPMAware__()
184     // 2. MPI_Allreduce
185     // 3. SIGSEGV
186     // 4. PetscSignalHandler_Private
187     // 5. cupmSignalHandler (lambda function)
188     // 6. here
189     // PetscSignalHandler_Private starts with PetscFunctionBegin and is pushed onto the stack
190     // so we must undo this. This would be most naturally done in cupmSignalHandler, however
191     // the C/C++ standard dictates:
192     //
193     // After invoking longjmp(), non-volatile-qualified local objects should not be accessed if
194     // their values could have changed since the invocation of setjmp(). Their value in this
195     // case is considered indeterminate, and accessing them is undefined behavior.
196     //
197     // so for safety (since we don't know what PetscStackPop may try to read/declare) we do it
198     // outside of the longjmp control flow
199     PetscStackPop;
200   } else if (!MPI_Allreduce(dbuf, dbuf + 1, 1, MPI_INT, MPI_SUM, PETSC_COMM_SELF)) awareness = true;
201   cupmMPIAwareJumpBufferSet = false;
202   PetscCallAbort(PETSC_COMM_SELF, PetscPopSignalHandler());
203   CHKCUPMAWARE(cupmFree(dbuf));
204   PetscFunctionReturn(awareness);
205 }
206 
207 #undef CHKCUPMAWARE
208 
209 template <DeviceType T>
210 PetscErrorCode Device<T>::DeviceInternal::finalize() noexcept {
211   PetscFunctionBegin;
212   devInitialized_ = false;
213   PetscFunctionReturn(0);
214 }
215 
216 template <DeviceType T>
217 PetscErrorCode Device<T>::finalize_() noexcept {
218   PetscFunctionBegin;
219   if (!initialized_) PetscFunctionReturn(0);
220   for (auto &&device : devices_) {
221     if (device) {
222       PetscCall(device->finalize());
223       device.reset();
224     }
225   }
226   defaultDevice_ = PETSC_CUPM_DEVICE_NONE; // disabled by default
227   initialized_   = false;
228   PetscFunctionReturn(0);
229 }
230 
231 // these functions should be named identically to the option they produce where "CUPMTYPE" and
232 // "cupmtype" are the uppercase and lowercase string versions of the cupm backend respectively
233 template <DeviceType T>
234 PETSC_CXX_COMPAT_DECL(PETSC_CONSTEXPR_14 const char *PetscDevice_CUPMTYPE_Options()) {
235   switch (T) {
236   case DeviceType::CUDA: return "PetscDevice CUDA Options";
237   case DeviceType::HIP: return "PetscDevice HIP Options";
238   }
239   PetscUnreachable();
240   return "PETSC_ERROR_PLIB";
241 }
242 
243 template <DeviceType T>
244 PETSC_CXX_COMPAT_DECL(PETSC_CONSTEXPR_14 const char *device_enable_cupmtype()) {
245   switch (T) {
246   case DeviceType::CUDA: return "-device_enable_cuda";
247   case DeviceType::HIP: return "-device_enable_hip";
248   }
249   PetscUnreachable();
250   return "PETSC_ERROR_PLIB";
251 }
252 
253 template <DeviceType T>
254 PETSC_CXX_COMPAT_DECL(PETSC_CONSTEXPR_14 const char *device_select_cupmtype()) {
255   switch (T) {
256   case DeviceType::CUDA: return "-device_select_cuda";
257   case DeviceType::HIP: return "-device_select_hip";
258   }
259   PetscUnreachable();
260   return "PETSC_ERROR_PLIB";
261 }
262 
263 template <DeviceType T>
264 PETSC_CXX_COMPAT_DECL(PETSC_CONSTEXPR_14 const char *device_view_cupmtype()) {
265   switch (T) {
266   case DeviceType::CUDA: return "-device_view_cuda";
267   case DeviceType::HIP: return "-device_view_hip";
268   }
269   PetscUnreachable();
270   return "PETSC_ERROR_PLIB";
271 }
272 
273 template <DeviceType T>
274 PETSC_CXX_COMPAT_DECL(PETSC_CONSTEXPR_14 const char *CUPM_VISIBLE_DEVICES()) {
275   switch (T) {
276   case DeviceType::CUDA: return "CUDA_VISIBLE_DEVICES";
277   case DeviceType::HIP: return "HIP_VISIBLE_DEVICES";
278   }
279   PetscUnreachable();
280   return "PETSC_ERROR_PLIB";
281 }
282 
283 template <DeviceType T>
284 PetscErrorCode Device<T>::initialize(MPI_Comm comm, PetscInt *defaultDeviceId, PetscDeviceInitType *defaultInitType) noexcept {
285   PetscInt  initTypeCUPM = *defaultInitType, id = *defaultDeviceId;
286   PetscBool view = PETSC_FALSE, flg;
287   int       ndev = 0;
288 
289   PetscFunctionBegin;
290   if (initialized_) PetscFunctionReturn(0);
291   initialized_ = true;
292   PetscCall(PetscRegisterFinalize(finalize_));
293 
294   {
295     // the functions to populate the command line strings are named after the string they return
296     PetscOptionsBegin(comm, nullptr, PetscDevice_CUPMTYPE_Options<T>(), "Sys");
297     PetscCall(PetscOptionsEList(device_enable_cupmtype<T>(), "How (or whether) to initialize a device", "CUPMDevice<CUPMDeviceType>::initialize()", PetscDeviceInitTypes, 3, PetscDeviceInitTypes[initTypeCUPM], &initTypeCUPM, nullptr));
298     PetscCall(PetscOptionsRangeInt(device_select_cupmtype<T>(), "Which device to use. Pass " PetscStringize(PETSC_DECIDE) " to have PETSc decide or (given they exist) [0-NUM_DEVICE) for a specific device", "PetscDeviceCreate", id, &id, nullptr, PETSC_DECIDE, std::numeric_limits<decltype(defaultDevice_)>::max()));
299     PetscCall(PetscOptionsBool(device_view_cupmtype<T>(), "Display device information and assignments (forces eager initialization)", nullptr, view, &view, &flg));
300     PetscOptionsEnd();
301   }
302 
303   if (initTypeCUPM == PETSC_DEVICE_INIT_NONE) {
304     id = PETSC_CUPM_DEVICE_NONE;
305   } else if (auto cerr = cupmGetDeviceCount(&ndev)) {
306     auto PETSC_UNUSED ignored = cupmGetLastError();
307     // we won't be initializing anything anyways
308     initTypeCUPM              = PETSC_DEVICE_INIT_NONE;
309     // save the error code for later
310     id                        = -static_cast<decltype(id)>(cerr);
311 
312     if (PetscUnlikely((initTypeCUPM == PETSC_DEVICE_INIT_EAGER) || (view && flg))) {
313       const auto name    = cupmGetErrorName(cerr);
314       const auto desc    = cupmGetErrorString(cerr);
315       const auto backend = cupmName();
316       SETERRQ(comm, PETSC_ERR_USER_INPUT, "Cannot eagerly initialize %s, as doing so results in %s error %d (%s) : %s", backend, backend, static_cast<PetscErrorCode>(cerr), name, desc);
317     }
318   }
319 
320   // check again for init type, since the device count may have changed it
321   if (initTypeCUPM == PETSC_DEVICE_INIT_NONE) {
322     // id < 0 (excluding PETSC_DECIDE) indicates an error has occurred during setup
323     if ((id > 0) || (id == PETSC_DECIDE)) id = PETSC_CUPM_DEVICE_NONE;
324   } else {
325     PetscCall(PetscDeviceCheckDeviceCount_Internal(ndev));
326     if (id == PETSC_DECIDE) {
327       if (ndev) {
328         PetscMPIInt rank;
329 
330         PetscCallMPI(MPI_Comm_rank(comm, &rank));
331         id = rank % ndev;
332       } else id = 0;
333     }
334     view = static_cast<decltype(view)>(view && flg);
335     if (view) initTypeCUPM = PETSC_DEVICE_INIT_EAGER;
336   }
337 
338   static_assert(std::is_same<PetscMPIInt, decltype(defaultDevice_)>::value, "");
339   // id is PetscInt, _defaultDevice is int
340   PetscCall(PetscMPIIntCast(id, &defaultDevice_));
341   if (initTypeCUPM == PETSC_DEVICE_INIT_EAGER) {
342     devices_[defaultDevice_] = DeviceInternal::makeDevice(defaultDevice_);
343     PetscCall(devices_[defaultDevice_]->initialize());
344     PetscCall(devices_[defaultDevice_]->configure());
345     if (view) {
346       PetscViewer vwr;
347 
348       PetscCall(PetscLogInitialize());
349       PetscCall(PetscViewerASCIIGetStdout(comm, &vwr));
350       PetscCall(devices_[defaultDevice_]->view(vwr));
351     }
352   }
353 
354   // record the results of the initialization
355   *defaultInitType = static_cast<PetscDeviceInitType>(initTypeCUPM);
356   *defaultDeviceId = id;
357   PetscFunctionReturn(0);
358 }
359 
360 template <DeviceType T>
361 PetscErrorCode Device<T>::getDevice(PetscDevice device, PetscInt id) const noexcept {
362   const auto cerr = static_cast<cupmError_t>(-defaultDevice_);
363 
364   PetscFunctionBegin;
365   PetscCheck(defaultDevice_ != PETSC_CUPM_DEVICE_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Trying to retrieve a %s PetscDevice when it has been disabled", cupmName());
366   PetscCheck(defaultDevice_ >= 0, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot lazily initialize PetscDevice: %s error %d (%s) : %s", cupmName(), static_cast<PetscErrorCode>(cerr), cupmGetErrorName(cerr), cupmGetErrorString(cerr));
367   if (id == PETSC_DECIDE) id = defaultDevice_;
368   PetscAssert(static_cast<decltype(devices_.size())>(id) < devices_.size(), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Only supports %zu number of devices but trying to get device with id %" PetscInt_FMT, devices_.size(), id);
369   if (devices_[id]) {
370     PetscAssert(id == devices_[id]->id(), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Entry %" PetscInt_FMT " contains device with mismatching id %d", id, devices_[id]->id());
371   } else devices_[id] = DeviceInternal::makeDevice(id);
372   PetscCall(devices_[id]->initialize());
373   device->deviceId           = devices_[id]->id(); // technically id = _devices[id]->_id here
374   device->ops->createcontext = create_;
375   device->ops->configure     = this->configureDevice;
376   device->ops->view          = this->viewDevice;
377   PetscFunctionReturn(0);
378 }
379 
380 template <DeviceType T>
381 PetscErrorCode Device<T>::configureDevice(PetscDevice device) noexcept {
382   PetscFunctionBegin;
383   PetscCall(devices_[device->deviceId]->configure());
384   PetscFunctionReturn(0);
385 }
386 
387 template <DeviceType T>
388 PetscErrorCode Device<T>::viewDevice(PetscDevice device, PetscViewer viewer) noexcept {
389   PetscFunctionBegin;
390   // now this __shouldn't__ reconfigure the device, but there is a petscinfo call to indicate
391   // it is being reconfigured
392   PetscCall(devices_[device->deviceId]->configure());
393   PetscCall(devices_[device->deviceId]->view(viewer));
394   PetscFunctionReturn(0);
395 }
396 
397 // explicitly instantiate the classes
398 #if PetscDefined(HAVE_CUDA)
399 template class Device<DeviceType::CUDA>;
400 #endif
401 #if PetscDefined(HAVE_HIP)
402 template class Device<DeviceType::HIP>;
403 #endif
404 
405 } // namespace CUPM
406 
407 } // namespace Device
408 
409 } // namespace Petsc
410