xref: /petsc/src/sys/objects/device/interface/memory.cxx (revision 6016107252cfa03568230349a14202a384fbf0c0) !
1 #include <petsc/private/deviceimpl.h> /*I <petscdevice.h> I*/
2 
3 #include <petsc/private/cpp/register_finalize.hpp>
4 #include <petsc/private/cpp/array.hpp>
5 
6 #include <unordered_map>
7 #include <algorithm> // std::find_if
8 #include <cstring>   // std::memset
9 
// String table for PetscDeviceCopyMode. The first five entries are the names of the enum values
// (in enum order), followed by the enum type name, the common enum prefix and a terminating
// nullptr.
const char *const PetscDeviceCopyModes[] = {"host_to_host", "device_to_host", "host_to_device", "device_to_device", "auto", "PetscDeviceCopyMode", "PETSC_DEVICE_COPY_", nullptr};
// The table above is indexed by enum value, so pin the numeric value of every enumerator at
// compile time; reordering the enum without updating the table will fail to build.
static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_HTOH) == 0, "");
static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_DTOH) == 1, "");
static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_HTOD) == 2, "");
static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_DTOD) == 3, "");
static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_AUTO) == 4, "");
16 
17 // ==========================================================================================
18 // MemoryMap
19 //
20 // Since the pointers allocated via PetscDeviceAllocate() may be device pointers we cannot just
21 // store meta-data within the pointer itself (as we can't dereference them). So instead we need
22 // to keep an extra map to keep track of them
23 //
24 // Each entry maps pointer -> [PetscMemType, PetscObjectId, size]
25 // ==========================================================================================
26 
// Hash functor for pointer keys: the hash of a pointer is simply its address reinterpreted as
// an integer. This mirrors what GCC's std::hash<T*> does. LLVM's libc++ instead runs the
// address through a murmur-style hash, which was measured to be almost 2x slower, so the
// identity hash is enforced explicitly here.
struct PointerHash {
  std::size_t operator()(const void *address) const noexcept {
    const auto address_bits = reinterpret_cast<std::size_t>(address);

    return address_bits;
  }
};
32 
// Registry of every pointer known to the device-aware memory system. Keys are the starting
// addresses of registered regions, values are the attributes (memtype, object id, size) of
// that region. Finalization hooks are supplied through Petsc::RegisterFinalizeable.
class MemoryMap : public Petsc::RegisterFinalizeable<MemoryMap> {
public:
  // Meta-data stored for each registered region
  struct PointerAttributes {
    PetscMemType  mtype{}; // memtype of allocation
    PetscObjectId id{};    // id of allocation
    std::size_t   size{};  // size of allocation (bytes)

    // even though this is a POD and can be aggregate initialized, the STL uses () constructors
    // in unordered_map and so we need to provide a trivial constructor...
    constexpr PointerAttributes(PetscMemType, PetscObjectId, std::size_t) noexcept;
    constexpr PointerAttributes() noexcept                                              = default;
    constexpr PointerAttributes(const PointerAttributes &) noexcept                     = default;
    PETSC_CONSTEXPR_14 PointerAttributes &operator=(const PointerAttributes &) noexcept = default;
    constexpr PointerAttributes(PointerAttributes &&) noexcept                          = default;
    PETSC_CONSTEXPR_14 PointerAttributes &operator=(PointerAttributes &&) noexcept      = default;

    // member-wise equality of memtype, id and size
    bool operator==(const PointerAttributes &) const noexcept;

    // contains(begin, ptr): true if ptr lies in the half-open byte range [begin, begin + size)
    PETSC_NODISCARD bool contains(const void *, const void *) const noexcept;
  };

  // region start address -> attributes, hashed by raw address (see PointerHash)
  using map_type = std::unordered_map<void *, PointerAttributes, PointerHash>;

  map_type map;

  // return the iterator of the allocation containing ptr, or map.cend() if not found
  PETSC_NODISCARD map_type::const_iterator search_for(const void *) const noexcept;

private:
  // the base class needs access to the private hooks below
  friend class Petsc::RegisterFinalizeable<MemoryMap>;
  // pre-sizes the map
  PETSC_NODISCARD PetscErrorCode register_finalize_() noexcept;
  // resets the map to an empty one
  PETSC_NODISCARD PetscErrorCode finalize_() noexcept;
};
66 
67 // ==========================================================================================
68 // PointerAttributes
69 // ==========================================================================================
70 
71 constexpr MemoryMap::PointerAttributes::PointerAttributes(PetscMemType mtype_, PetscObjectId id_, std::size_t size_) noexcept : mtype(mtype_), id(id_), size(size_) { }
72 
73 bool MemoryMap::PointerAttributes::operator==(const PointerAttributes &other) const noexcept {
74   return mtype == other.mtype && id == other.id && size == other.size;
75 }
76 
77 bool MemoryMap::PointerAttributes::contains(const void *ptr_begin, const void *ptr) const noexcept {
78   return (ptr >= ptr_begin) && (ptr < (static_cast<const char *>(ptr_begin) + size));
79 }
80 
81 // ==========================================================================================
82 // Memory map functions
83 // ==========================================================================================
84 
85 PetscErrorCode MemoryMap::register_finalize_() noexcept {
86   PetscFunctionBegin;
87   // Preallocate, this does give a modest performance bump since unordered_map is so __dog__
88   // slow if it needs to rehash. Experiments show that users tend not to have more than 5 or
89   // so concurrently live pointers lying around. 10 at most.
90   PetscCallCXX(map.reserve(16));
91   PetscFunctionReturn(0);
92 }
93 
94 PetscErrorCode MemoryMap::finalize_() noexcept {
95   PetscFunctionBegin;
96   PetscCall(PetscInfo(nullptr, "Finalizing memory map\n"));
97   PetscCallCXX(map = map_type{});
98   PetscFunctionReturn(0);
99 }
100 
101 /*
102   MemoryMap::search_for - retrieve an iterator to the key-value pair for a pointer in the map
103 
104   Input Parameter:
105 . ptr - pointer to search for
106 
107   Notes:
108   Accounts for sub-regions, i.e. if ptr is contained within another pointers region, it returns
109   the iterator to the super-pointers key-value pair.
110 
111   If ptr is not found, returns map.end()
112 */
113 MemoryMap::map_type::const_iterator MemoryMap::search_for(const void *ptr) const noexcept {
114   const auto end = map.end();
115   const auto it  = map.find(const_cast<map_type::key_type>(ptr));
116 
117   // ptr was found, and points to an entire block
118   if (it != end) return it;
119   // wasn't found, but maybe its part of a block. have to search every block for it
120   // clang-format off
121   return std::find_if(map.begin(), end, [ptr](const map_type::const_iterator::value_type &map_it) {
122     return map_it.second.contains(map_it.first, ptr);
123   });
124   // clang-format on
125 }
126 
// The single, file-local pointer registry consulted by every routine below.
static MemoryMap memory_map;
128 
129 // ==========================================================================================
130 // Utility functions
131 // ==========================================================================================
132 
133 static PetscErrorCode PetscDeviceCheckCapable_Private(PetscDeviceContext dctx, bool cond, const char descr[]) {
134   PetscFunctionBegin;
135   PetscCheck(cond, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device context (id: %" PetscInt64_FMT ", name: %s, type: %s) can only handle %s host memory", PetscObjectCast(dctx)->id, PetscObjectCast(dctx)->name, dctx->device ? PetscDeviceTypes[dctx->device->type] : "unknown", descr);
136   PetscFunctionReturn(0);
137 }
138 
139 // A helper utility, since register is called from PetscDeviceRegisterMemory() and
140 // PetscDevicAllocate(). The latter also needs the generated id, so instead of making it search
141 // the map again we just return it here
142 static PetscErrorCode PetscDeviceRegisterMemory_Private(const void *PETSC_RESTRICT ptr, PetscMemType mtype, std::size_t size, PetscObjectId *PETSC_RESTRICT id = nullptr) {
143   auto      &map = memory_map.map;
144   const auto it  = memory_map.search_for(ptr);
145 
146   PetscFunctionBegin;
147   if (it == map.cend()) {
148     // pointer was never registered with the map, insert it and bail
149     const auto newid = PetscObjectNewId_Internal();
150 
151     if (PetscDefined(USE_DEBUG)) {
152       const auto tmp = MemoryMap::PointerAttributes(mtype, newid, size);
153 
154       for (const auto &entry : map) {
155         // REVIEW ME: maybe this should just be handled...
156         PetscCheck(!tmp.contains(ptr, entry.first), PETSC_COMM_SELF, PETSC_ERR_ORDER, "Trying to register pointer %p (memtype %s, size %zu) but it appears you have already registered a sub-region of it (pointer %p, memtype %s, size %zu). Must register the larger region first", ptr, PetscMemTypeToString(mtype), size,
157                    entry.first, PetscMemTypeToString(entry.second.mtype), entry.second.size);
158       }
159     }
160     // clang-format off
161     if (id) *id = newid;
162     PetscCallCXX(map.emplace(
163       std::piecewise_construct,
164       std::forward_as_tuple(const_cast<MemoryMap::map_type::key_type>(ptr)),
165       std::forward_as_tuple(mtype, newid, size)
166     ));
167     // clang-format on
168     PetscFunctionReturn(0);
169   }
170   if (PetscDefined(USE_DEBUG)) {
171     const auto &old = it->second;
172 
173     PetscCheck(MemoryMap::PointerAttributes(mtype, old.id, size) == old, PETSC_COMM_SELF, PETSC_ERR_LIB, "Pointer %p appears to have been previously allocated with memtype %s, size %zu and assigned id %" PetscInt64_FMT ", which does not match new values: (mtype %s, size %zu, id %" PetscInt64_FMT ")", it->first,
174                PetscMemTypeToString(old.mtype), old.size, old.id, PetscMemTypeToString(mtype), size, old.id);
175   }
176   if (id) *id = it->second.id;
177   PetscFunctionReturn(0);
178 }
179 
180 /*@C
181   PetscDeviceRegisterMemory - Register a pointer for use with device-aware memory system
182 
183   Not Collective
184 
185   Input Parameters:
186 + ptr   - The pointer to register
187 . mtype - The `PetscMemType` of the pointer
188 - size  - The size (in bytes) of the memory region
189 
190   Notes:
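  `ptr` need not point to the beginning of the memory range, however the user should register the
  largest enclosing region first. Registering a region which contains the start of an already
  registered region is an error (checked in debug builds).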
193 
194   It's OK to re-register the same `ptr` repeatedly (subsequent registrations do nothing)
195   however the given `mtype` and `size` must match the original registration.
196 
197   `size` may be 0 (in which case this routine does nothing).
198 
199   Level: intermediate
200 
201 .seealso: `PetscDeviceMalloc()`, `PetscDeviceArrayCopy()`, `PetscDeviceFree()`,
202 `PetscDeviceArrayZero()`
203 @*/
204 PetscErrorCode PetscDeviceRegisterMemory(const void *PETSC_RESTRICT ptr, PetscMemType mtype, std::size_t size) {
205   PetscFunctionBegin;
206   if (PetscMemTypeHost(mtype)) PetscValidPointer(ptr, 1);
207   if (PetscUnlikely(!size)) PetscFunctionReturn(0); // there is no point registering empty range
208   PetscCall(PetscDeviceRegisterMemory_Private(ptr, mtype, size));
209   PetscFunctionReturn(0);
210 }
211 
212 /*@C
213   PetscDeviceAllocate - Allocate device-aware memory
214 
215   Not Collective, Asynchronous, Auto-dependency aware
216 
217   Input Parameters:
218 + dctx  - The `PetscDeviceContext` used to allocate the memory
219 . clear - Whether or not the memory should be zeroed
220 . mtype - The type of memory to allocate
221 - n     - The amount (in bytes) to allocate
222 
223   Output Parameter:
224 . ptr - The pointer to store the result in
225 
226   Notes:
227   The user should prefer `PetscDeviceMalloc()` over this routine as it automatically computes
228   the size of the allocation based on the size of the datatype.
229 
230   Memory allocated with this function must be freed with `PetscDeviceFree()` or
231   `PetscDeviceDeallocate()`.
232 
  Note that the result stored in `ptr` is immediately valid and the user may freely inspect or manipulate
234   its value on function return, i.e.\:
235 
236 .vb
237   PetscInt *ptr;
238 
239   PetscDeviceAllocate(dctx, PETSC_FALSE, PETSC_MEMTYPE_DEVICE, 20, (void**)&ptr);
240 
241   PetscInt *sub_ptr = ptr + 10; // OK, no need to synchronize
242 
243   ptr[0] = 10; // ERROR, directly accessing contents of ptr is undefined until synchronization
244 .ve
245 
246   If `n` is zero, then `ptr` is set to `PETSC_NULLPTR`.
247 
248   DAG representation:
249 .vb
250   time ->
251 
252   -> dctx - |= CALL =| -\- dctx -->
253                          \- ptr ->
254 .ve
255 
256   Level: intermediate
257 
258 .N ASYNC_API
259 
260 .seealso: `PetscDeviceMalloc()`, `PetscGetMemType()`, `PetscDeviceFree()`,
261 `PetscDeviceDeallocate()`, `PetscDeviceArrayCopy()`, `PetscDeviceArrayZero()`
262 @*/
263 PetscErrorCode PetscDeviceAllocate(PetscDeviceContext dctx, PetscBool clear, PetscMemType mtype, size_t n, void **PETSC_RESTRICT ptr) {
264   PetscObjectId id = 0;
265 
266   PetscFunctionBegin;
267   PetscValidPointer(ptr, 5);
268   *ptr = nullptr;
269   if (PetscUnlikely(!n)) PetscFunctionReturn(0);
270   PetscCall(memory_map.register_finalize());
271   PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
272 
273   // get our pointer here
274   if (dctx->ops->memalloc) {
275     PetscUseTypeMethod(dctx, memalloc, clear, mtype, n, ptr);
276   } else {
277     PetscCall(PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(mtype), "allocating"));
278     PetscCall(PetscMallocA(1, clear, __LINE__, PETSC_FUNCTION_NAME, __FILE__, n, ptr));
279   }
280   PetscCall(PetscDeviceRegisterMemory_Private(*ptr, mtype, n, &id));
281   // Note this is a "write" so that the next dctx to try and read from the pointer has to wait
282   // for the allocation to be ready
283   PetscCall(PetscDeviceContextMarkIntentFromID(dctx, id, PETSC_MEMORY_ACCESS_WRITE, "memory allocation"));
284   PetscFunctionReturn(0);
285 }
286 
287 /*@C
288   PetscDeviceDeallocate - Free device-aware memory
289 
290   Not Collective, Asynchronous, Auto-dependency aware
291 
292   Input Parameters:
293 + dctx  - The `PetscDeviceContext` used to free the memory
294 - ptr   - The pointer to free
295 
296   Notes:
297   `ptr` must have been allocated using any of `PetscDeviceMalloc()`, `PetscDeviceCalloc()` or
298   `PetscDeviceAllocate()`, or registered with the system via `PetscDeviceRegisterMemory()`.
299 
300   The user should prefer `PetscDeviceFree()` over this routine as it automatically sets `ptr`
301   to `PETSC_NULLPTR` on successful deallocation.
302 
303   `ptr` may be `NULL`.
304 
305   DAG representation:
306 .vb
307   time ->
308 
309   -> dctx -/- |= CALL =| - dctx ->
310   -> ptr -/
311 .ve
312 
313   Level: intermediate
314 
315 .N ASYNC_API
316 
317 .seealso: `PetscDeviceFree()`, `PetscDeviceAllocate()`
318 @*/
319 PetscErrorCode PetscDeviceDeallocate(PetscDeviceContext dctx, void *PETSC_RESTRICT ptr) {
320   PetscFunctionBegin;
321   if (ptr) {
322     auto      &map      = memory_map.map;
323     const auto found_it = map.find(const_cast<MemoryMap::map_type::key_type>(ptr));
324 
325     if (PetscUnlikelyDebug(found_it == map.end())) {
326       // OK this is a bad pointer, now determine why
327       const auto it = memory_map.search_for(ptr);
328 
329       // if it is map.cend() then no allocation owns it, meaning it was not allocated by us!
330       PetscCheck(it != map.cend(), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Pointer %p was not allocated via PetscDeviceAllocate()", ptr);
331       // if we are here then we did allocate it but the user has tried to do something along
332       // the lines of:
333       //
334       // allocate(&ptr, size);
335       // deallocate(ptr+5);
336       //
337       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Attempting to deallocate pointer %p which is a suballocation of %p (memtype %s, id %" PetscInt64_FMT ", size %zu bytes)", ptr, it->first, PetscMemTypeToString(it->second.mtype), it->second.id,
338               it->second.size);
339     }
340     PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
341 
342     // mark intent BEFORE we free, note we mark as write so that we are made to wait on any
343     // outstanding reads (don't want to kill the pointer before they are done)
344     PetscCall(PetscDeviceContextMarkIntentFromID(dctx, found_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory deallocation"));
345 
346     // do free
347     if (dctx->ops->memfree) {
348       PetscUseTypeMethod(dctx, memfree, found_it->second.mtype, (void **)&ptr);
349     } else {
350       PetscCall(PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(found_it->second.mtype), "freeing"));
351     }
352     // if ptr still exists, then the device context could not handle it
353     if (ptr) PetscCall(PetscFree(ptr));
354     PetscCallCXX(map.erase(found_it));
355   }
356   PetscFunctionReturn(0);
357 }
358 
359 /*@C
360   PetscDeviceMemcpy - Copy memory in a device-aware manner
361 
362   Not Collective, Asynchronous, Auto-dependency aware
363 
364   Input Parameters:
365 + dctx - The `PetscDeviceContext` used to copy the memory
366 . dest - The pointer to copy to
367 . src  - The pointer to copy from
368 - n    - The amount (in bytes) to copy
369 
370   Notes:
371   Both `dest` and `src` must have been allocated by `PetscDeviceAllocate()` or
372   `PetscDeviceMalloc()` or registered with `PetscDeviceRegisterMemory()`.
373 
374   `src` and `dest` cannot overlap.
375 
376   If both `src` and `dest` are on the host this routine is fully synchronous.
377 
378   The user should prefer `PetscDeviceArrayCopy()` over this routine as it automatically
379   computes the number of bytes to copy from the size of the pointer types.
380 
381   DAG representation:
382 .vb
383   time ->
384 
385   -> dctx - |= CALL =| - dctx ->
386   -> dest --------------------->
387   -> src ---------------------->
388 .ve
389 
390   Level: intermediate
391 
392 .N ASYNC_API
393 
394 .seealso: `PetscDeviceArrayCopy()`, `PetscDeviceMalloc()`, `PetscDeviceFree()`
395 @*/
396 PetscErrorCode PetscDeviceMemcpy(PetscDeviceContext dctx, void *PETSC_RESTRICT dest, const void *PETSC_RESTRICT src, std::size_t n) {
397   PetscFunctionBegin;
398   if (!n) PetscFunctionReturn(0);
399   PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer");
400   PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer");
401   if (dest == src) PetscFunctionReturn(0);
402   PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
403 
404   {
405     const auto dest_it = memory_map.search_for(dest);
406     const auto src_it  = memory_map.search_for(src);
407 
408     PetscAssert(dest_it != memory_map.map.cend(), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Destination pointer %p was not registered with the memory tracker, call PetscDeviceRegisterMemory() on it", dest);
409     PetscAssert(src_it != memory_map.map.cend(), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Source pointer %p was not registered with the memory tracker, call PetscDeviceRegisterMemory() on it", src);
410 
411     PetscCall(PetscDeviceContextMarkIntentFromID(dctx, src_it->second.id, PETSC_MEMORY_ACCESS_READ, "memory copy (src)"));
412     PetscCall(PetscDeviceContextMarkIntentFromID(dctx, dest_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory copy (dest)"));
413 
414     const auto mode = PetscMemTypeToDeviceCopyMode(dest_it->second.mtype, src_it->second.mtype);
415     // perform the copy
416     if (dctx->ops->memcopy) {
417       PetscUseTypeMethod(dctx, memcopy, dest, src, n, mode);
418     } else {
419       // REVIEW ME: we might potentially need to sync here if the memory is device-allocated
420       // (pinned) but being copied by a host dctx
421       PetscCall(PetscDeviceCheckCapable_Private(dctx, mode == PETSC_DEVICE_COPY_HTOH, "copying"));
422       PetscCall(PetscMemcpy(dest, src, n));
423     }
424 
425     if (mode == PETSC_DEVICE_COPY_HTOD) {
426       PetscCall(PetscLogCpuToGpu(n));
427     } else if (mode == PETSC_DEVICE_COPY_DTOH) {
428       PetscCall(PetscLogGpuToCpu(n));
429     }
430   }
431   PetscFunctionReturn(0);
432 }
433 
434 /*@C
435   PetscDeviceMemset - Memset device-aware memory
436 
437   Not Collective, Asynchronous, Auto-dependency aware
438 
439   Input Parameters:
440 + dctx  - The `PetscDeviceContext` used to memset the memory
441 . ptr   - The pointer to the memory
442 . v     - The value to set
443 - n     - The amount (in bytes) to set
444 
445   Notes:
446   `ptr` must have been allocated by `PetscDeviceAllocate()` or `PetscDeviceMalloc()` or
447   registered with `PetscDeviceRegisterMemory()`.
448 
449   The user should prefer `PetscDeviceArrayZero()` over this routine as it automatically
450   computes the number of bytes to copy from the size of the pointer types, though they should
451   note that it only zeros memory.
452 
  This routine is analogous to `memset()`. That is, this routine copies the value
  `static_cast<unsigned char>(v)` into each of the first `n` characters of the object pointed
  to by `ptr`.

  If `ptr` is on device, this routine is asynchronous.
458 
459   DAG representation:
460 .vb
461   time ->
462 
463   -> dctx - |= CALL =| - dctx ->
464   -> dest --------------------->
465 .ve
466 
467   Level: intermediate
468 
469 .N ASYNC_API
470 
471 .seealso: `PetscDeviceArrayZero()`, `PetscDeviceMalloc()`, `PetscDeviceFree()`
472 @*/
473 PetscErrorCode PetscDeviceMemset(PetscDeviceContext dctx, void *ptr, PetscInt v, std::size_t n) {
474   PetscFunctionBegin;
475   if (PetscUnlikely(!n)) PetscFunctionReturn(0);
476   PetscCheck(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer");
477   PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
478   {
479     const auto ptr_it = memory_map.search_for(ptr);
480 
481     // mark
482     PetscAssert(ptr_it != memory_map.map.cend(), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p was not registered with the memory tracker, call PetscDeviceRegisterMemory() on it", ptr);
483     PetscCall(PetscDeviceContextMarkIntentFromID(dctx, ptr_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory set"));
484 
485     // set
486     if (dctx->ops->memset) {
487       PetscUseTypeMethod(dctx, memset, ptr_it->second.mtype, ptr, v, n);
488     } else {
489       // REVIEW ME: we might potentially need to sync here if the memory is device-allocated
490       // (pinned) but being memset by a host dctx
491       PetscCall(PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(ptr_it->second.mtype), "memsetting"));
492       std::memset(ptr, static_cast<int>(v), n);
493     }
494   }
495   PetscFunctionReturn(0);
496 }
497