xref: /petsc/src/sys/objects/device/impls/segmentedmempool.hpp (revision 0619917b5a674bb687c64e7daba2ab22be99af31)
1 #ifndef PETSC_SEGMENTEDMEMPOOL_HPP
2 #define PETSC_SEGMENTEDMEMPOOL_HPP
3 
4 #include <petsc/private/deviceimpl.h>
5 
6 #include <petsc/private/cpp/macros.hpp>
7 #include <petsc/private/cpp/type_traits.hpp>
8 #include <petsc/private/cpp/utility.hpp>
9 #include <petsc/private/cpp/register_finalize.hpp>
10 #include <petsc/private/cpp/memory.hpp>
11 
12 #include <limits>
13 #include <deque>
14 #include <vector>
15 
16 namespace Petsc
17 {
18 
19 namespace device
20 {
21 
22 template <typename T>
23 class StreamBase {
24 public:
25   using id_type      = int;
26   using derived_type = T;
27 
28   static const id_type INVALID_ID;
29 
30   // needed so that dependent auto works, see veccupmimpl.h for a detailed discussion
31   template <typename U = T>
32   PETSC_NODISCARD auto get_stream() const noexcept PETSC_DECLTYPE_AUTO_RETURNS(static_cast<const U &>(*this).get_stream_());
33 
34   PETSC_NODISCARD id_type get_id() const noexcept { return static_cast<const T &>(*this).get_id_(); }
35 
36   template <typename E>
37   PetscErrorCode record_event(E &&event) const noexcept
38   {
39     return static_cast<const T &>(*this).record_event_(std::forward<E>(event));
40   }
41 
42   template <typename E>
43   PetscErrorCode wait_for_event(E &&event) const noexcept
44   {
45     return static_cast<const T &>(*this).wait_for_(std::forward<E>(event));
46   }
47 
48 protected:
49   constexpr StreamBase() noexcept = default;
50 
51   struct default_event_type { };
52   using default_stream_type = std::nullptr_t;
53 
54   PETSC_NODISCARD static constexpr default_stream_type get_stream_() noexcept { return nullptr; }
55 
56   PETSC_NODISCARD static constexpr id_type get_id_() noexcept { return 0; }
57 
58   template <typename U = T>
59   static constexpr PetscErrorCode record_event_(const typename U::event_type &) noexcept
60   {
61     return PETSC_SUCCESS;
62   }
63 
64   template <typename U = T>
65   static constexpr PetscErrorCode wait_for_(const typename U::event_type &) noexcept
66   {
67     return PETSC_SUCCESS;
68   }
69 };
70 
71 template <typename T>
72 const typename StreamBase<T>::id_type StreamBase<T>::INVALID_ID = -1;
73 
74 struct DefaultStream : StreamBase<DefaultStream> {
75   using stream_type = typename StreamBase<DefaultStream>::default_stream_type;
76   using id_type     = typename StreamBase<DefaultStream>::id_type;
77   using event_type  = typename StreamBase<DefaultStream>::default_event_type;
78 };
79 
80 } // namespace device
81 
82 namespace memory
83 {
84 
85 namespace impl
86 {
87 
88 // ==========================================================================================
89 // MemoryChunk
90 //
91 // Represents a checked-out region of a MemoryBlock. Tracks the offset into the owning
92 // MemoryBlock and its size/capacity
93 // ==========================================================================================
94 
95 template <typename EventType>
96 class MemoryChunk {
97 public:
98   using event_type = EventType;
99   using size_type  = std::size_t;
100 
101   MemoryChunk(size_type, size_type) noexcept;
102   explicit MemoryChunk(size_type) noexcept;
103 
104   MemoryChunk(MemoryChunk &&) noexcept;
105   MemoryChunk &operator=(MemoryChunk &&) noexcept;
106 
107   MemoryChunk(const MemoryChunk &) noexcept            = delete;
108   MemoryChunk &operator=(const MemoryChunk &) noexcept = delete;
109 
110   PETSC_NODISCARD size_type start() const noexcept { return start_; }
111   PETSC_NODISCARD size_type size() const noexcept { return size_; }
112   // REVIEW ME:
113   // make this an actual field, normally each chunk shrinks_to_fit() on begin claimed, but in
114   // theory only the last chunk needs to do this
115   PETSC_NODISCARD size_type capacity() const noexcept { return size_; }
116   PETSC_NODISCARD size_type total_offset() const noexcept { return start() + size(); }
117 
118   template <typename U>
119   PetscErrorCode release(const device::StreamBase<U> *) noexcept;
120   template <typename U>
121   PetscErrorCode claim(const device::StreamBase<U> *, size_type, bool *, bool = false) noexcept;
122   template <typename U>
123   PETSC_NODISCARD bool can_claim(const device::StreamBase<U> *, size_type, bool) const noexcept;
124   PetscErrorCode       resize(size_type) noexcept;
125   PETSC_NODISCARD bool contains(size_type) const noexcept;
126 
127 private:
128   // clang-format off
129   event_type      event_{};          // event recorded when the chunk was released
130   bool            open_      = true; // is this chunk open?
131   // id of the last stream to use the chunk, populated on release
132   int             stream_id_ = device::DefaultStream::INVALID_ID;
133   size_type       size_      = 0;    // size of the chunk
134   const size_type start_     = 0;    // offset from the start of the owning block
135   // clang-format on
136 
137   template <typename U>
138   PETSC_NODISCARD bool stream_compat_(const device::StreamBase<U> *) const noexcept;
139 };
140 
141 // ==========================================================================================
142 // MemoryChunk - Private API
143 // ==========================================================================================
144 
145 // asks and answers the question: can this stream claim this chunk without serializing?
146 template <typename E>
147 template <typename U>
148 inline bool MemoryChunk<E>::stream_compat_(const device::StreamBase<U> *strm) const noexcept
149 {
150   return (stream_id_ == strm->INVALID_ID) || (stream_id_ == strm->get_id());
151 }
152 
153 // ==========================================================================================
154 // MemoryChunk - Public API
155 // ==========================================================================================
156 
157 template <typename E>
158 inline MemoryChunk<E>::MemoryChunk(size_type start, size_type size) noexcept : size_(size), start_(start)
159 {
160 }
161 
162 template <typename E>
163 inline MemoryChunk<E>::MemoryChunk(size_type size) noexcept : MemoryChunk(0, size)
164 {
165 }
166 
167 template <typename E>
168 inline MemoryChunk<E>::MemoryChunk(MemoryChunk<E> &&other) noexcept :
169   event_(std::move(other.event_)), open_(util::exchange(other.open_, false)), stream_id_(util::exchange(other.stream_id_, device::DefaultStream::INVALID_ID)), size_(util::exchange(other.size_, 0)), start_(std::move(other.start_))
170 {
171 }
172 
173 template <typename E>
174 inline MemoryChunk<E> &MemoryChunk<E>::operator=(MemoryChunk<E> &&other) noexcept
175 {
176   PetscFunctionBegin;
177   if (this != &other) {
178     event_     = std::move(other.event_);
179     open_      = util::exchange(other.open_, false);
180     stream_id_ = util::exchange(other.stream_id_, device::DefaultStream::INVALID_ID);
181     size_      = util::exchange(other.size_, 0);
182     start_     = std::move(other.start_);
183   }
184   PetscFunctionReturn(*this);
185 }
186 
187 /*
188   MemoryChunk::release - release a chunk on a stream
189 
190   Input Parameter:
191 . stream - the stream to release the chunk with
192 
193   Notes:
194   Inserts a release operation on stream and records the state of stream at the time this
195   routine was called.
196 
197   Future allocation requests which attempt to claim the chunk on the same stream may re-acquire
198   the chunk without serialization.
199 
200   If another stream attempts to claim the chunk they must wait for the recorded event before
201   claiming the chunk.
202 */
203 template <typename E>
204 template <typename U>
205 inline PetscErrorCode MemoryChunk<E>::release(const device::StreamBase<U> *stream) noexcept
206 {
207   PetscFunctionBegin;
208   open_      = true;
209   stream_id_ = stream->get_id();
210   PetscCall(stream->record_event(event_));
211   PetscFunctionReturn(PETSC_SUCCESS);
212 }
213 
214 /*
215   MemoryChunk::claim - attempt to claim a particular chunk
216 
217   Input Parameters:
218 + stream    - the stream on which to attempt to claim
219 . req_size  - the requested size (in elements) to attempt to claim
220 - serialize - (optional, false) whether the claimant allows serialization
221 
222   Output Parameter:
223 . success - true if the chunk was claimed, false otherwise
224 */
225 template <typename E>
226 template <typename U>
227 inline PetscErrorCode MemoryChunk<E>::claim(const device::StreamBase<U> *stream, size_type req_size, bool *success, bool serialize) noexcept
228 {
229   PetscFunctionBegin;
230   if ((*success = can_claim(stream, req_size, serialize))) {
231     if (serialize && !stream_compat_(stream)) PetscCall(stream->wait_for_event(event_));
232     PetscCall(resize(req_size));
233     open_ = false;
234   }
235   PetscFunctionReturn(PETSC_SUCCESS);
236 }
237 
238 /*
239   MemoryChunk::can_claim - test whether a particular chunk can be claimed
240 
241   Input Parameters:
242 + stream    - the stream on which to attempt to claim
243 . req_size  - the requested size (in elements) to attempt to claim
244 - serialize - whether the claimant allows serialization
245 
246   Output:
247 . [return] - true if the chunk is claimable given the configuration, false otherwise
248 */
249 template <typename E>
250 template <typename U>
251 inline bool MemoryChunk<E>::can_claim(const device::StreamBase<U> *stream, size_type req_size, bool serialize) const noexcept
252 {
253   if (open_ && (req_size <= capacity())) {
254     // fully compatible
255     if (stream_compat_(stream)) return true;
256     // stream wasn't compatible, but could claim if we serialized
257     if (serialize) return true;
258     // incompatible stream and did not want to serialize
259   }
260   return false;
261 }
262 
263 /*
264   MemoryChunk::resize - grow a chunk to new size
265 
266   Input Parameter:
267 . newsize - the new size Requested
268 
269   Notes:
270   newsize cannot be larger than capacity
271 */
272 template <typename E>
273 inline PetscErrorCode MemoryChunk<E>::resize(size_type newsize) noexcept
274 {
275   PetscFunctionBegin;
276   PetscAssert(newsize <= capacity(), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "New size %zu larger than capacity %zu", newsize, capacity());
277   size_ = newsize;
278   PetscFunctionReturn(PETSC_SUCCESS);
279 }
280 
281 /*
282   MemoryChunk::contains - query whether a memory chunk contains a particular offset
283 
284   Input Parameters:
285 . offset - The offset from the MemoryBlock start
286 
287   Notes:
288   Returns true if the chunk contains the offset, false otherwise
289 */
290 template <typename E>
291 inline bool MemoryChunk<E>::contains(size_type offset) const noexcept
292 {
293   return (offset >= start()) && (offset < total_offset());
294 }
295 
296 // ==========================================================================================
297 // MemoryBlock
298 //
299 // A "memory block" manager, which owns the pointer to a particular memory range. Retrieving
300 // and restoring a block is thread-safe (so may be used by multiple device streams).
301 // ==========================================================================================
302 
303 template <typename T, typename AllocatorType, typename StreamType>
304 class MemoryBlock {
305 public:
306   using value_type      = T;
307   using allocator_type  = AllocatorType;
308   using stream_type     = StreamType;
309   using event_type      = typename stream_type::event_type;
310   using chunk_type      = MemoryChunk<event_type>;
311   using size_type       = typename chunk_type::size_type;
312   using chunk_list_type = std::vector<chunk_type>;
313 
314   template <typename U>
315   MemoryBlock(allocator_type *, size_type, const device::StreamBase<U> *) noexcept;
316 
317   ~MemoryBlock() noexcept(std::is_nothrow_destructible<chunk_list_type>::value);
318 
319   MemoryBlock(MemoryBlock &&) noexcept;
320   MemoryBlock &operator=(MemoryBlock &&) noexcept;
321 
322   // memory blocks are not copyable
323   MemoryBlock(const MemoryBlock &)            = delete;
324   MemoryBlock &operator=(const MemoryBlock &) = delete;
325 
326   /* --- actual functions --- */
327   PetscErrorCode       try_allocate_chunk(size_type, T **, const stream_type *, bool *) noexcept;
328   PetscErrorCode       try_deallocate_chunk(T **, const stream_type *, bool *) noexcept;
329   PetscErrorCode       try_find_chunk(const T *, chunk_type **) noexcept;
330   PETSC_NODISCARD bool owns_pointer(const T *) const noexcept;
331 
332   PETSC_NODISCARD size_type size() const noexcept { return size_; }
333   PETSC_NODISCARD size_type bytes() const noexcept { return sizeof(value_type) * size(); }
334   PETSC_NODISCARD size_type num_chunks() const noexcept { return chunks_.size(); }
335 
336 private:
337   value_type     *mem_{};
338   allocator_type *allocator_{};
339   size_type       size_{};
340   chunk_list_type chunks_{};
341 
342   PetscErrorCode clear_(const stream_type *) noexcept;
343 };
344 
345 // ==========================================================================================
346 // MemoryBlock - Private API
347 // ==========================================================================================
348 
349 // clear the memory block, called from destructors and move assignment/construction
350 template <typename T, typename A, typename S>
351 PetscErrorCode MemoryBlock<T, A, S>::clear_(const stream_type *stream) noexcept
352 {
353   PetscFunctionBegin;
354   if (PetscLikely(mem_)) {
355     PetscCall(allocator_->deallocate(mem_, stream));
356     mem_ = nullptr;
357   }
358   size_ = 0;
359   PetscCallCXX(chunks_.clear());
360   PetscFunctionReturn(PETSC_SUCCESS);
361 }
362 
363 // ==========================================================================================
364 // MemoryBlock - Public API
365 // ==========================================================================================
366 
367 // default constructor, allocates memory immediately
368 template <typename T, typename A, typename S>
369 template <typename U>
370 MemoryBlock<T, A, S>::MemoryBlock(allocator_type *alloc, size_type s, const device::StreamBase<U> *stream) noexcept : allocator_(alloc), size_(s)
371 {
372   PetscFunctionBegin;
373   PetscCallAbort(PETSC_COMM_SELF, alloc->allocate(&mem_, s, stream));
374   PetscAssertAbort(mem_, PETSC_COMM_SELF, PETSC_ERR_MEM, "Failed to allocate memory block of size %zu", s);
375   PetscFunctionReturnVoid();
376 }
377 
378 template <typename T, typename A, typename S>
379 MemoryBlock<T, A, S>::~MemoryBlock() noexcept(std::is_nothrow_destructible<chunk_list_type>::value)
380 {
381   stream_type stream;
382 
383   PetscFunctionBegin;
384   PetscCallAbort(PETSC_COMM_SELF, clear_(&stream));
385   PetscFunctionReturnVoid();
386 }
387 
388 template <typename T, typename A, typename S>
389 MemoryBlock<T, A, S>::MemoryBlock(MemoryBlock &&other) noexcept : mem_(util::exchange(other.mem_, nullptr)), allocator_(other.allocator_), size_(util::exchange(other.size_, 0)), chunks_(std::move(other.chunks_))
390 {
391 }
392 
393 template <typename T, typename A, typename S>
394 MemoryBlock<T, A, S> &MemoryBlock<T, A, S>::operator=(MemoryBlock &&other) noexcept
395 {
396   PetscFunctionBegin;
397   if (this != &other) {
398     stream_type stream;
399 
400     PetscCallAbort(PETSC_COMM_SELF, clear_(&stream));
401     mem_       = util::exchange(other.mem_, nullptr);
402     allocator_ = other.allocator_;
403     size_      = util::exchange(other.size_, 0);
404     chunks_    = std::move(other.chunks_);
405   }
406   PetscFunctionReturn(*this);
407 }
408 
409 /*
410   MemoryBock::owns_pointer - returns true if this block owns a pointer, false otherwise
411 */
412 template <typename T, typename A, typename S>
413 inline bool MemoryBlock<T, A, S>::owns_pointer(const T *ptr) const noexcept
414 {
415   // each pool is linear in memory, so it suffices to check the bounds
416   return (ptr >= mem_) && (ptr < std::next(mem_, size()));
417 }
418 
419 /*
420   MemoryBlock::try_allocate_chunk - try to get a chunk from this MemoryBlock
421 
422   Input Parameters:
423 + req_size - the requested size of the allocation (in elements)
424 . ptr      - ptr to fill
425 - stream   - stream to fill the pointer on
426 
427   Output Parameter:
428 . success  - true if chunk was gotten, false otherwise
429 
430   Notes:
431   If the current memory could not satisfy the memory request, ptr is unchanged
432 */
433 template <typename T, typename A, typename S>
434 inline PetscErrorCode MemoryBlock<T, A, S>::try_allocate_chunk(size_type req_size, T **ptr, const stream_type *stream, bool *success) noexcept
435 {
436   PetscFunctionBegin;
437   *success = false;
438   if (req_size <= size()) {
439     const auto try_create_chunk = [&]() {
440       const auto was_empty     = chunks_.empty();
441       const auto block_alloced = was_empty ? 0 : chunks_.back().total_offset();
442 
443       PetscFunctionBegin;
444       if (block_alloced + req_size <= size()) {
445         PetscCallCXX(chunks_.emplace_back(block_alloced, req_size));
446         PetscCall(chunks_.back().claim(stream, req_size, success));
447         *ptr = mem_ + block_alloced;
448         if (was_empty) PetscAssert(*success, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Failed to claim chunk (of size %zu) even though block (of size %zu) was empty!", req_size, size());
449       }
450       PetscFunctionReturn(PETSC_SUCCESS);
451     };
452     const auto try_find_open_chunk = [&](bool serialize = false) {
453       PetscFunctionBegin;
454       for (auto &chunk : chunks_) {
455         PetscCall(chunk.claim(stream, req_size, success, serialize));
456         if (*success) {
457           *ptr = mem_ + chunk.start();
458           break;
459         }
460       }
461       PetscFunctionReturn(PETSC_SUCCESS);
462     };
463     const auto try_steal_other_stream_chunk = [&]() {
464       PetscFunctionBegin;
465       PetscCall(try_find_open_chunk(true));
466       PetscFunctionReturn(PETSC_SUCCESS);
467     };
468 
469     // search previously distributed chunks, but only claim one if it is on the same stream
470     // as us
471     PetscCall(try_find_open_chunk());
472 
473     // if we are here we couldn't reuse one of our own chunks so check first if the pool
474     // has room for a new one
475     if (!*success) PetscCall(try_create_chunk());
476 
477     // try pruning dead chunks off the back, note we do this regardless of whether we are
478     // successful
479     while (chunks_.back().can_claim(stream, 0, false)) {
480       PetscCallCXX(chunks_.pop_back());
481       if (chunks_.empty()) {
482         // if chunks are empty it implies we have managed to claim (and subsequently destroy)
483         // our own chunk twice! something has gone wrong
484         PetscAssert(!*success, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Successfully claimed a chunk (of size %zu, from block of size %zu) but have now managed to claim it for a second time (and destroyed it)!", req_size, size());
485         break;
486       }
487     }
488 
489     // if previously unsuccessful see if enough space has opened up due to pruning. note that
490     // if the chunk list was emptied from the pruning this call must succeed in allocating a
491     // chunk, otherwise something is wrong
492     if (!*success) PetscCall(try_create_chunk());
493 
494     // last resort, iterate over all chunks and see if we can steal one by waiting on the
495     // current owner to finish using it
496     if (!*success) PetscCall(try_steal_other_stream_chunk());
497   }
498   PetscFunctionReturn(PETSC_SUCCESS);
499 }
500 
501 /*
502   MemoryBlock::try_deallocate_chunk - try to restore a chunk to this MemoryBlock
503 
504   Input Parameters:
505 + ptr     - ptr to restore
506 - stream  - stream to restore the pointer on
507 
508   Output Parameter:
509 . success - true if chunk was restored, false otherwise
510 
511   Notes:
512   ptr is set to nullptr on successful restore, and is unchanged otherwise. If the ptr is owned
513   by this MemoryBlock then it is restored on stream. The same stream may receive ptr again
514   without synchronization, but other streams may not do so until either serializing or the
515   stream is idle again.
516 */
517 template <typename T, typename A, typename S>
518 inline PetscErrorCode MemoryBlock<T, A, S>::try_deallocate_chunk(T **ptr, const stream_type *stream, bool *success) noexcept
519 {
520   chunk_type *chunk = nullptr;
521 
522   PetscFunctionBegin;
523   PetscCall(try_find_chunk(*ptr, &chunk));
524   if (chunk) {
525     PetscCall(chunk->release(stream));
526     *ptr     = nullptr;
527     *success = true;
528   } else {
529     *success = false;
530   }
531   PetscFunctionReturn(PETSC_SUCCESS);
532 }
533 
534 /*
535   MemoryBlock::try_find_chunk - try to find the chunk which owns ptr
536 
537   Input Parameter:
538 . ptr - the pointer to look for
539 
540   Output Parameter:
541 . ret_chunk - pointer to the owning chunk or nullptr if not found
542 */
543 template <typename T, typename A, typename S>
544 inline PetscErrorCode MemoryBlock<T, A, S>::try_find_chunk(const T *ptr, chunk_type **ret_chunk) noexcept
545 {
546   PetscFunctionBegin;
547   *ret_chunk = nullptr;
548   if (owns_pointer(ptr)) {
549     const auto offset = static_cast<size_type>(ptr - mem_);
550 
551     for (auto &chunk : chunks_) {
552       if (chunk.contains(offset)) {
553         *ret_chunk = &chunk;
554         break;
555       }
556     }
557 
558     PetscAssert(*ret_chunk, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Failed to find %zu in block, even though it is within block range [%zu, %zu)", reinterpret_cast<uintptr_t>(ptr), reinterpret_cast<uintptr_t>(mem_), reinterpret_cast<uintptr_t>(std::next(mem_, size())));
559   }
560   PetscFunctionReturn(PETSC_SUCCESS);
561 }
562 
563 namespace detail
564 {
565 
566 template <typename T>
567 struct real_type {
568   using type = T;
569 };
570 
571 template <>
572 struct real_type<PetscScalar> {
573   using type = PetscReal;
574 };
575 
576 } // namespace detail
577 
578 template <typename T>
579 struct SegmentedMemoryPoolAllocatorBase {
580   using value_type      = T;
581   using size_type       = std::size_t;
582   using real_value_type = typename detail::real_type<T>::type;
583 
584   template <typename U>
585   static PetscErrorCode allocate(value_type **, size_type, const device::StreamBase<U> *) noexcept;
586   template <typename U>
587   static PetscErrorCode deallocate(value_type *, const device::StreamBase<U> *) noexcept;
588   template <typename U>
589   static PetscErrorCode zero(value_type *, size_type, const device::StreamBase<U> *) noexcept;
590   template <typename U>
591   static PetscErrorCode uninitialized_copy(value_type *, const value_type *, size_type, const device::StreamBase<U> *) noexcept;
592   template <typename U>
593   static PetscErrorCode set_canary(value_type *, size_type, const device::StreamBase<U> *) noexcept;
594 };
595 
596 template <typename T>
597 template <typename U>
598 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::allocate(value_type **ptr, size_type n, const device::StreamBase<U> *) noexcept
599 {
600   PetscFunctionBegin;
601   PetscCall(PetscMalloc1(n, ptr));
602   PetscFunctionReturn(PETSC_SUCCESS);
603 }
604 
605 template <typename T>
606 template <typename U>
607 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::deallocate(value_type *ptr, const device::StreamBase<U> *) noexcept
608 {
609   PetscFunctionBegin;
610   PetscCall(PetscFree(ptr));
611   PetscFunctionReturn(PETSC_SUCCESS);
612 }
613 
614 template <typename T>
615 template <typename U>
616 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::zero(value_type *ptr, size_type n, const device::StreamBase<U> *) noexcept
617 {
618   PetscFunctionBegin;
619   PetscCall(PetscArrayzero(ptr, n));
620   PetscFunctionReturn(PETSC_SUCCESS);
621 }
622 
623 template <typename T>
624 template <typename U>
625 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::uninitialized_copy(value_type *dest, const value_type *src, size_type n, const device::StreamBase<U> *) noexcept
626 {
627   PetscFunctionBegin;
628   PetscCall(PetscArraycpy(dest, src, n));
629   PetscFunctionReturn(PETSC_SUCCESS);
630 }
631 
632 template <typename T>
633 template <typename U>
634 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::set_canary(value_type *ptr, size_type n, const device::StreamBase<U> *) noexcept
635 {
636   using limit_type            = std::numeric_limits<real_value_type>;
637   constexpr value_type canary = limit_type::has_signaling_NaN ? limit_type::signaling_NaN() : limit_type::max();
638 
639   PetscFunctionBegin;
640   for (size_type i = 0; i < n; ++i) ptr[i] = canary;
641   PetscFunctionReturn(PETSC_SUCCESS);
642 }
643 
644 } // namespace impl
645 
646 // ==========================================================================================
647 // SegmentedMemoryPool
648 //
649 // Stream-aware async memory allocator. Holds a list of memory "blocks" which each control an
650 // allocated buffer. This buffer is further split into memory "chunks" which control
651 // consecutive, non-overlapping regions of the block. Chunks may be in 1 of 2 states:
652 //
653 // 1. Open:
654 //    The chunk is free to be claimed by the next suitable allocation request. If the
655 //    allocation request is made on the same stream as the chunk was deallocated on, no
656 //    serialization needs to occur. If not, the allocating stream must wait for the
657 //    event. Claiming the chunk "closes" the chunk.
658 //
659 // 2. Closed:
660 //    The chunk has been claimed by an allocation request. It cannot be opened again until it
661 //    is deallocated; doing so "opens" the chunk.
662 //
663 // Note that there does not need to be a chunk for every region, chunks are created to satisfy
664 // an allocation request.
665 //
666 // Thus there is usually a region of "unallocated" memory at the end of the buffer, which may
667 // be claimed by a newly created chunk if existing chunks cannot satisfy the allocation
668 // request. This region exists _only_ at the end, as there are no gaps between chunks.
669 //
670 //
671 // |-----------------------------------------------------------------------------------------
672 // | SegmentedMemoryPool
673 // |
674 // | ||-------------||
675 // | ||             ||    -------------------------------------------------------------------
676 // | ||             ||    | AAAAAAAAAAAAAABBBBBBBCCCCCCCCCCCCCCCCCCCCDDDDDDDDDDDDDXXXXXXXX...
677 // | ||             ||    | |             |      |                   |            |
678 // | ||             ||    | x-----x-------x-----xx---------x---------x------x-----x
679 // | || MemoryBlock || -> | ------|-------------|----------|----------------|--------
680 // | ||             ||    | | MemoryChunk | MemoryChunk | MemoryChunk | MemoryChunk |
681 // | ||             ||    | ---------------------------------------------------------
682 // | ||             ||    -------------------------------------------------------------------
683 // | ||-------------||
684 // | ||             ||
685 // | ||     ...     ||
686 // | ||             ||
687 // ==========================================================================================
688 
689 template <typename MemType, typename StreamType = device::DefaultStream, typename AllocType = impl::SegmentedMemoryPoolAllocatorBase<MemType>, std::size_t DefaultChunkSize = 256>
690 class SegmentedMemoryPool;
691 
692 // The actual memory pool class. It is in essence just a wrapper for a list of MemoryBlocks.
693 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
694 class SegmentedMemoryPool : public RegisterFinalizeable<SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>> {
695 public:
696   using value_type     = MemType;
697   using stream_type    = StreamType;
698   using allocator_type = AllocType;
699   using block_type     = impl::MemoryBlock<value_type, allocator_type, stream_type>;
700   using pool_type      = std::deque<block_type>;
701   using size_type      = typename block_type::size_type;
702 
703   explicit SegmentedMemoryPool(AllocType = AllocType{}, std::size_t = DefaultChunkSize) noexcept(std::is_nothrow_default_constructible<pool_type>::value);
704 
705   PetscErrorCode allocate(PetscInt, value_type **, const stream_type *, size_type = std::alignment_of<MemType>::value) noexcept;
706   PetscErrorCode deallocate(value_type **, const stream_type *) noexcept;
707   PetscErrorCode reallocate(PetscInt, value_type **, const stream_type *) noexcept;
708 
709 private:
710   pool_type      pool_;
711   allocator_type allocator_;
712   size_type      chunk_size_;
713 
714   PetscErrorCode make_block_(size_type, const stream_type *) noexcept;
715 
716   friend class RegisterFinalizeable<SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>>;
717   PetscErrorCode register_finalize_(const stream_type *) noexcept;
718   PetscErrorCode finalize_() noexcept;
719 
720   PetscErrorCode allocate_(size_type, value_type **, const stream_type *) noexcept;
721 };
722 
723 // ==========================================================================================
724 // SegmentedMemoryPool - Private API
725 // ==========================================================================================
726 
727 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
728 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::make_block_(size_type size, const stream_type *stream) noexcept
729 {
730   const auto block_size = std::max(size, chunk_size_);
731 
732   PetscFunctionBegin;
733   PetscCallCXX(pool_.emplace_back(&allocator_, block_size, stream));
734   PetscCall(PetscInfo(nullptr, "Allocated new block of size %zu, total %zu blocks\n", block_size, pool_.size()));
735   PetscFunctionReturn(PETSC_SUCCESS);
736 }
737 
738 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
739 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::register_finalize_(const stream_type *stream) noexcept
740 {
741   PetscFunctionBegin;
742   PetscCall(make_block_(chunk_size_, stream));
743   PetscFunctionReturn(PETSC_SUCCESS);
744 }
745 
746 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
747 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::finalize_() noexcept
748 {
749   PetscFunctionBegin;
750   PetscCallCXX(pool_.clear());
751   chunk_size_ = DefaultChunkSize;
752   PetscFunctionReturn(PETSC_SUCCESS);
753 }
754 
755 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
756 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::allocate_(size_type size, value_type **ptr, const stream_type *stream) noexcept
757 {
758   auto found = false;
759 
760   PetscFunctionBegin;
761   PetscCall(this->register_finalize(stream));
762   for (auto &block : pool_) {
763     PetscCall(block.try_allocate_chunk(size, ptr, stream, &found));
764     if (PetscLikely(found)) PetscFunctionReturn(PETSC_SUCCESS);
765   }
766 
767   PetscCall(PetscInfo(nullptr, "Could not find an open block in the pool (%zu blocks) (requested size %zu), allocating new block\n", pool_.size(), size));
768   // if we are here we couldn't find an open block in the pool, so make a new block
769   PetscCall(make_block_(size, stream));
770   // and assign it
771   PetscCall(pool_.back().try_allocate_chunk(size, ptr, stream, &found));
772   PetscAssert(found, PETSC_COMM_SELF, PETSC_ERR_MEM, "Failed to get a suitable memory chunk (of size %zu) from newly allocated memory block (size %zu)", size, pool_.back().size());
773   PetscFunctionReturn(PETSC_SUCCESS);
774 }
775 
776 // ==========================================================================================
777 // SegmentedMemoryPool - Public API
778 // ==========================================================================================
779 
780 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
781 inline SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::SegmentedMemoryPool(AllocType alloc, std::size_t size) noexcept(std::is_nothrow_default_constructible<pool_type>::value) : allocator_(std::move(alloc)), chunk_size_(size)
782 {
783 }
784 
785 /*
786   SegmentedMemoryPool::allocate - get an allocation from the memory pool
787 
788   Input Parameters:
789 + req_size - size (in elements) to get
790 . ptr      - the pointer to hold the allocation
791 - stream   - the stream on which to get the allocation
792 
793   Output Parameter:
794 . ptr - the pointer holding the allocation
795 
796   Notes:
797   req_size cannot be negative. If req_size if zero, ptr is set to nullptr
798 */
799 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
800 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::allocate(PetscInt req_size, value_type **ptr, const stream_type *stream, size_type alignment) noexcept
801 {
802   value_type *ret_ptr = nullptr;
803 
804   PetscFunctionBegin;
805   PetscAssert(req_size >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Requested memory amount (%" PetscInt_FMT ") must be >= 0", req_size);
806   PetscAssertPointer(ptr, 2);
807   PetscAssertPointer(stream, 3);
808   if (req_size) {
809     const auto size         = static_cast<size_type>(req_size);
810     auto       aligned_size = alignment == alignof(char) ? size : size + alignment;
811     void      *vptr         = nullptr;
812 
813     PetscCall(allocate_(aligned_size, &ret_ptr, stream));
814     vptr = ret_ptr;
815     std::align(alignment, size, vptr, aligned_size);
816     ret_ptr = reinterpret_cast<value_type *>(vptr);
817     // sets memory to NaN or infinity depending on the type to catch out uninitialized memory
818     // accesses.
819     if (PetscDefined(USE_DEBUG)) PetscCall(allocator_.set_canary(ret_ptr, size, stream));
820   }
821   *ptr = ret_ptr;
822   PetscFunctionReturn(PETSC_SUCCESS);
823 }
824 
825 /*
826   SegmentedMemoryPool::deallocate - release a pointer back to the memory pool
827 
828   Input Parameters:
829 + ptr    - the pointer to release
830 - stream - the stream to release it on
831 
832   Notes:
833   If ptr is not owned by the pool it is unchanged.
834 */
835 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
836 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::deallocate(value_type **ptr, const stream_type *stream) noexcept
837 {
838   PetscFunctionBegin;
839   PetscAssertPointer(ptr, 1);
840   PetscAssertPointer(stream, 2);
841   // nobody owns a nullptr, and if they do then they have bigger problems
842   if (!*ptr) PetscFunctionReturn(PETSC_SUCCESS);
843   for (auto &block : pool_) {
844     auto found = false;
845 
846     PetscCall(block.try_deallocate_chunk(ptr, stream, &found));
847     if (PetscLikely(found)) break;
848   }
849   PetscFunctionReturn(PETSC_SUCCESS);
850 }
851 
852 /*
853   SegmentedMemoryPool::reallocate - Resize an allocated buffer
854 
855   Input Parameters:
856 + new_req_size - the new buffer size
857 . ptr          - pointer to the buffer
858 - stream       - stream to resize with
859 
860   Output Parameter:
861 . ptr - pointer to the new region
862 
863   Notes:
864   ptr must have been allocated by the pool.
865 
866   It's OK to shrink the buffer, even down to 0 (in which case it is just deallocated).
867 */
868 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
869 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::reallocate(PetscInt new_req_size, value_type **ptr, const stream_type *stream) noexcept
870 {
871   using chunk_type = typename block_type::chunk_type;
872 
873   const auto  new_size = static_cast<size_type>(new_req_size);
874   const auto  old_ptr  = *ptr;
875   chunk_type *chunk    = nullptr;
876 
877   PetscFunctionBegin;
878   PetscAssert(new_req_size >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Requested memory amount (%" PetscInt_FMT ") must be >= 0", new_req_size);
879   PetscAssertPointer(ptr, 2);
880   PetscAssertPointer(stream, 3);
881 
882   // if reallocating to zero, just free
883   if (PetscUnlikely(new_size == 0)) {
884     PetscCall(deallocate(ptr, stream));
885     PetscFunctionReturn(PETSC_SUCCESS);
886   }
887 
888   // search the blocks for the owning chunk
889   for (auto &block : pool_) {
890     PetscCall(block.try_find_chunk(old_ptr, &chunk));
891     if (chunk) break; // found
892   }
893   PetscAssert(chunk, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Memory pool does not own %p, so cannot reallocate it", *ptr);
894 
895   if (chunk->capacity() < new_size) {
896     // chunk does not have enough room, need to grab a fresh chunk and copy to it
897     *ptr = nullptr;
898     PetscCall(chunk->release(stream));
899     PetscCall(allocate(new_size, ptr, stream));
900     PetscCall(allocator_.uninitialized_copy(*ptr, old_ptr, new_size, stream));
901   } else {
902     // chunk had enough room we can simply grow (or shrink) to fit the new size
903     PetscCall(chunk->resize(new_size));
904   }
905   PetscFunctionReturn(PETSC_SUCCESS);
906 }
907 
908 } // namespace memory
909 
910 } // namespace Petsc
911 
912 #endif // PETSC_SEGMENTEDMEMPOOL_HPP
913