xref: /petsc/src/sys/objects/device/impls/segmentedmempool.hpp (revision 750b007cd8d816cecd9de99077bb0a703b4cf61a)
1 #ifndef PETSC_SEGMENTEDMEMPOOL_HPP
2 #define PETSC_SEGMENTEDMEMPOOL_HPP
3 
4 #include <petsc/private/deviceimpl.h>
5 
6 #include <petsc/private/cpp/macros.hpp>
7 #include <petsc/private/cpp/type_traits.hpp>
8 #include <petsc/private/cpp/utility.hpp>
9 #include <petsc/private/cpp/register_finalize.hpp>
10 #include <petsc/private/cpp/memory.hpp>
11 
12 #include <limits>
13 #include <deque>
14 #include <vector>
15 
16 namespace Petsc {
17 
18 namespace device {
19 
20 template <typename T>
21 class StreamBase {
22 public:
23   using id_type      = int;
24   using derived_type = T;
25 
26   static const id_type INVALID_ID;
27 
28   // needed so that dependent auto works, see veccupmimpl.h for a detailed discussion
29   template <typename U = T>
30   PETSC_NODISCARD auto get_stream() const noexcept PETSC_DECLTYPE_AUTO_RETURNS(static_cast<const U &>(*this).get_stream_());
31 
32   PETSC_NODISCARD id_type get_id() const noexcept { return static_cast<const T &>(*this).get_id_(); }
33 
34   template <typename E>
35   PETSC_NODISCARD PetscErrorCode record_event(E &&event) const noexcept {
36     return static_cast<const T &>(*this).record_event_(std::forward<E>(event));
37   }
38 
39   template <typename E>
40   PETSC_NODISCARD PetscErrorCode wait_for_event(E &&event) const noexcept {
41     return static_cast<const T &>(*this).wait_for_(std::forward<E>(event));
42   }
43 
44 protected:
45   constexpr StreamBase() noexcept = default;
46 
47   struct default_event_type { };
48   using default_stream_type = std::nullptr_t;
49 
50   PETSC_NODISCARD static constexpr default_stream_type get_stream_() noexcept { return nullptr; }
51 
52   PETSC_NODISCARD static constexpr id_type get_id_() noexcept { return 0; }
53 
54   template <typename U = T>
55   PETSC_NODISCARD static constexpr PetscErrorCode record_event_(const typename U::event_type &) noexcept {
56     return 0;
57   }
58 
59   template <typename U = T>
60   PETSC_NODISCARD static constexpr PetscErrorCode wait_for_(const typename U::event_type &) noexcept {
61     return 0;
62   }
63 };
64 
65 template <typename T>
66 const typename StreamBase<T>::id_type StreamBase<T>::INVALID_ID = -1;
67 
68 struct DefaultStream : StreamBase<DefaultStream> {
69   using stream_type = typename StreamBase<DefaultStream>::default_stream_type;
70   using id_type     = typename StreamBase<DefaultStream>::id_type;
71   using event_type  = typename StreamBase<DefaultStream>::default_event_type;
72 };
73 
74 } // namespace device
75 
76 namespace memory {
77 
78 namespace impl {
79 
80 // ==========================================================================================
81 // MemoryChunk
82 //
83 // Represents a checked-out region of a MemoryBlock. Tracks the offset into the owning
84 // MemoryBlock and its size/capacity
85 // ==========================================================================================
86 
87 template <typename EventType>
88 class MemoryChunk {
89 public:
90   using event_type = EventType;
91   using size_type  = std::size_t;
92 
93   MemoryChunk(size_type, size_type) noexcept;
94   explicit MemoryChunk(size_type) noexcept;
95 
96   MemoryChunk(MemoryChunk &&) noexcept;
97   MemoryChunk &operator=(MemoryChunk &&) noexcept;
98 
99   MemoryChunk(const MemoryChunk &) noexcept            = delete;
100   MemoryChunk &operator=(const MemoryChunk &) noexcept = delete;
101 
102   PETSC_NODISCARD size_type start() const noexcept { return start_; }
103   PETSC_NODISCARD size_type size() const noexcept { return size_; }
104   // REVIEW ME:
105   // make this an actual field, normally each chunk shrinks_to_fit() on begin claimed, but in
106   // theory only the last chunk needs to do this
107   PETSC_NODISCARD size_type capacity() const noexcept { return size_; }
108   PETSC_NODISCARD size_type total_offset() const noexcept { return start() + size(); }
109 
110   template <typename U>
111   PETSC_NODISCARD PetscErrorCode release(const device::StreamBase<U> *) noexcept;
112   template <typename U>
113   PETSC_NODISCARD PetscErrorCode claim(const device::StreamBase<U> *, size_type, bool *, bool = false) noexcept;
114   template <typename U>
115   PETSC_NODISCARD bool           can_claim(const device::StreamBase<U> *, size_type, bool) const noexcept;
116   PETSC_NODISCARD PetscErrorCode resize(size_type) noexcept;
117   PETSC_NODISCARD bool           contains(size_type) const noexcept;
118 
119 private:
120   // clang-format off
121   event_type      event_{};          // event recorded when the chunk was released
122   bool            open_      = true; // is this chunk open?
123   // id of the last stream to use the chunk, populated on release
124   int             stream_id_ = device::DefaultStream::INVALID_ID;
125   size_type       size_      = 0;    // size of the chunk
126   const size_type start_     = 0;    // offset from the start of the owning block
127   // clang-format on
128 
129   template <typename U>
130   PETSC_NODISCARD bool stream_compat_(const device::StreamBase<U> *) const noexcept;
131 };
132 
133 // ==========================================================================================
134 // MemoryChunk - Private API
135 // ==========================================================================================
136 
137 // asks and answers the question: can this stream claim this chunk without serializing?
138 template <typename E>
139 template <typename U>
140 inline bool MemoryChunk<E>::stream_compat_(const device::StreamBase<U> *strm) const noexcept {
141   return (stream_id_ == strm->INVALID_ID) || (stream_id_ == strm->get_id());
142 }
143 
144 // ==========================================================================================
145 // MemoryChunk - Public API
146 // ==========================================================================================
147 
148 template <typename E>
149 inline MemoryChunk<E>::MemoryChunk(size_type start, size_type size) noexcept : size_(size), start_(start) { }
150 
151 template <typename E>
152 inline MemoryChunk<E>::MemoryChunk(size_type size) noexcept : MemoryChunk(0, size) { }
153 
154 template <typename E>
155 inline MemoryChunk<E>::MemoryChunk(MemoryChunk<E> &&other) noexcept :
156   event_(std::move(other.event_)), open_(util::exchange(other.open_, false)), stream_id_(util::exchange(other.stream_id_, device::DefaultStream::INVALID_ID)), size_(util::exchange(other.size_, 0)), start_(std::move(other.start_)) { }
157 
158 template <typename E>
159 inline MemoryChunk<E> &MemoryChunk<E>::operator=(MemoryChunk<E> &&other) noexcept {
160   PetscFunctionBegin;
161   if (this != &other) {
162     event_     = std::move(other.event_);
163     open_      = util::exchange(other.open_, false);
164     stream_id_ = util::exchange(other.stream_id_, device::DefaultStream::INVALID_ID);
165     size_      = util::exchange(other.size_, 0);
166     start_     = std::move(other.start_);
167   }
168   PetscFunctionReturn(*this);
169 }
170 
171 /*
172   MemoryChunk::release - release a chunk on a stream
173 
174   Input Parameter:
175 . stream - the stream to release the chunk with
176 
177   Notes:
178   Inserts a release operation on stream and records the state of stream at the time this
179   routine was called.
180 
181   Future allocation requests which attempt to claim the chunk on the same stream may re-acquire
182   the chunk without serialization.
183 
184   If another stream attempts to claim the chunk they must wait for the recorded event before
185   claiming the chunk.
186 */
187 template <typename E>
188 template <typename U>
189 inline PetscErrorCode MemoryChunk<E>::release(const device::StreamBase<U> *stream) noexcept {
190   PetscFunctionBegin;
191   open_      = true;
192   stream_id_ = stream->get_id();
193   PetscCall(stream->record_event(event_));
194   PetscFunctionReturn(0);
195 }
196 
197 /*
198   MemoryChunk::claim - attempt to claim a particular chunk
199 
200   Input Parameters:
201 + stream    - the stream on which to attempt to claim
202 . req_size  - the requested size (in elements) to attempt to claim
203 - serialize - (optional, false) whether the claimant allows serialization
204 
205   Output Parameter:
206 . success - true if the chunk was claimed, false otherwise
207 */
208 template <typename E>
209 template <typename U>
210 inline PetscErrorCode MemoryChunk<E>::claim(const device::StreamBase<U> *stream, size_type req_size, bool *success, bool serialize) noexcept {
211   PetscFunctionBegin;
212   if ((*success = can_claim(stream, req_size, serialize))) {
213     if (serialize && !stream_compat_(stream)) PetscCall(stream->wait_for_event(event_));
214     PetscCall(resize(req_size));
215     open_ = false;
216   }
217   PetscFunctionReturn(0);
218 }
219 
220 /*
221   MemoryChunk::can_claim - test whether a particular chunk can be claimed
222 
223   Input Parameters:
224 + stream    - the stream on which to attempt to claim
225 . req_size  - the requested size (in elements) to attempt to claim
226 - serialize - whether the claimant allows serialization
227 
228   Output:
229 . [return] - true if the chunk is claimable given the configuration, false otherwise
230 */
231 template <typename E>
232 template <typename U>
233 inline bool MemoryChunk<E>::can_claim(const device::StreamBase<U> *stream, size_type req_size, bool serialize) const noexcept {
234   if (open_ && (req_size <= capacity())) {
235     // fully compatible
236     if (stream_compat_(stream)) return true;
237     // stream wasn't compatible, but could claim if we serialized
238     if (serialize) return true;
239     // incompatible stream and did not want to serialize
240   }
241   return false;
242 }
243 
244 /*
245   MemoryChunk::resize - grow a chunk to new size
246 
247   Input Parameter:
248 . newsize - the new size Requested
249 
250   Notes:
251   newsize cannot be larger than capacity
252 */
253 template <typename E>
254 inline PetscErrorCode MemoryChunk<E>::resize(size_type newsize) noexcept {
255   PetscFunctionBegin;
256   PetscAssert(newsize <= capacity(), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "New size %zu larger than capacity %zu", newsize, capacity());
257   size_ = newsize;
258   PetscFunctionReturn(0);
259 }
260 
261 /*
262   MemoryChunk::contains - query whether a memory chunk contains a particular offset
263 
264   Input Parameters:
265 . offset - The offset from the MemoryBlock start
266 
267   Notes:
268   Returns true if the chunk contains the offset, false otherwise
269 */
270 template <typename E>
271 inline bool MemoryChunk<E>::contains(size_type offset) const noexcept {
272   return (offset >= start()) && (offset < total_offset());
273 }
274 
275 // ==========================================================================================
276 // MemoryBlock
277 //
278 // A "memory block" manager, which owns the pointer to a particular memory range. Retrieving
279 // and restoring a block is thread-safe (so may be used by multiple device streams).
280 // ==========================================================================================
281 
282 template <typename T, typename AllocatorType, typename StreamType>
283 class MemoryBlock {
284 public:
285   using value_type      = T;
286   using allocator_type  = AllocatorType;
287   using stream_type     = StreamType;
288   using event_type      = typename stream_type::event_type;
289   using chunk_type      = MemoryChunk<event_type>;
290   using size_type       = typename chunk_type::size_type;
291   using chunk_list_type = std::vector<chunk_type>;
292 
293   template <typename U>
294   MemoryBlock(allocator_type *, size_type, const device::StreamBase<U> *) noexcept;
295 
296   ~MemoryBlock() noexcept(std::is_nothrow_destructible<chunk_list_type>::value);
297 
298   MemoryBlock(MemoryBlock &&) noexcept;
299   MemoryBlock &operator=(MemoryBlock &&) noexcept;
300 
301   // memory blocks are not copyable
302   MemoryBlock(const MemoryBlock &)            = delete;
303   MemoryBlock &operator=(const MemoryBlock &) = delete;
304 
305   /* --- actual functions --- */
306   PETSC_NODISCARD PetscErrorCode try_allocate_chunk(size_type, T **, const stream_type *, bool *) noexcept;
307   PETSC_NODISCARD PetscErrorCode try_deallocate_chunk(T **, const stream_type *, bool *) noexcept;
308   PETSC_NODISCARD PetscErrorCode try_find_chunk(const T *, chunk_type **) noexcept;
309   PETSC_NODISCARD bool           owns_pointer(const T *) const noexcept;
310 
311   PETSC_NODISCARD size_type size() const noexcept { return size_; }
312   PETSC_NODISCARD size_type bytes() const noexcept { return sizeof(value_type) * size(); }
313   PETSC_NODISCARD size_type num_chunks() const noexcept { return chunks_.size(); }
314 
315 private:
316   value_type     *mem_{};
317   allocator_type *allocator_{};
318   size_type       size_{};
319   chunk_list_type chunks_{};
320 
321   PETSC_NODISCARD PetscErrorCode clear_(const stream_type *) noexcept;
322 };
323 
324 // ==========================================================================================
325 // MemoryBlock - Private API
326 // ==========================================================================================
327 
328 // clear the memory block, called from destructors and move assignment/construction
329 template <typename T, typename A, typename S>
330 PETSC_NODISCARD PetscErrorCode MemoryBlock<T, A, S>::clear_(const stream_type *stream) noexcept {
331   PetscFunctionBegin;
332   if (PetscLikely(mem_)) {
333     PetscCall(allocator_->deallocate(mem_, stream));
334     mem_ = nullptr;
335   }
336   size_ = 0;
337   PetscCallCXX(chunks_.clear());
338   PetscFunctionReturn(0);
339 }
340 
341 // ==========================================================================================
342 // MemoryBlock - Public API
343 // ==========================================================================================
344 
345 // default constructor, allocates memory immediately
346 template <typename T, typename A, typename S>
347 template <typename U>
348 MemoryBlock<T, A, S>::MemoryBlock(allocator_type *alloc, size_type s, const device::StreamBase<U> *stream) noexcept : allocator_(alloc), size_(s) {
349   PetscFunctionBegin;
350   PetscCallAbort(PETSC_COMM_SELF, alloc->allocate(&mem_, s, stream));
351   PetscAssertAbort(mem_, PETSC_COMM_SELF, PETSC_ERR_MEM, "Failed to allocate memory block of size %zu", s);
352   PetscFunctionReturnVoid();
353 }
354 
355 template <typename T, typename A, typename S>
356 MemoryBlock<T, A, S>::~MemoryBlock() noexcept(std::is_nothrow_destructible<chunk_list_type>::value) {
357   stream_type stream;
358 
359   PetscFunctionBegin;
360   PetscCallAbort(PETSC_COMM_SELF, clear_(&stream));
361   PetscFunctionReturnVoid();
362 }
363 
364 template <typename T, typename A, typename S>
365 MemoryBlock<T, A, S>::MemoryBlock(MemoryBlock &&other) noexcept : mem_(util::exchange(other.mem_, nullptr)), allocator_(other.allocator_), size_(util::exchange(other.size_, 0)), chunks_(std::move(other.chunks_)) { }
366 
367 template <typename T, typename A, typename S>
368 MemoryBlock<T, A, S> &MemoryBlock<T, A, S>::operator=(MemoryBlock &&other) noexcept {
369   PetscFunctionBegin;
370   if (this != &other) {
371     stream_type stream;
372 
373     PetscCallAbort(PETSC_COMM_SELF, clear_(&stream));
374     mem_       = util::exchange(other.mem_, nullptr);
375     allocator_ = other.allocator_;
376     size_      = util::exchange(other.size_, 0);
377     chunks_    = std::move(other.chunks_);
378   }
379   PetscFunctionReturn(*this);
380 }
381 
382 /*
383   MemoryBock::owns_pointer - returns true if this block owns a pointer, false otherwise
384 */
385 template <typename T, typename A, typename S>
386 inline bool MemoryBlock<T, A, S>::owns_pointer(const T *ptr) const noexcept {
387   // each pool is linear in memory, so it suffices to check the bounds
388   return (ptr >= mem_) && (ptr < std::next(mem_, size()));
389 }
390 
391 /*
392   MemoryBlock::try_allocate_chunk - try to get a chunk from this MemoryBlock
393 
394   Input Parameters:
395 + req_size - the requested size of the allocation (in elements)
396 . ptr      - ptr to fill
397 - stream   - stream to fill the pointer on
398 
399   Output Parameter:
400 . success  - true if chunk was gotten, false otherwise
401 
402   Notes:
403   If the current memory could not satisfy the memory request, ptr is unchanged
404 */
405 template <typename T, typename A, typename S>
406 inline PetscErrorCode MemoryBlock<T, A, S>::try_allocate_chunk(size_type req_size, T **ptr, const stream_type *stream, bool *success) noexcept {
407   PetscFunctionBegin;
408   *success = false;
409   if (req_size <= size()) {
410     const auto try_create_chunk = [&]() {
411       const auto was_empty     = chunks_.empty();
412       const auto block_alloced = was_empty ? 0 : chunks_.back().total_offset();
413 
414       PetscFunctionBegin;
415       if (block_alloced + req_size <= size()) {
416         PetscCallCXX(chunks_.emplace_back(block_alloced, req_size));
417         PetscCall(chunks_.back().claim(stream, req_size, success));
418         *ptr = mem_ + block_alloced;
419         if (was_empty) PetscAssert(*success, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Failed to claim chunk (of size %zu) even though block (of size %zu) was empty!", req_size, size());
420       }
421       PetscFunctionReturn(0);
422     };
423     const auto try_find_open_chunk = [&](bool serialize = false) {
424       PetscFunctionBegin;
425       for (auto &chunk : chunks_) {
426         PetscCall(chunk.claim(stream, req_size, success, serialize));
427         if (*success) {
428           *ptr = mem_ + chunk.start();
429           break;
430         }
431       }
432       PetscFunctionReturn(0);
433     };
434     const auto try_steal_other_stream_chunk = [&]() {
435       PetscFunctionBegin;
436       PetscCall(try_find_open_chunk(true));
437       PetscFunctionReturn(0);
438     };
439 
440     // search previously distributed chunks, but only claim one if it is on the same stream
441     // as us
442     PetscCall(try_find_open_chunk());
443 
444     // if we are here we couldn't reuse one of our own chunks so check first if the pool
445     // has room for a new one
446     if (!*success) PetscCall(try_create_chunk());
447 
448     // try pruning dead chunks off the back, note we do this regardless of whether we are
449     // successful
450     while (chunks_.back().can_claim(stream, 0, false)) {
451       PetscCallCXX(chunks_.pop_back());
452       if (chunks_.empty()) {
453         // if chunks are empty it implies we have managed to claim (and subsequently destroy)
454         // our own chunk twice! something has gone wrong
455         PetscAssert(!*success, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Successfully claimed a chunk (of size %zu, from block of size %zu) but have now managed to claim it for a second time (and destroyed it)!", req_size, size());
456         break;
457       }
458     }
459 
460     // if previously unsuccessful see if enough space has opened up due to pruning. note that
461     // if the chunk list was emptied from the pruning this call must succeed in allocating a
462     // chunk, otherwise something is wrong
463     if (!*success) PetscCall(try_create_chunk());
464 
465     // last resort, iterate over all chunks and see if we can steal one by waiting on the
466     // current owner to finish using it
467     if (!*success) PetscCall(try_steal_other_stream_chunk());
468   }
469   PetscFunctionReturn(0);
470 }
471 
472 /*
473   MemoryBlock::try_deallocate_chunk - try to restore a chunk to this MemoryBlock
474 
475   Input Parameters:
476 + ptr     - ptr to restore
477 - stream  - stream to restore the pointer on
478 
479   Output Parameter:
480 . success - true if chunk was restored, false otherwise
481 
482   Notes:
483   ptr is set to nullptr on successful restore, and is unchanged otherwise. If the ptr is owned
484   by this MemoryBlock then it is restored on stream. The same stream may recieve ptr again
485   without synchronization, but other streams may not do so until either serializing or the
486   stream is idle again.
487 */
488 template <typename T, typename A, typename S>
489 inline PetscErrorCode MemoryBlock<T, A, S>::try_deallocate_chunk(T **ptr, const stream_type *stream, bool *success) noexcept {
490   chunk_type *chunk = nullptr;
491 
492   PetscFunctionBegin;
493   PetscCall(try_find_chunk(*ptr, &chunk));
494   if (chunk) {
495     PetscCall(chunk->release(stream));
496     *ptr     = nullptr;
497     *success = true;
498   } else {
499     *success = false;
500   }
501   PetscFunctionReturn(0);
502 }
503 
504 /*
505   MemoryBlock::try_find_chunk - try to find the chunk which owns ptr
506 
507   Input Parameter:
508 . ptr - the pointer to lookk for
509 
510   Output Parameter:
511 . ret_chunk - pointer to the owning chunk or nullptr if not found
512 */
513 template <typename T, typename A, typename S>
514 inline PetscErrorCode MemoryBlock<T, A, S>::try_find_chunk(const T *ptr, chunk_type **ret_chunk) noexcept {
515   PetscFunctionBegin;
516   *ret_chunk = nullptr;
517   if (owns_pointer(ptr)) {
518     const auto offset = static_cast<size_type>(ptr - mem_);
519 
520     for (auto &chunk : chunks_) {
521       if (chunk.contains(offset)) {
522         *ret_chunk = &chunk;
523         break;
524       }
525     }
526 
527     PetscAssert(*ret_chunk, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Failed to find %zu in block, even though it is within block range [%zu, %zu)", reinterpret_cast<uintptr_t>(ptr), reinterpret_cast<uintptr_t>(mem_), reinterpret_cast<uintptr_t>(std::next(mem_, size())));
528   }
529   PetscFunctionReturn(0);
530 }
531 
532 namespace detail {
533 
534 template <typename T>
535 struct real_type {
536   using type = T;
537 };
538 
539 template <>
540 struct real_type<PetscScalar> {
541   using type = PetscReal;
542 };
543 
544 } // namespace detail
545 
546 template <typename T>
547 struct SegmentedMemoryPoolAllocatorBase {
548   using value_type      = T;
549   using size_type       = std::size_t;
550   using real_value_type = typename detail::real_type<T>::type;
551 
552   template <typename U>
553   PETSC_NODISCARD static PetscErrorCode allocate(value_type **, size_type, const device::StreamBase<U> *) noexcept;
554   template <typename U>
555   PETSC_NODISCARD static PetscErrorCode deallocate(value_type *, const device::StreamBase<U> *) noexcept;
556   template <typename U>
557   PETSC_NODISCARD static PetscErrorCode zero(value_type *, size_type, const device::StreamBase<U> *) noexcept;
558   template <typename U>
559   PETSC_NODISCARD static PetscErrorCode uninitialized_copy(value_type *, const value_type *, size_type, const device::StreamBase<U> *) noexcept;
560   template <typename U>
561   PETSC_NODISCARD static PetscErrorCode set_canary(value_type *, size_type, const device::StreamBase<U> *) noexcept;
562 };
563 
564 template <typename T>
565 template <typename U>
566 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::allocate(value_type **ptr, size_type n, const device::StreamBase<U> *) noexcept {
567   PetscFunctionBegin;
568   PetscCall(PetscMalloc1(n, ptr));
569   PetscFunctionReturn(0);
570 }
571 
572 template <typename T>
573 template <typename U>
574 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::deallocate(value_type *ptr, const device::StreamBase<U> *) noexcept {
575   PetscFunctionBegin;
576   PetscCall(PetscFree(ptr));
577   PetscFunctionReturn(0);
578 }
579 
580 template <typename T>
581 template <typename U>
582 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::zero(value_type *ptr, size_type n, const device::StreamBase<U> *) noexcept {
583   PetscFunctionBegin;
584   PetscCall(PetscArrayzero(ptr, n));
585   PetscFunctionReturn(0);
586 }
587 
588 template <typename T>
589 template <typename U>
590 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::uninitialized_copy(value_type *dest, const value_type *src, size_type n, const device::StreamBase<U> *) noexcept {
591   PetscFunctionBegin;
592   PetscCall(PetscArraycpy(dest, src, n));
593   PetscFunctionReturn(0);
594 }
595 
596 template <typename T>
597 template <typename U>
598 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::set_canary(value_type *ptr, size_type n, const device::StreamBase<U> *) noexcept {
599   using limit_type            = std::numeric_limits<real_value_type>;
600   constexpr value_type canary = limit_type::has_signaling_NaN ? limit_type::signaling_NaN() : limit_type::max();
601 
602   PetscFunctionBegin;
603   for (size_type i = 0; i < n; ++i) ptr[i] = canary;
604   PetscFunctionReturn(0);
605 }
606 
607 } // namespace impl
608 
609 // ==========================================================================================
610 // SegmentedMemoryPool
611 //
612 // Stream-aware async memory allocator. Holds a list of memory "blocks" which each control an
613 // allocated buffer. This buffer is further split into memory "chunks" which control
614 // consecutive, non-overlapping regions of the block. Chunks may be in 1 of 2 states:
615 //
616 // 1. Open:
617 //    The chunk is free to be claimed by the next suitable allocation request. If the
618 //    allocation request is made on the same stream as the chunk was deallocated on, no
619 //    serialization needs to occur. If not, the allocating stream must wait for the
620 //    event. Claiming the chunk "closes" the chunk.
621 //
622 // 2. Closed:
623 //    The chunk has been claimed by an allocation request. It cannot be opened again until it
624 //    is deallocated; doing so "opens" the chunk.
625 //
626 // Note that there does not need to be a chunk for every region, chunks are created to satisfy
627 // an allocation request.
628 //
629 // Thus there is usually a region of "unallocated" memory at the end of the buffer, which may
630 // be claimed by a newly created chunk if existing chunks cannot satisfy the allocation
631 // request. This region exists _only_ at the end, as there are no gaps between chunks.
632 //
633 //
634 // |-----------------------------------------------------------------------------------------
635 // | SegmentedMemoryPool
636 // |
637 // | ||-------------||
638 // | ||             ||    -------------------------------------------------------------------
639 // | ||             ||    | AAAAAAAAAAAAAABBBBBBBCCCCCCCCCCCCCCCCCCCCDDDDDDDDDDDDDXXXXXXXX...
640 // | ||             ||    | |             |      |                   |            |
641 // | ||             ||    | x-----x-------x-----xx---------x---------x------x-----x
642 // | || MemoryBlock || -> | ------|-------------|----------|----------------|--------
643 // | ||             ||    | | MemoryChunk | MemoryChunk | MemoryChunk | MemoryChunk |
644 // | ||             ||    | ---------------------------------------------------------
645 // | ||             ||    -------------------------------------------------------------------
646 // | ||-------------||
647 // | ||             ||
648 // | ||     ...     ||
649 // | ||             ||
650 // ==========================================================================================
651 
652 template <typename MemType, typename StreamType = device::DefaultStream, typename AllocType = impl::SegmentedMemoryPoolAllocatorBase<MemType>, std::size_t DefaultChunkSize = 256>
653 class SegmentedMemoryPool;
654 
655 // The actual memory pool class. It is in essence just a wrapper for a list of MemoryBlocks.
656 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
657 class SegmentedMemoryPool : public RegisterFinalizeable<SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>> {
658 public:
659   using value_type     = MemType;
660   using stream_type    = StreamType;
661   using allocator_type = AllocType;
662   using block_type     = impl::MemoryBlock<value_type, allocator_type, stream_type>;
663   using pool_type      = std::deque<block_type>;
664   using size_type      = typename block_type::size_type;
665 
666   explicit SegmentedMemoryPool(AllocType = AllocType{}, std::size_t = DefaultChunkSize) noexcept(std::is_nothrow_default_constructible<pool_type>::value);
667 
668   PETSC_NODISCARD PetscErrorCode allocate(PetscInt, value_type **, const stream_type *, size_type = std::alignment_of<MemType>::value) noexcept;
669   PETSC_NODISCARD PetscErrorCode deallocate(value_type **, const stream_type *) noexcept;
670   PETSC_NODISCARD PetscErrorCode reallocate(PetscInt, value_type **, const stream_type *) noexcept;
671 
672 private:
673   pool_type      pool_;
674   allocator_type allocator_;
675   size_type      chunk_size_;
676 
677   PETSC_NODISCARD PetscErrorCode make_block_(size_type, const stream_type *) noexcept;
678 
679   friend class RegisterFinalizeable<SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>>;
680   PETSC_NODISCARD PetscErrorCode register_finalize_(const stream_type *) noexcept;
681   PETSC_NODISCARD PetscErrorCode finalize_() noexcept;
682 
683   PETSC_NODISCARD PetscErrorCode allocate_(size_type, value_type **, const stream_type *) noexcept;
684 };
685 
686 // ==========================================================================================
687 // SegmentedMemoryPool - Private API
688 // ==========================================================================================
689 
690 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
691 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::make_block_(size_type size, const stream_type *stream) noexcept {
692   const auto block_size = std::max(size, chunk_size_);
693 
694   PetscFunctionBegin;
695   PetscCallCXX(pool_.emplace_back(&allocator_, block_size, stream));
696   PetscCall(PetscInfo(nullptr, "Allocated new block of size %zu, total %zu blocks\n", block_size, pool_.size()));
697   PetscFunctionReturn(0);
698 }
699 
700 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
701 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::register_finalize_(const stream_type *stream) noexcept {
702   PetscFunctionBegin;
703   PetscCall(make_block_(chunk_size_, stream));
704   PetscFunctionReturn(0);
705 }
706 
707 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
708 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::finalize_() noexcept {
709   PetscFunctionBegin;
710   PetscCallCXX(pool_.clear());
711   chunk_size_ = DefaultChunkSize;
712   PetscFunctionReturn(0);
713 }
714 
715 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
716 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::allocate_(size_type size, value_type **ptr, const stream_type *stream) noexcept {
717   auto found = false;
718 
719   PetscFunctionBegin;
720   PetscCall(this->register_finalize(PETSC_COMM_SELF, stream));
721   for (auto &block : pool_) {
722     PetscCall(block.try_allocate_chunk(size, ptr, stream, &found));
723     if (PetscLikely(found)) PetscFunctionReturn(0);
724   }
725 
726   PetscCall(PetscInfo(nullptr, "Could not find an open block in the pool (%zu blocks) (requested size %zu), allocating new block\n", pool_.size(), size));
727   // if we are here we couldn't find an open block in the pool, so make a new block
728   PetscCall(make_block_(size, stream));
729   // and assign it
730   PetscCall(pool_.back().try_allocate_chunk(size, ptr, stream, &found));
731   PetscAssert(found, PETSC_COMM_SELF, PETSC_ERR_MEM, "Failed to get a suitable memory chunk (of size %zu) from newly allocated memory block (size %zu)", size, pool_.back().size());
732   PetscFunctionReturn(0);
733 }
734 
735 // ==========================================================================================
736 // SegmentedMemoryPool - Public API
737 // ==========================================================================================
738 
739 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
740 inline SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::SegmentedMemoryPool(AllocType alloc, std::size_t size) noexcept(std::is_nothrow_default_constructible<pool_type>::value) :
741   allocator_(std::move(alloc)), chunk_size_(size) { }
742 
743 /*
744   SegmentedMemoryPool::allocate - get an allocation from the memory pool
745 
746   Input Parameters:
747 + req_size - size (in elements) to get
748 . ptr      - the pointer to hold the allocation
749 - stream   - the stream on which to get the allocation
750 
751   Output Parameter:
752 . ptr - the pointer holding the allocation
753 
754   Notes:
755   req_size cannot be negative. If req_size if zero, ptr is set to nullptr
756 */
757 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
758 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::allocate(PetscInt req_size, value_type **ptr, const stream_type *stream, size_type alignment) noexcept {
759   value_type *ret_ptr = nullptr;
760 
761   PetscFunctionBegin;
762   PetscAssert(req_size >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Requested memory amount (%" PetscInt_FMT ") must be >= 0", req_size);
763   PetscValidPointer(ptr, 2);
764   PetscValidPointer(stream, 3);
765   if (req_size) {
766     const auto size         = static_cast<size_type>(req_size);
767     auto       aligned_size = alignment == alignof(char) ? size : size + alignment;
768     void      *vptr         = nullptr;
769 
770     PetscCall(allocate_(aligned_size, &ret_ptr, stream));
771     vptr = ret_ptr;
772     std::align(alignment, size, vptr, aligned_size);
773     ret_ptr = reinterpret_cast<value_type *>(vptr);
774     // sets memory to NaN or infinity depending on the type to catch out uninitialized memory
775     // accesses.
776     if (PetscDefined(USE_DEBUG)) PetscCall(allocator_.set_canary(ret_ptr, size, stream));
777   }
778   *ptr = ret_ptr;
779   PetscFunctionReturn(0);
780 }
781 
782 /*
783   SegmentedMemoryPool::deallocate - release a pointer back to the memory pool
784 
785   Input Parameters:
786 + ptr    - the pointer to release
787 - stream - the stream to release it on
788 
789   Notes:
790   If ptr is not owned by the pool it is unchanged.
791 */
792 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
793 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::deallocate(value_type **ptr, const stream_type *stream) noexcept {
794   PetscFunctionBegin;
795   PetscValidPointer(ptr, 1);
796   PetscValidPointer(stream, 2);
797   // nobody owns a nullptr, and if they do then they have bigger problems
798   if (!*ptr) PetscFunctionReturn(0);
799   for (auto &block : pool_) {
800     auto found = false;
801 
802     PetscCall(block.try_deallocate_chunk(ptr, stream, &found));
803     if (PetscLikely(found)) break;
804   }
805   PetscFunctionReturn(0);
806 }
807 
808 /*
809   SegmentedMemoryPool::reallocate - Resize an allocated buffer
810 
811   Input Parameters:
812 + new_req_size - the new buffer size
813 . ptr          - pointer to the buffer
814 - stream       - stream to resize with
815 
816   Ouput Parameter:
817 . ptr - pointer to the new region
818 
819   Notes:
820   ptr must have been allocated by the pool.
821 
822   It's OK to shrink the buffer, even down to 0 (in which case it is just deallocated).
823 */
824 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
825 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::reallocate(PetscInt new_req_size, value_type **ptr, const stream_type *stream) noexcept {
826   using chunk_type = typename block_type::chunk_type;
827 
828   const auto  new_size = static_cast<size_type>(new_req_size);
829   const auto  old_ptr  = *ptr;
830   chunk_type *chunk    = nullptr;
831 
832   PetscFunctionBegin;
833   PetscAssert(new_req_size >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Requested memory amount (%" PetscInt_FMT ") must be >= 0", new_req_size);
834   PetscValidPointer(ptr, 2);
835   PetscValidPointer(stream, 3);
836 
837   // if reallocating to zero, just free
838   if (PetscUnlikely(new_size == 0)) {
839     PetscCall(deallocate(ptr, stream));
840     PetscFunctionReturn(0);
841   }
842 
843   // search the blocks for the owning chunk
844   for (auto &block : pool_) {
845     PetscCall(block.try_find_chunk(old_ptr, &chunk));
846     if (chunk) break; // found
847   }
848   PetscAssert(chunk, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Memory pool does not own %p, so cannot reallocate it", *ptr);
849 
850   if (chunk->capacity() < new_size) {
851     // chunk does not have enough room, need to grab a fresh chunk and copy to it
852     *ptr = nullptr;
853     PetscCall(chunk->release(stream));
854     PetscCall(allocate(new_size, ptr, stream));
855     PetscCall(allocator_.uninitialized_copy(*ptr, old_ptr, new_size, stream));
856   } else {
857     // chunk had enough room we can simply grow (or shrink) to fit the new size
858     PetscCall(chunk->resize(new_size));
859   }
860   PetscFunctionReturn(0);
861 }
862 
863 } // namespace memory
864 
865 } // namespace Petsc
866 
867 #endif // PETSC_SEGMENTEDMEMPOOL_HPP
868