xref: /petsc/src/sys/objects/device/impls/segmentedmempool.hpp (revision 66af8762ec03dbef0e079729eb2a1734a35ed7ff)
1 #pragma once
2 
3 #include <petsc/private/deviceimpl.h>
4 
5 #include <petsc/private/cpp/macros.hpp>
6 #include <petsc/private/cpp/type_traits.hpp>
7 #include <petsc/private/cpp/utility.hpp>
8 #include <petsc/private/cpp/register_finalize.hpp>
9 #include <petsc/private/cpp/memory.hpp>
10 
11 #include <limits>
12 #include <deque>
13 #include <vector>
14 
15 namespace Petsc
16 {
17 
18 namespace device
19 {
20 
21 template <typename T>
22 class StreamBase {
23 public:
24   using id_type      = int;
25   using derived_type = T;
26 
27   static const id_type INVALID_ID;
28 
29   // needed so that dependent auto works, see veccupmimpl.h for a detailed discussion
30   template <typename U = T>
31   PETSC_NODISCARD auto get_stream() const noexcept PETSC_DECLTYPE_AUTO_RETURNS(static_cast<const U &>(*this).get_stream_());
32 
33   PETSC_NODISCARD id_type get_id() const noexcept { return static_cast<const T &>(*this).get_id_(); }
34 
35   template <typename E>
36   PetscErrorCode record_event(E &&event) const noexcept
37   {
38     return static_cast<const T &>(*this).record_event_(std::forward<E>(event));
39   }
40 
41   template <typename E>
42   PetscErrorCode wait_for_event(E &&event) const noexcept
43   {
44     return static_cast<const T &>(*this).wait_for_(std::forward<E>(event));
45   }
46 
47 protected:
48   constexpr StreamBase() noexcept = default;
49 
50   struct default_event_type { };
51   using default_stream_type = std::nullptr_t;
52 
53   PETSC_NODISCARD static constexpr default_stream_type get_stream_() noexcept { return nullptr; }
54 
55   PETSC_NODISCARD static constexpr id_type get_id_() noexcept { return 0; }
56 
57   template <typename U = T>
58   static constexpr PetscErrorCode record_event_(const typename U::event_type &) noexcept
59   {
60     return PETSC_SUCCESS;
61   }
62 
63   template <typename U = T>
64   static constexpr PetscErrorCode wait_for_(const typename U::event_type &) noexcept
65   {
66     return PETSC_SUCCESS;
67   }
68 };
69 
70 template <typename T>
71 const typename StreamBase<T>::id_type StreamBase<T>::INVALID_ID = -1;
72 
73 struct DefaultStream : StreamBase<DefaultStream> {
74   using stream_type = typename StreamBase<DefaultStream>::default_stream_type;
75   using id_type     = typename StreamBase<DefaultStream>::id_type;
76   using event_type  = typename StreamBase<DefaultStream>::default_event_type;
77 };
78 
79 } // namespace device
80 
81 namespace memory
82 {
83 
84 namespace impl
85 {
86 
87 // ==========================================================================================
88 // MemoryChunk
89 //
90 // Represents a checked-out region of a MemoryBlock. Tracks the offset into the owning
91 // MemoryBlock and its size/capacity
92 // ==========================================================================================
93 
94 template <typename EventType>
95 class MemoryChunk {
96 public:
97   using event_type = EventType;
98   using size_type  = std::size_t;
99 
100   MemoryChunk(size_type, size_type) noexcept;
101   explicit MemoryChunk(size_type) noexcept;
102 
103   MemoryChunk(MemoryChunk &&) noexcept;
104   MemoryChunk &operator=(MemoryChunk &&) noexcept;
105 
106   MemoryChunk(const MemoryChunk &) noexcept            = delete;
107   MemoryChunk &operator=(const MemoryChunk &) noexcept = delete;
108 
109   PETSC_NODISCARD size_type start() const noexcept { return start_; }
110   PETSC_NODISCARD size_type size() const noexcept { return size_; }
111   // REVIEW ME:
112   // make this an actual field, normally each chunk shrinks_to_fit() on begin claimed, but in
113   // theory only the last chunk needs to do this
114   PETSC_NODISCARD size_type capacity() const noexcept { return size_; }
115   PETSC_NODISCARD size_type total_offset() const noexcept { return start() + size(); }
116 
117   template <typename U>
118   PetscErrorCode release(const device::StreamBase<U> *) noexcept;
119   template <typename U>
120   PetscErrorCode claim(const device::StreamBase<U> *, size_type, bool *, bool = false) noexcept;
121   template <typename U>
122   PETSC_NODISCARD bool can_claim(const device::StreamBase<U> *, size_type, bool) const noexcept;
123   PetscErrorCode       resize(size_type) noexcept;
124   PETSC_NODISCARD bool contains(size_type) const noexcept;
125 
126 private:
127   // clang-format off
128   event_type      event_{};          // event recorded when the chunk was released
129   bool            open_      = true; // is this chunk open?
130   // id of the last stream to use the chunk, populated on release
131   int             stream_id_ = device::DefaultStream::INVALID_ID;
132   size_type       size_      = 0;    // size of the chunk
133   const size_type start_     = 0;    // offset from the start of the owning block
134   // clang-format on
135 
136   template <typename U>
137   PETSC_NODISCARD bool stream_compat_(const device::StreamBase<U> *) const noexcept;
138 };
139 
140 // ==========================================================================================
141 // MemoryChunk - Private API
142 // ==========================================================================================
143 
144 // asks and answers the question: can this stream claim this chunk without serializing?
145 template <typename E>
146 template <typename U>
147 inline bool MemoryChunk<E>::stream_compat_(const device::StreamBase<U> *strm) const noexcept
148 {
149   return (stream_id_ == strm->INVALID_ID) || (stream_id_ == strm->get_id());
150 }
151 
152 // ==========================================================================================
153 // MemoryChunk - Public API
154 // ==========================================================================================
155 
156 template <typename E>
157 inline MemoryChunk<E>::MemoryChunk(size_type start, size_type size) noexcept : size_(size), start_(start)
158 {
159 }
160 
161 template <typename E>
162 inline MemoryChunk<E>::MemoryChunk(size_type size) noexcept : MemoryChunk(0, size)
163 {
164 }
165 
166 template <typename E>
167 inline MemoryChunk<E>::MemoryChunk(MemoryChunk<E> &&other) noexcept :
168   event_(std::move(other.event_)), open_(util::exchange(other.open_, false)), stream_id_(util::exchange(other.stream_id_, device::DefaultStream::INVALID_ID)), size_(util::exchange(other.size_, 0)), start_(std::move(other.start_))
169 {
170 }
171 
172 template <typename E>
173 inline MemoryChunk<E> &MemoryChunk<E>::operator=(MemoryChunk<E> &&other) noexcept
174 {
175   PetscFunctionBegin;
176   if (this != &other) {
177     event_     = std::move(other.event_);
178     open_      = util::exchange(other.open_, false);
179     stream_id_ = util::exchange(other.stream_id_, device::DefaultStream::INVALID_ID);
180     size_      = util::exchange(other.size_, 0);
181     start_     = std::move(other.start_);
182   }
183   PetscFunctionReturn(*this);
184 }
185 
186 /*
187   MemoryChunk::release - release a chunk on a stream
188 
189   Input Parameter:
190 . stream - the stream to release the chunk with
191 
192   Notes:
193   Inserts a release operation on stream and records the state of stream at the time this
194   routine was called.
195 
196   Future allocation requests which attempt to claim the chunk on the same stream may re-acquire
197   the chunk without serialization.
198 
199   If another stream attempts to claim the chunk they must wait for the recorded event before
200   claiming the chunk.
201 */
202 template <typename E>
203 template <typename U>
204 inline PetscErrorCode MemoryChunk<E>::release(const device::StreamBase<U> *stream) noexcept
205 {
206   PetscFunctionBegin;
207   open_      = true;
208   stream_id_ = stream->get_id();
209   PetscCall(stream->record_event(event_));
210   PetscFunctionReturn(PETSC_SUCCESS);
211 }
212 
213 /*
214   MemoryChunk::claim - attempt to claim a particular chunk
215 
216   Input Parameters:
217 + stream    - the stream on which to attempt to claim
218 . req_size  - the requested size (in elements) to attempt to claim
219 - serialize - (optional, false) whether the claimant allows serialization
220 
221   Output Parameter:
222 . success - true if the chunk was claimed, false otherwise
223 */
224 template <typename E>
225 template <typename U>
226 inline PetscErrorCode MemoryChunk<E>::claim(const device::StreamBase<U> *stream, size_type req_size, bool *success, bool serialize) noexcept
227 {
228   PetscFunctionBegin;
229   if ((*success = can_claim(stream, req_size, serialize))) {
230     if (serialize && !stream_compat_(stream)) PetscCall(stream->wait_for_event(event_));
231     PetscCall(resize(req_size));
232     open_ = false;
233   }
234   PetscFunctionReturn(PETSC_SUCCESS);
235 }
236 
237 /*
238   MemoryChunk::can_claim - test whether a particular chunk can be claimed
239 
240   Input Parameters:
241 + stream    - the stream on which to attempt to claim
242 . req_size  - the requested size (in elements) to attempt to claim
243 - serialize - whether the claimant allows serialization
244 
245   Output:
246 . [return] - true if the chunk is claimable given the configuration, false otherwise
247 */
248 template <typename E>
249 template <typename U>
250 inline bool MemoryChunk<E>::can_claim(const device::StreamBase<U> *stream, size_type req_size, bool serialize) const noexcept
251 {
252   if (open_ && (req_size <= capacity())) {
253     // fully compatible
254     if (stream_compat_(stream)) return true;
255     // stream wasn't compatible, but could claim if we serialized
256     if (serialize) return true;
257     // incompatible stream and did not want to serialize
258   }
259   return false;
260 }
261 
/*
  MemoryChunk::resize - set the size of a chunk

  Input Parameter:
. newsize - the requested new size

  Notes:
  newsize cannot be larger than capacity. Since capacity() currently reports the size of the
  chunk, a resize can only shrink the chunk or leave it unchanged.
*/
271 template <typename E>
272 inline PetscErrorCode MemoryChunk<E>::resize(size_type newsize) noexcept
273 {
274   PetscFunctionBegin;
275   PetscAssert(newsize <= capacity(), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "New size %zu larger than capacity %zu", newsize, capacity());
276   size_ = newsize;
277   PetscFunctionReturn(PETSC_SUCCESS);
278 }
279 
280 /*
281   MemoryChunk::contains - query whether a memory chunk contains a particular offset
282 
283   Input Parameters:
284 . offset - The offset from the MemoryBlock start
285 
286   Notes:
287   Returns true if the chunk contains the offset, false otherwise
288 */
289 template <typename E>
290 inline bool MemoryChunk<E>::contains(size_type offset) const noexcept
291 {
292   return (offset >= start()) && (offset < total_offset());
293 }
294 
295 // ==========================================================================================
296 // MemoryBlock
297 //
298 // A "memory block" manager, which owns the pointer to a particular memory range. Retrieving
299 // and restoring a block is thread-safe (so may be used by multiple device streams).
300 // ==========================================================================================
301 
302 template <typename T, typename AllocatorType, typename StreamType>
303 class MemoryBlock {
304 public:
305   using value_type      = T;
306   using allocator_type  = AllocatorType;
307   using stream_type     = StreamType;
308   using event_type      = typename stream_type::event_type;
309   using chunk_type      = MemoryChunk<event_type>;
310   using size_type       = typename chunk_type::size_type;
311   using chunk_list_type = std::vector<chunk_type>;
312 
313   template <typename U>
314   MemoryBlock(allocator_type *, size_type, const device::StreamBase<U> *) noexcept;
315 
316   ~MemoryBlock() noexcept(std::is_nothrow_destructible<chunk_list_type>::value);
317 
318   MemoryBlock(MemoryBlock &&) noexcept;
319   MemoryBlock &operator=(MemoryBlock &&) noexcept;
320 
321   // memory blocks are not copyable
322   MemoryBlock(const MemoryBlock &)            = delete;
323   MemoryBlock &operator=(const MemoryBlock &) = delete;
324 
325   /* --- actual functions --- */
326   PetscErrorCode       try_allocate_chunk(size_type, T **, const stream_type *, bool *) noexcept;
327   PetscErrorCode       try_deallocate_chunk(T **, const stream_type *, bool *) noexcept;
328   PetscErrorCode       try_find_chunk(const T *, chunk_type **) noexcept;
329   PETSC_NODISCARD bool owns_pointer(const T *) const noexcept;
330 
331   PETSC_NODISCARD size_type size() const noexcept { return size_; }
332   PETSC_NODISCARD size_type bytes() const noexcept { return sizeof(value_type) * size(); }
333   PETSC_NODISCARD size_type num_chunks() const noexcept { return chunks_.size(); }
334 
335 private:
336   value_type     *mem_{};
337   allocator_type *allocator_{};
338   size_type       size_{};
339   chunk_list_type chunks_{};
340 
341   PetscErrorCode clear_(const stream_type *) noexcept;
342 };
343 
344 // ==========================================================================================
345 // MemoryBlock - Private API
346 // ==========================================================================================
347 
348 // clear the memory block, called from destructors and move assignment/construction
349 template <typename T, typename A, typename S>
350 PetscErrorCode MemoryBlock<T, A, S>::clear_(const stream_type *stream) noexcept
351 {
352   PetscFunctionBegin;
353   if (PetscLikely(mem_)) {
354     PetscCall(allocator_->deallocate(mem_, stream));
355     mem_ = nullptr;
356   }
357   size_ = 0;
358   PetscCallCXX(chunks_.clear());
359   PetscFunctionReturn(PETSC_SUCCESS);
360 }
361 
362 // ==========================================================================================
363 // MemoryBlock - Public API
364 // ==========================================================================================
365 
366 // default constructor, allocates memory immediately
367 template <typename T, typename A, typename S>
368 template <typename U>
369 MemoryBlock<T, A, S>::MemoryBlock(allocator_type *alloc, size_type s, const device::StreamBase<U> *stream) noexcept : allocator_(alloc), size_(s)
370 {
371   PetscFunctionBegin;
372   PetscCallAbort(PETSC_COMM_SELF, alloc->allocate(&mem_, s, stream));
373   PetscAssertAbort(mem_, PETSC_COMM_SELF, PETSC_ERR_MEM, "Failed to allocate memory block of size %zu", s);
374   PetscFunctionReturnVoid();
375 }
376 
377 template <typename T, typename A, typename S>
378 MemoryBlock<T, A, S>::~MemoryBlock() noexcept(std::is_nothrow_destructible<chunk_list_type>::value)
379 {
380   stream_type stream;
381 
382   PetscFunctionBegin;
383   PetscCallAbort(PETSC_COMM_SELF, clear_(&stream));
384   PetscFunctionReturnVoid();
385 }
386 
387 template <typename T, typename A, typename S>
388 MemoryBlock<T, A, S>::MemoryBlock(MemoryBlock &&other) noexcept : mem_(util::exchange(other.mem_, nullptr)), allocator_(other.allocator_), size_(util::exchange(other.size_, 0)), chunks_(std::move(other.chunks_))
389 {
390 }
391 
392 template <typename T, typename A, typename S>
393 MemoryBlock<T, A, S> &MemoryBlock<T, A, S>::operator=(MemoryBlock &&other) noexcept
394 {
395   PetscFunctionBegin;
396   if (this != &other) {
397     stream_type stream;
398 
399     PetscCallAbort(PETSC_COMM_SELF, clear_(&stream));
400     mem_       = util::exchange(other.mem_, nullptr);
401     allocator_ = other.allocator_;
402     size_      = util::exchange(other.size_, 0);
403     chunks_    = std::move(other.chunks_);
404   }
405   PetscFunctionReturn(*this);
406 }
407 
/*
  MemoryBlock::owns_pointer - returns true if this block owns a pointer, false otherwise
*/
411 template <typename T, typename A, typename S>
412 inline bool MemoryBlock<T, A, S>::owns_pointer(const T *ptr) const noexcept
413 {
414   // each pool is linear in memory, so it suffices to check the bounds
415   return (ptr >= mem_) && (ptr < std::next(mem_, size()));
416 }
417 
418 /*
419   MemoryBlock::try_allocate_chunk - try to get a chunk from this MemoryBlock
420 
421   Input Parameters:
422 + req_size - the requested size of the allocation (in elements)
423 . ptr      - ptr to fill
424 - stream   - stream to fill the pointer on
425 
426   Output Parameter:
427 . success  - true if chunk was gotten, false otherwise
428 
429   Notes:
430   If the current memory could not satisfy the memory request, ptr is unchanged
431 */
432 template <typename T, typename A, typename S>
433 inline PetscErrorCode MemoryBlock<T, A, S>::try_allocate_chunk(size_type req_size, T **ptr, const stream_type *stream, bool *success) noexcept
434 {
435   PetscFunctionBegin;
436   *success = false;
437   if (req_size <= size()) {
438     const auto try_create_chunk = [&]() {
439       const auto was_empty     = chunks_.empty();
440       const auto block_alloced = was_empty ? 0 : chunks_.back().total_offset();
441 
442       PetscFunctionBegin;
443       if (block_alloced + req_size <= size()) {
444         PetscCallCXX(chunks_.emplace_back(block_alloced, req_size));
445         PetscCall(chunks_.back().claim(stream, req_size, success));
446         *ptr = mem_ + block_alloced;
447         if (was_empty) PetscAssert(*success, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Failed to claim chunk (of size %zu) even though block (of size %zu) was empty!", req_size, size());
448       }
449       PetscFunctionReturn(PETSC_SUCCESS);
450     };
451     const auto try_find_open_chunk = [&](bool serialize = false) {
452       PetscFunctionBegin;
453       for (auto &chunk : chunks_) {
454         PetscCall(chunk.claim(stream, req_size, success, serialize));
455         if (*success) {
456           *ptr = mem_ + chunk.start();
457           break;
458         }
459       }
460       PetscFunctionReturn(PETSC_SUCCESS);
461     };
462     const auto try_steal_other_stream_chunk = [&]() {
463       PetscFunctionBegin;
464       PetscCall(try_find_open_chunk(true));
465       PetscFunctionReturn(PETSC_SUCCESS);
466     };
467 
468     // search previously distributed chunks, but only claim one if it is on the same stream
469     // as us
470     PetscCall(try_find_open_chunk());
471 
472     // if we are here we couldn't reuse one of our own chunks so check first if the pool
473     // has room for a new one
474     if (!*success) PetscCall(try_create_chunk());
475 
476     // try pruning dead chunks off the back, note we do this regardless of whether we are
477     // successful
478     while (chunks_.back().can_claim(stream, 0, false)) {
479       PetscCallCXX(chunks_.pop_back());
480       if (chunks_.empty()) {
481         // if chunks are empty it implies we have managed to claim (and subsequently destroy)
482         // our own chunk twice! something has gone wrong
483         PetscAssert(!*success, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Successfully claimed a chunk (of size %zu, from block of size %zu) but have now managed to claim it for a second time (and destroyed it)!", req_size, size());
484         break;
485       }
486     }
487 
488     // if previously unsuccessful see if enough space has opened up due to pruning. note that
489     // if the chunk list was emptied from the pruning this call must succeed in allocating a
490     // chunk, otherwise something is wrong
491     if (!*success) PetscCall(try_create_chunk());
492 
493     // last resort, iterate over all chunks and see if we can steal one by waiting on the
494     // current owner to finish using it
495     if (!*success) PetscCall(try_steal_other_stream_chunk());
496   }
497   PetscFunctionReturn(PETSC_SUCCESS);
498 }
499 
500 /*
501   MemoryBlock::try_deallocate_chunk - try to restore a chunk to this MemoryBlock
502 
503   Input Parameters:
504 + ptr     - ptr to restore
505 - stream  - stream to restore the pointer on
506 
507   Output Parameter:
508 . success - true if chunk was restored, false otherwise
509 
510   Notes:
511   ptr is set to nullptr on successful restore, and is unchanged otherwise. If the ptr is owned
512   by this MemoryBlock then it is restored on stream. The same stream may receive ptr again
513   without synchronization, but other streams may not do so until either serializing or the
514   stream is idle again.
515 */
516 template <typename T, typename A, typename S>
517 inline PetscErrorCode MemoryBlock<T, A, S>::try_deallocate_chunk(T **ptr, const stream_type *stream, bool *success) noexcept
518 {
519   chunk_type *chunk = nullptr;
520 
521   PetscFunctionBegin;
522   PetscCall(try_find_chunk(*ptr, &chunk));
523   if (chunk) {
524     PetscCall(chunk->release(stream));
525     *ptr     = nullptr;
526     *success = true;
527   } else {
528     *success = false;
529   }
530   PetscFunctionReturn(PETSC_SUCCESS);
531 }
532 
533 /*
534   MemoryBlock::try_find_chunk - try to find the chunk which owns ptr
535 
536   Input Parameter:
537 . ptr - the pointer to look for
538 
539   Output Parameter:
540 . ret_chunk - pointer to the owning chunk or nullptr if not found
541 */
542 template <typename T, typename A, typename S>
543 inline PetscErrorCode MemoryBlock<T, A, S>::try_find_chunk(const T *ptr, chunk_type **ret_chunk) noexcept
544 {
545   PetscFunctionBegin;
546   *ret_chunk = nullptr;
547   if (owns_pointer(ptr)) {
548     const auto offset = static_cast<size_type>(ptr - mem_);
549 
550     for (auto &chunk : chunks_) {
551       if (chunk.contains(offset)) {
552         *ret_chunk = &chunk;
553         break;
554       }
555     }
556 
557     PetscAssert(*ret_chunk, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Failed to find %zu in block, even though it is within block range [%zu, %zu)", reinterpret_cast<uintptr_t>(ptr), reinterpret_cast<uintptr_t>(mem_), reinterpret_cast<uintptr_t>(std::next(mem_, size())));
558   }
559   PetscFunctionReturn(PETSC_SUCCESS);
560 }
561 
562 namespace detail
563 {
564 
565 template <typename T>
566 struct real_type {
567   using type = T;
568 };
569 
570 template <>
571 struct real_type<PetscScalar> {
572   using type = PetscReal;
573 };
574 
575 } // namespace detail
576 
577 template <typename T>
578 struct SegmentedMemoryPoolAllocatorBase {
579   using value_type      = T;
580   using size_type       = std::size_t;
581   using real_value_type = typename detail::real_type<T>::type;
582 
583   template <typename U>
584   static PetscErrorCode allocate(value_type **, size_type, const device::StreamBase<U> *) noexcept;
585   template <typename U>
586   static PetscErrorCode deallocate(value_type *, const device::StreamBase<U> *) noexcept;
587   template <typename U>
588   static PetscErrorCode zero(value_type *, size_type, const device::StreamBase<U> *) noexcept;
589   template <typename U>
590   static PetscErrorCode uninitialized_copy(value_type *, const value_type *, size_type, const device::StreamBase<U> *) noexcept;
591   template <typename U>
592   static PetscErrorCode set_canary(value_type *, size_type, const device::StreamBase<U> *) noexcept;
593 };
594 
595 template <typename T>
596 template <typename U>
597 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::allocate(value_type **ptr, size_type n, const device::StreamBase<U> *) noexcept
598 {
599   PetscFunctionBegin;
600   PetscCall(PetscMalloc1(n, ptr));
601   PetscFunctionReturn(PETSC_SUCCESS);
602 }
603 
604 template <typename T>
605 template <typename U>
606 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::deallocate(value_type *ptr, const device::StreamBase<U> *) noexcept
607 {
608   PetscFunctionBegin;
609   PetscCall(PetscFree(ptr));
610   PetscFunctionReturn(PETSC_SUCCESS);
611 }
612 
613 template <typename T>
614 template <typename U>
615 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::zero(value_type *ptr, size_type n, const device::StreamBase<U> *) noexcept
616 {
617   PetscFunctionBegin;
618   PetscCall(PetscArrayzero(ptr, n));
619   PetscFunctionReturn(PETSC_SUCCESS);
620 }
621 
622 template <typename T>
623 template <typename U>
624 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::uninitialized_copy(value_type *dest, const value_type *src, size_type n, const device::StreamBase<U> *) noexcept
625 {
626   PetscFunctionBegin;
627   PetscCall(PetscArraycpy(dest, src, n));
628   PetscFunctionReturn(PETSC_SUCCESS);
629 }
630 
631 template <typename T>
632 template <typename U>
633 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::set_canary(value_type *ptr, size_type n, const device::StreamBase<U> *) noexcept
634 {
635   using limit_type            = std::numeric_limits<real_value_type>;
636   constexpr value_type canary = limit_type::has_signaling_NaN ? limit_type::signaling_NaN() : limit_type::max();
637 
638   PetscFunctionBegin;
639   for (size_type i = 0; i < n; ++i) ptr[i] = canary;
640   PetscFunctionReturn(PETSC_SUCCESS);
641 }
642 
643 } // namespace impl
644 
645 // ==========================================================================================
646 // SegmentedMemoryPool
647 //
648 // Stream-aware async memory allocator. Holds a list of memory "blocks" which each control an
649 // allocated buffer. This buffer is further split into memory "chunks" which control
650 // consecutive, non-overlapping regions of the block. Chunks may be in 1 of 2 states:
651 //
652 // 1. Open:
653 //    The chunk is free to be claimed by the next suitable allocation request. If the
654 //    allocation request is made on the same stream as the chunk was deallocated on, no
655 //    serialization needs to occur. If not, the allocating stream must wait for the
656 //    event. Claiming the chunk "closes" the chunk.
657 //
658 // 2. Closed:
659 //    The chunk has been claimed by an allocation request. It cannot be opened again until it
660 //    is deallocated; doing so "opens" the chunk.
661 //
662 // Note that there does not need to be a chunk for every region, chunks are created to satisfy
663 // an allocation request.
664 //
665 // Thus there is usually a region of "unallocated" memory at the end of the buffer, which may
666 // be claimed by a newly created chunk if existing chunks cannot satisfy the allocation
667 // request. This region exists _only_ at the end, as there are no gaps between chunks.
668 //
669 //
670 // |-----------------------------------------------------------------------------------------
671 // | SegmentedMemoryPool
672 // |
673 // | ||-------------||
674 // | ||             ||    -------------------------------------------------------------------
675 // | ||             ||    | AAAAAAAAAAAAAABBBBBBBCCCCCCCCCCCCCCCCCCCCDDDDDDDDDDDDDXXXXXXXX...
676 // | ||             ||    | |             |      |                   |            |
677 // | ||             ||    | x-----x-------x-----xx---------x---------x------x-----x
678 // | || MemoryBlock || -> | ------|-------------|----------|----------------|--------
679 // | ||             ||    | | MemoryChunk | MemoryChunk | MemoryChunk | MemoryChunk |
680 // | ||             ||    | ---------------------------------------------------------
681 // | ||             ||    -------------------------------------------------------------------
682 // | ||-------------||
683 // | ||             ||
684 // | ||     ...     ||
685 // | ||             ||
686 // ==========================================================================================
687 
688 template <typename MemType, typename StreamType = device::DefaultStream, typename AllocType = impl::SegmentedMemoryPoolAllocatorBase<MemType>, std::size_t DefaultChunkSize = 256>
689 class SegmentedMemoryPool;
690 
691 // The actual memory pool class. It is in essence just a wrapper for a list of MemoryBlocks.
692 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
693 class SegmentedMemoryPool : public RegisterFinalizeable<SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>> {
694 public:
695   using value_type     = MemType;
696   using stream_type    = StreamType;
697   using allocator_type = AllocType;
698   using block_type     = impl::MemoryBlock<value_type, allocator_type, stream_type>;
699   using pool_type      = std::deque<block_type>;
700   using size_type      = typename block_type::size_type;
701 
702   explicit SegmentedMemoryPool(AllocType = AllocType{}, std::size_t = DefaultChunkSize) noexcept(std::is_nothrow_default_constructible<pool_type>::value);
703 
704   PetscErrorCode allocate(PetscInt, value_type **, const stream_type *, size_type = std::alignment_of<MemType>::value) noexcept;
705   PetscErrorCode deallocate(value_type **, const stream_type *) noexcept;
706   PetscErrorCode reallocate(PetscInt, value_type **, const stream_type *) noexcept;
707 
708 private:
709   pool_type      pool_;
710   allocator_type allocator_;
711   size_type      chunk_size_;
712 
713   PetscErrorCode make_block_(size_type, const stream_type *) noexcept;
714 
715   friend class RegisterFinalizeable<SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>>;
716   PetscErrorCode register_finalize_(const stream_type *) noexcept;
717   PetscErrorCode finalize_() noexcept;
718 
719   PetscErrorCode allocate_(size_type, value_type **, const stream_type *) noexcept;
720 };
721 
722 // ==========================================================================================
723 // SegmentedMemoryPool - Private API
724 // ==========================================================================================
725 
726 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
727 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::make_block_(size_type size, const stream_type *stream) noexcept
728 {
729   const auto block_size = std::max(size, chunk_size_);
730 
731   PetscFunctionBegin;
732   PetscCallCXX(pool_.emplace_back(&allocator_, block_size, stream));
733   PetscCall(PetscInfo(nullptr, "Allocated new block of size %zu, total %zu blocks\n", block_size, pool_.size()));
734   PetscFunctionReturn(PETSC_SUCCESS);
735 }
736 
737 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
738 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::register_finalize_(const stream_type *stream) noexcept
739 {
740   PetscFunctionBegin;
741   PetscCall(make_block_(chunk_size_, stream));
742   PetscFunctionReturn(PETSC_SUCCESS);
743 }
744 
745 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
746 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::finalize_() noexcept
747 {
748   PetscFunctionBegin;
749   PetscCallCXX(pool_.clear());
750   chunk_size_ = DefaultChunkSize;
751   PetscFunctionReturn(PETSC_SUCCESS);
752 }
753 
754 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
755 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::allocate_(size_type size, value_type **ptr, const stream_type *stream) noexcept
756 {
757   auto found = false;
758 
759   PetscFunctionBegin;
760   PetscCall(this->register_finalize(stream));
761   for (auto &block : pool_) {
762     PetscCall(block.try_allocate_chunk(size, ptr, stream, &found));
763     if (PetscLikely(found)) PetscFunctionReturn(PETSC_SUCCESS);
764   }
765 
766   PetscCall(PetscInfo(nullptr, "Could not find an open block in the pool (%zu blocks) (requested size %zu), allocating new block\n", pool_.size(), size));
767   // if we are here we couldn't find an open block in the pool, so make a new block
768   PetscCall(make_block_(size, stream));
769   // and assign it
770   PetscCall(pool_.back().try_allocate_chunk(size, ptr, stream, &found));
771   PetscAssert(found, PETSC_COMM_SELF, PETSC_ERR_MEM, "Failed to get a suitable memory chunk (of size %zu) from newly allocated memory block (size %zu)", size, pool_.back().size());
772   PetscFunctionReturn(PETSC_SUCCESS);
773 }
774 
775 // ==========================================================================================
776 // SegmentedMemoryPool - Public API
777 // ==========================================================================================
778 
779 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
780 inline SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::SegmentedMemoryPool(AllocType alloc, std::size_t size) noexcept(std::is_nothrow_default_constructible<pool_type>::value) : allocator_(std::move(alloc)), chunk_size_(size)
781 {
782 }
783 
784 /*
785   SegmentedMemoryPool::allocate - get an allocation from the memory pool
786 
787   Input Parameters:
788 + req_size - size (in elements) to get
789 . ptr      - the pointer to hold the allocation
790 - stream   - the stream on which to get the allocation
791 
792   Output Parameter:
793 . ptr - the pointer holding the allocation
794 
795   Notes:
796   req_size cannot be negative. If req_size if zero, ptr is set to nullptr
797 */
798 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
799 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::allocate(PetscInt req_size, value_type **ptr, const stream_type *stream, size_type alignment) noexcept
800 {
801   value_type *ret_ptr = nullptr;
802 
803   PetscFunctionBegin;
804   PetscAssert(req_size >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Requested memory amount (%" PetscInt_FMT ") must be >= 0", req_size);
805   PetscAssertPointer(ptr, 2);
806   PetscAssertPointer(stream, 3);
807   if (req_size) {
808     const auto size         = static_cast<size_type>(req_size);
809     auto       aligned_size = alignment == alignof(char) ? size : size + alignment;
810     void      *vptr         = nullptr;
811 
812     PetscCall(allocate_(aligned_size, &ret_ptr, stream));
813     vptr = ret_ptr;
814     std::align(alignment, size, vptr, aligned_size);
815     ret_ptr = reinterpret_cast<value_type *>(vptr);
816     // sets memory to NaN or infinity depending on the type to catch out uninitialized memory
817     // accesses.
818     if (PetscDefined(USE_DEBUG)) PetscCall(allocator_.set_canary(ret_ptr, size, stream));
819   }
820   *ptr = ret_ptr;
821   PetscFunctionReturn(PETSC_SUCCESS);
822 }
823 
824 /*
825   SegmentedMemoryPool::deallocate - release a pointer back to the memory pool
826 
827   Input Parameters:
828 + ptr    - the pointer to release
829 - stream - the stream to release it on
830 
831   Notes:
832   If ptr is not owned by the pool it is unchanged.
833 */
834 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
835 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::deallocate(value_type **ptr, const stream_type *stream) noexcept
836 {
837   PetscFunctionBegin;
838   PetscAssertPointer(ptr, 1);
839   PetscAssertPointer(stream, 2);
840   // nobody owns a nullptr, and if they do then they have bigger problems
841   if (!*ptr) PetscFunctionReturn(PETSC_SUCCESS);
842   for (auto &block : pool_) {
843     auto found = false;
844 
845     PetscCall(block.try_deallocate_chunk(ptr, stream, &found));
846     if (PetscLikely(found)) break;
847   }
848   PetscFunctionReturn(PETSC_SUCCESS);
849 }
850 
851 /*
852   SegmentedMemoryPool::reallocate - Resize an allocated buffer
853 
854   Input Parameters:
855 + new_req_size - the new buffer size
856 . ptr          - pointer to the buffer
857 - stream       - stream to resize with
858 
859   Output Parameter:
860 . ptr - pointer to the new region
861 
862   Notes:
863   ptr must have been allocated by the pool.
864 
865   It's OK to shrink the buffer, even down to 0 (in which case it is just deallocated).
866 */
867 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
868 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::reallocate(PetscInt new_req_size, value_type **ptr, const stream_type *stream) noexcept
869 {
870   using chunk_type = typename block_type::chunk_type;
871 
872   const auto  new_size = static_cast<size_type>(new_req_size);
873   const auto  old_ptr  = *ptr;
874   chunk_type *chunk    = nullptr;
875 
876   PetscFunctionBegin;
877   PetscAssert(new_req_size >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Requested memory amount (%" PetscInt_FMT ") must be >= 0", new_req_size);
878   PetscAssertPointer(ptr, 2);
879   PetscAssertPointer(stream, 3);
880 
881   // if reallocating to zero, just free
882   if (PetscUnlikely(new_size == 0)) {
883     PetscCall(deallocate(ptr, stream));
884     PetscFunctionReturn(PETSC_SUCCESS);
885   }
886 
887   // search the blocks for the owning chunk
888   for (auto &block : pool_) {
889     PetscCall(block.try_find_chunk(old_ptr, &chunk));
890     if (chunk) break; // found
891   }
892   PetscAssert(chunk, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Memory pool does not own %p, so cannot reallocate it", *ptr);
893 
894   if (chunk->capacity() < new_size) {
895     // chunk does not have enough room, need to grab a fresh chunk and copy to it
896     *ptr = nullptr;
897     PetscCall(chunk->release(stream));
898     PetscCall(allocate(new_size, ptr, stream));
899     PetscCall(allocator_.uninitialized_copy(*ptr, old_ptr, new_size, stream));
900   } else {
901     // chunk had enough room we can simply grow (or shrink) to fit the new size
902     PetscCall(chunk->resize(new_size));
903   }
904   PetscFunctionReturn(PETSC_SUCCESS);
905 }
906 
907 } // namespace memory
908 
909 } // namespace Petsc
910