1 #pragma once
2
3 #include <petsc/private/deviceimpl.h>
4
5 #include <petsc/private/cpp/macros.hpp>
6 #include <petsc/private/cpp/type_traits.hpp>
7 #include <petsc/private/cpp/utility.hpp>
8 #include <petsc/private/cpp/register_finalize.hpp>
9 #include <petsc/private/cpp/memory.hpp>
10
11 #include <limits>
12 #include <deque>
13 #include <vector>
14
15 namespace Petsc
16 {
17
18 namespace device
19 {
20
21 template <typename T>
22 class StreamBase {
23 public:
24 using id_type = int;
25 using derived_type = T;
26
27 static const id_type INVALID_ID;
28
29 // needed so that dependent auto works, see veccupmimpl.h for a detailed discussion
30 template <typename U = T>
31 PETSC_NODISCARD auto get_stream() const noexcept PETSC_DECLTYPE_AUTO_RETURNS(static_cast<const U &>(*this).get_stream_());
32
get_id() const33 PETSC_NODISCARD id_type get_id() const noexcept { return static_cast<const T &>(*this).get_id_(); }
34
35 template <typename E>
record_event(E && event) const36 PetscErrorCode record_event(E &&event) const noexcept
37 {
38 return static_cast<const T &>(*this).record_event_(std::forward<E>(event));
39 }
40
41 template <typename E>
wait_for_event(E && event) const42 PetscErrorCode wait_for_event(E &&event) const noexcept
43 {
44 return static_cast<const T &>(*this).wait_for_(std::forward<E>(event));
45 }
46
47 protected:
48 constexpr StreamBase() noexcept = default;
49
50 struct default_event_type { };
51 using default_stream_type = std::nullptr_t;
52
get_stream_()53 PETSC_NODISCARD static constexpr default_stream_type get_stream_() noexcept { return nullptr; }
54
get_id_()55 PETSC_NODISCARD static constexpr id_type get_id_() noexcept { return 0; }
56
57 template <typename U = T>
record_event_(const typename U::event_type &)58 static constexpr PetscErrorCode record_event_(const typename U::event_type &) noexcept
59 {
60 return PETSC_SUCCESS;
61 }
62
63 template <typename U = T>
wait_for_(const typename U::event_type &)64 static constexpr PetscErrorCode wait_for_(const typename U::event_type &) noexcept
65 {
66 return PETSC_SUCCESS;
67 }
68 };
69
70 template <typename T>
71 const typename StreamBase<T>::id_type StreamBase<T>::INVALID_ID = -1;
72
73 struct DefaultStream : StreamBase<DefaultStream> {
74 using stream_type = typename StreamBase<DefaultStream>::default_stream_type;
75 using id_type = typename StreamBase<DefaultStream>::id_type;
76 using event_type = typename StreamBase<DefaultStream>::default_event_type;
77 };
78
79 } // namespace device
80
81 namespace memory
82 {
83
84 namespace impl
85 {
86
87 // ==========================================================================================
88 // MemoryChunk
89 //
90 // Represents a checked-out region of a MemoryBlock. Tracks the offset into the owning
91 // MemoryBlock and its size/capacity
92 // ==========================================================================================
93
94 template <typename EventType>
95 class MemoryChunk {
96 public:
97 using event_type = EventType;
98 using size_type = std::size_t;
99
100 MemoryChunk(size_type, size_type) noexcept;
101 explicit MemoryChunk(size_type) noexcept;
102
103 MemoryChunk(MemoryChunk &&) noexcept;
104 MemoryChunk &operator=(MemoryChunk &&) noexcept;
105
106 MemoryChunk(const MemoryChunk &) noexcept = delete;
107 MemoryChunk &operator=(const MemoryChunk &) noexcept = delete;
108
start() const109 PETSC_NODISCARD size_type start() const noexcept { return start_; }
size() const110 PETSC_NODISCARD size_type size() const noexcept { return size_; }
111 // REVIEW ME:
112 // make this an actual field, normally each chunk shrinks_to_fit() on begin claimed, but in
113 // theory only the last chunk needs to do this
capacity() const114 PETSC_NODISCARD size_type capacity() const noexcept { return size_; }
total_offset() const115 PETSC_NODISCARD size_type total_offset() const noexcept { return start() + size(); }
116
117 template <typename U>
118 PetscErrorCode release(const device::StreamBase<U> *) noexcept;
119 template <typename U>
120 PetscErrorCode claim(const device::StreamBase<U> *, size_type, bool *, bool = false) noexcept;
121 template <typename U>
122 PETSC_NODISCARD bool can_claim(const device::StreamBase<U> *, size_type, bool) const noexcept;
123 PetscErrorCode resize(size_type) noexcept;
124 PETSC_NODISCARD bool contains(size_type) const noexcept;
125
126 private:
127 event_type event_{}; // event recorded when the chunk was released
128 bool open_ = true; // is this chunk open?
129 int stream_id_ = device::DefaultStream::INVALID_ID; // id of the last stream to use the chunk, populated on release
130 size_type size_ = 0; // size of the chunk
131 size_type start_ = 0; // offset from the start of the owning block
132
133 template <typename U>
134 PETSC_NODISCARD bool stream_compat_(const device::StreamBase<U> *) const noexcept;
135 };
136
137 // ==========================================================================================
138 // MemoryChunk - Private API
139 // ==========================================================================================
140
141 // asks and answers the question: can this stream claim this chunk without serializing?
142 template <typename E>
143 template <typename U>
stream_compat_(const device::StreamBase<U> * strm) const144 inline bool MemoryChunk<E>::stream_compat_(const device::StreamBase<U> *strm) const noexcept
145 {
146 return (stream_id_ == strm->INVALID_ID) || (stream_id_ == strm->get_id());
147 }
148
149 // ==========================================================================================
150 // MemoryChunk - Public API
151 // ==========================================================================================
152
153 template <typename E>
MemoryChunk(size_type start,size_type size)154 inline MemoryChunk<E>::MemoryChunk(size_type start, size_type size) noexcept : size_(size), start_(start)
155 {
156 }
157
158 template <typename E>
MemoryChunk(size_type size)159 inline MemoryChunk<E>::MemoryChunk(size_type size) noexcept : MemoryChunk(0, size)
160 {
161 }
162
163 template <typename E>
MemoryChunk(MemoryChunk<E> && other)164 inline MemoryChunk<E>::MemoryChunk(MemoryChunk<E> &&other) noexcept :
165 event_(std::move(other.event_)), open_(util::exchange(other.open_, false)), stream_id_(util::exchange(other.stream_id_, device::DefaultStream::INVALID_ID)), size_(util::exchange(other.size_, 0)), start_(std::move(other.start_))
166 {
167 }
168
169 template <typename E>
operator =(MemoryChunk<E> && other)170 inline MemoryChunk<E> &MemoryChunk<E>::operator=(MemoryChunk<E> &&other) noexcept
171 {
172 PetscFunctionBegin;
173 if (this != &other) {
174 event_ = std::move(other.event_);
175 open_ = util::exchange(other.open_, false);
176 stream_id_ = util::exchange(other.stream_id_, device::DefaultStream::INVALID_ID);
177 size_ = util::exchange(other.size_, 0);
178 start_ = std::move(other.start_);
179 }
180 PetscFunctionReturn(*this);
181 }
182
183 /*
184 MemoryChunk::release - release a chunk on a stream
185
186 Input Parameter:
187 . stream - the stream to release the chunk with
188
189 Notes:
190 Inserts a release operation on stream and records the state of stream at the time this
191 routine was called.
192
193 Future allocation requests which attempt to claim the chunk on the same stream may re-acquire
194 the chunk without serialization.
195
196 If another stream attempts to claim the chunk they must wait for the recorded event before
197 claiming the chunk.
198 */
199 template <typename E>
200 template <typename U>
release(const device::StreamBase<U> * stream)201 inline PetscErrorCode MemoryChunk<E>::release(const device::StreamBase<U> *stream) noexcept
202 {
203 PetscFunctionBegin;
204 open_ = true;
205 stream_id_ = stream->get_id();
206 PetscCall(stream->record_event(event_));
207 PetscFunctionReturn(PETSC_SUCCESS);
208 }
209
210 /*
211 MemoryChunk::claim - attempt to claim a particular chunk
212
213 Input Parameters:
214 + stream - the stream on which to attempt to claim
215 . req_size - the requested size (in elements) to attempt to claim
216 - serialize - (optional, false) whether the claimant allows serialization
217
218 Output Parameter:
219 . success - true if the chunk was claimed, false otherwise
220 */
221 template <typename E>
222 template <typename U>
claim(const device::StreamBase<U> * stream,size_type req_size,bool * success,bool serialize)223 inline PetscErrorCode MemoryChunk<E>::claim(const device::StreamBase<U> *stream, size_type req_size, bool *success, bool serialize) noexcept
224 {
225 PetscFunctionBegin;
226 if ((*success = can_claim(stream, req_size, serialize))) {
227 if (serialize && !stream_compat_(stream)) PetscCall(stream->wait_for_event(event_));
228 PetscCall(resize(req_size));
229 open_ = false;
230 }
231 PetscFunctionReturn(PETSC_SUCCESS);
232 }
233
234 /*
235 MemoryChunk::can_claim - test whether a particular chunk can be claimed
236
237 Input Parameters:
238 + stream - the stream on which to attempt to claim
239 . req_size - the requested size (in elements) to attempt to claim
240 - serialize - whether the claimant allows serialization
241
242 Output:
243 . [return] - true if the chunk is claimable given the configuration, false otherwise
244 */
245 template <typename E>
246 template <typename U>
can_claim(const device::StreamBase<U> * stream,size_type req_size,bool serialize) const247 inline bool MemoryChunk<E>::can_claim(const device::StreamBase<U> *stream, size_type req_size, bool serialize) const noexcept
248 {
249 if (open_ && (req_size <= capacity())) {
250 // fully compatible
251 if (stream_compat_(stream)) return true;
252 // stream wasn't compatible, but could claim if we serialized
253 if (serialize) return true;
254 // incompatible stream and did not want to serialize
255 }
256 return false;
257 }
258
259 /*
260 MemoryChunk::resize - grow a chunk to new size
261
262 Input Parameter:
263 . newsize - the new size Requested
264
265 Notes:
266 newsize cannot be larger than capacity
267 */
268 template <typename E>
resize(size_type newsize)269 inline PetscErrorCode MemoryChunk<E>::resize(size_type newsize) noexcept
270 {
271 PetscFunctionBegin;
272 PetscAssert(newsize <= capacity(), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "New size %zu larger than capacity %zu", newsize, capacity());
273 size_ = newsize;
274 PetscFunctionReturn(PETSC_SUCCESS);
275 }
276
277 /*
278 MemoryChunk::contains - query whether a memory chunk contains a particular offset
279
280 Input Parameters:
281 . offset - The offset from the MemoryBlock start
282
283 Notes:
284 Returns true if the chunk contains the offset, false otherwise
285 */
286 template <typename E>
contains(size_type offset) const287 inline bool MemoryChunk<E>::contains(size_type offset) const noexcept
288 {
289 return (offset >= start()) && (offset < total_offset());
290 }
291
292 // ==========================================================================================
293 // MemoryBlock
294 //
295 // A "memory block" manager, which owns the pointer to a particular memory range. Retrieving
296 // and restoring a block is thread-safe (so may be used by multiple device streams).
297 // ==========================================================================================
298
299 template <typename T, typename AllocatorType, typename StreamType>
300 class MemoryBlock {
301 public:
302 using value_type = T;
303 using allocator_type = AllocatorType;
304 using stream_type = StreamType;
305 using event_type = typename stream_type::event_type;
306 using chunk_type = MemoryChunk<event_type>;
307 using size_type = typename chunk_type::size_type;
308 using chunk_list_type = std::vector<chunk_type>;
309
310 template <typename U>
311 MemoryBlock(allocator_type *, size_type, const device::StreamBase<U> *) noexcept;
312
313 ~MemoryBlock() noexcept(std::is_nothrow_destructible<chunk_list_type>::value);
314
315 MemoryBlock(MemoryBlock &&) noexcept;
316 MemoryBlock &operator=(MemoryBlock &&) noexcept;
317
318 // memory blocks are not copyable
319 MemoryBlock(const MemoryBlock &) = delete;
320 MemoryBlock &operator=(const MemoryBlock &) = delete;
321
322 /* --- actual functions --- */
323 PetscErrorCode try_allocate_chunk(size_type, T **, const stream_type *, bool *) noexcept;
324 PetscErrorCode try_deallocate_chunk(T **, const stream_type *, bool *) noexcept;
325 PetscErrorCode try_find_chunk(const T *, chunk_type **) noexcept;
326 PETSC_NODISCARD bool owns_pointer(const T *) const noexcept;
327
size() const328 PETSC_NODISCARD size_type size() const noexcept { return size_; }
bytes() const329 PETSC_NODISCARD size_type bytes() const noexcept { return sizeof(value_type) * size(); }
num_chunks() const330 PETSC_NODISCARD size_type num_chunks() const noexcept { return chunks_.size(); }
331
332 private:
333 value_type *mem_{};
334 allocator_type *allocator_{};
335 size_type size_{};
336 chunk_list_type chunks_{};
337
338 PetscErrorCode clear_(const stream_type *) noexcept;
339 };
340
341 // ==========================================================================================
342 // MemoryBlock - Private API
343 // ==========================================================================================
344
345 // clear the memory block, called from destructors and move assignment/construction
346 template <typename T, typename A, typename S>
clear_(const stream_type * stream)347 PetscErrorCode MemoryBlock<T, A, S>::clear_(const stream_type *stream) noexcept
348 {
349 PetscFunctionBegin;
350 if (PetscLikely(mem_)) {
351 PetscCall(allocator_->deallocate(mem_, stream));
352 mem_ = nullptr;
353 }
354 size_ = 0;
355 PetscCallCXX(chunks_.clear());
356 PetscFunctionReturn(PETSC_SUCCESS);
357 }
358
359 // ==========================================================================================
360 // MemoryBlock - Public API
361 // ==========================================================================================
362
363 // default constructor, allocates memory immediately
364 template <typename T, typename A, typename S>
365 template <typename U>
MemoryBlock(allocator_type * alloc,size_type s,const device::StreamBase<U> * stream)366 MemoryBlock<T, A, S>::MemoryBlock(allocator_type *alloc, size_type s, const device::StreamBase<U> *stream) noexcept : allocator_(alloc), size_(s)
367 {
368 PetscFunctionBegin;
369 PetscCallAbort(PETSC_COMM_SELF, alloc->allocate(&mem_, s, stream));
370 PetscAssertAbort(mem_, PETSC_COMM_SELF, PETSC_ERR_MEM, "Failed to allocate memory block of size %zu", s);
371 PetscFunctionReturnVoid();
372 }
373
374 template <typename T, typename A, typename S>
~MemoryBlock()375 MemoryBlock<T, A, S>::~MemoryBlock() noexcept(std::is_nothrow_destructible<chunk_list_type>::value)
376 {
377 stream_type stream;
378
379 PetscFunctionBegin;
380 PetscCallAbort(PETSC_COMM_SELF, clear_(&stream));
381 PetscFunctionReturnVoid();
382 }
383
384 template <typename T, typename A, typename S>
MemoryBlock(MemoryBlock && other)385 MemoryBlock<T, A, S>::MemoryBlock(MemoryBlock &&other) noexcept : mem_(util::exchange(other.mem_, nullptr)), allocator_(other.allocator_), size_(util::exchange(other.size_, 0)), chunks_(std::move(other.chunks_))
386 {
387 }
388
389 template <typename T, typename A, typename S>
operator =(MemoryBlock && other)390 MemoryBlock<T, A, S> &MemoryBlock<T, A, S>::operator=(MemoryBlock &&other) noexcept
391 {
392 PetscFunctionBegin;
393 if (this != &other) {
394 stream_type stream;
395
396 PetscCallAbort(PETSC_COMM_SELF, clear_(&stream));
397 mem_ = util::exchange(other.mem_, nullptr);
398 allocator_ = other.allocator_;
399 size_ = util::exchange(other.size_, 0);
400 chunks_ = std::move(other.chunks_);
401 }
402 PetscFunctionReturn(*this);
403 }
404
405 /*
406 MemoryBock::owns_pointer - returns true if this block owns a pointer, false otherwise
407 */
408 template <typename T, typename A, typename S>
owns_pointer(const T * ptr) const409 inline bool MemoryBlock<T, A, S>::owns_pointer(const T *ptr) const noexcept
410 {
411 // each pool is linear in memory, so it suffices to check the bounds
412 return (ptr >= mem_) && (ptr < std::next(mem_, size()));
413 }
414
415 /*
416 MemoryBlock::try_allocate_chunk - try to get a chunk from this MemoryBlock
417
418 Input Parameters:
419 + req_size - the requested size of the allocation (in elements)
420 . ptr - ptr to fill
421 - stream - stream to fill the pointer on
422
423 Output Parameter:
424 . success - true if chunk was gotten, false otherwise
425
426 Notes:
427 If the current memory could not satisfy the memory request, ptr is unchanged
428 */
429 template <typename T, typename A, typename S>
try_allocate_chunk(size_type req_size,T ** ptr,const stream_type * stream,bool * success)430 inline PetscErrorCode MemoryBlock<T, A, S>::try_allocate_chunk(size_type req_size, T **ptr, const stream_type *stream, bool *success) noexcept
431 {
432 PetscFunctionBegin;
433 *success = false;
434 if (req_size <= size()) {
435 const auto try_create_chunk = [&]() {
436 const auto was_empty = chunks_.empty();
437 const auto block_alloced = was_empty ? 0 : chunks_.back().total_offset();
438
439 PetscFunctionBegin;
440 if (block_alloced + req_size <= size()) {
441 PetscCallCXX(chunks_.emplace_back(block_alloced, req_size));
442 PetscCall(chunks_.back().claim(stream, req_size, success));
443 *ptr = mem_ + block_alloced;
444 if (was_empty) PetscAssert(*success, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Failed to claim chunk (of size %zu) even though block (of size %zu) was empty!", req_size, size());
445 }
446 PetscFunctionReturn(PETSC_SUCCESS);
447 };
448 const auto try_find_open_chunk = [&](bool serialize = false) {
449 PetscFunctionBegin;
450 for (auto &chunk : chunks_) {
451 PetscCall(chunk.claim(stream, req_size, success, serialize));
452 if (*success) {
453 *ptr = mem_ + chunk.start();
454 break;
455 }
456 }
457 PetscFunctionReturn(PETSC_SUCCESS);
458 };
459 const auto try_steal_other_stream_chunk = [&]() {
460 PetscFunctionBegin;
461 PetscCall(try_find_open_chunk(true));
462 PetscFunctionReturn(PETSC_SUCCESS);
463 };
464
465 // search previously distributed chunks, but only claim one if it is on the same stream
466 // as us
467 PetscCall(try_find_open_chunk());
468
469 // if we are here we couldn't reuse one of our own chunks so check first if the pool
470 // has room for a new one
471 if (!*success) PetscCall(try_create_chunk());
472
473 // try pruning dead chunks off the back, note we do this regardless of whether we are
474 // successful
475 while (chunks_.back().can_claim(stream, 0, false)) {
476 PetscCallCXX(chunks_.pop_back());
477 if (chunks_.empty()) {
478 // if chunks are empty it implies we have managed to claim (and subsequently destroy)
479 // our own chunk twice! something has gone wrong
480 PetscAssert(!*success, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Successfully claimed a chunk (of size %zu, from block of size %zu) but have now managed to claim it for a second time (and destroyed it)!", req_size, size());
481 break;
482 }
483 }
484
485 // if previously unsuccessful see if enough space has opened up due to pruning. note that
486 // if the chunk list was emptied from the pruning this call must succeed in allocating a
487 // chunk, otherwise something is wrong
488 if (!*success) PetscCall(try_create_chunk());
489
490 // last resort, iterate over all chunks and see if we can steal one by waiting on the
491 // current owner to finish using it
492 if (!*success) PetscCall(try_steal_other_stream_chunk());
493 }
494 PetscFunctionReturn(PETSC_SUCCESS);
495 }
496
497 /*
498 MemoryBlock::try_deallocate_chunk - try to restore a chunk to this MemoryBlock
499
500 Input Parameters:
501 + ptr - ptr to restore
502 - stream - stream to restore the pointer on
503
504 Output Parameter:
505 . success - true if chunk was restored, false otherwise
506
507 Notes:
508 ptr is set to nullptr on successful restore, and is unchanged otherwise. If the ptr is owned
509 by this MemoryBlock then it is restored on stream. The same stream may receive ptr again
510 without synchronization, but other streams may not do so until either serializing or the
511 stream is idle again.
512 */
513 template <typename T, typename A, typename S>
try_deallocate_chunk(T ** ptr,const stream_type * stream,bool * success)514 inline PetscErrorCode MemoryBlock<T, A, S>::try_deallocate_chunk(T **ptr, const stream_type *stream, bool *success) noexcept
515 {
516 chunk_type *chunk = nullptr;
517
518 PetscFunctionBegin;
519 PetscCall(try_find_chunk(*ptr, &chunk));
520 if (chunk) {
521 PetscCall(chunk->release(stream));
522 *ptr = nullptr;
523 *success = true;
524 } else {
525 *success = false;
526 }
527 PetscFunctionReturn(PETSC_SUCCESS);
528 }
529
530 /*
531 MemoryBlock::try_find_chunk - try to find the chunk which owns ptr
532
533 Input Parameter:
534 . ptr - the pointer to look for
535
536 Output Parameter:
537 . ret_chunk - pointer to the owning chunk or nullptr if not found
538 */
539 template <typename T, typename A, typename S>
try_find_chunk(const T * ptr,chunk_type ** ret_chunk)540 inline PetscErrorCode MemoryBlock<T, A, S>::try_find_chunk(const T *ptr, chunk_type **ret_chunk) noexcept
541 {
542 PetscFunctionBegin;
543 *ret_chunk = nullptr;
544 if (owns_pointer(ptr)) {
545 const auto offset = static_cast<size_type>(ptr - mem_);
546
547 for (auto &chunk : chunks_) {
548 if (chunk.contains(offset)) {
549 *ret_chunk = &chunk;
550 break;
551 }
552 }
553
554 PetscAssert(*ret_chunk, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Failed to find %zu in block, even though it is within block range [%zu, %zu)", reinterpret_cast<uintptr_t>(ptr), reinterpret_cast<uintptr_t>(mem_), reinterpret_cast<uintptr_t>(std::next(mem_, size())));
555 }
556 PetscFunctionReturn(PETSC_SUCCESS);
557 }
558
559 namespace detail
560 {
561
562 template <typename T>
563 struct real_type {
564 using type = T;
565 };
566
567 template <>
568 struct real_type<PetscScalar> {
569 using type = PetscReal;
570 };
571
572 } // namespace detail
573
574 template <typename T>
575 struct SegmentedMemoryPoolAllocatorBase {
576 using value_type = T;
577 using size_type = std::size_t;
578 using real_value_type = typename detail::real_type<T>::type;
579
580 template <typename U>
581 static PetscErrorCode allocate(value_type **, size_type, const device::StreamBase<U> *) noexcept;
582 template <typename U>
583 static PetscErrorCode deallocate(value_type *, const device::StreamBase<U> *) noexcept;
584 template <typename U>
585 static PetscErrorCode zero(value_type *, size_type, const device::StreamBase<U> *) noexcept;
586 template <typename U>
587 static PetscErrorCode uninitialized_copy(value_type *, const value_type *, size_type, const device::StreamBase<U> *) noexcept;
588 template <typename U>
589 static PetscErrorCode set_canary(value_type *, size_type, const device::StreamBase<U> *) noexcept;
590 };
591
592 template <typename T>
593 template <typename U>
allocate(value_type ** ptr,size_type n,const device::StreamBase<U> *)594 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::allocate(value_type **ptr, size_type n, const device::StreamBase<U> *) noexcept
595 {
596 PetscFunctionBegin;
597 PetscCall(PetscMalloc1(n, ptr));
598 PetscFunctionReturn(PETSC_SUCCESS);
599 }
600
601 template <typename T>
602 template <typename U>
deallocate(value_type * ptr,const device::StreamBase<U> *)603 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::deallocate(value_type *ptr, const device::StreamBase<U> *) noexcept
604 {
605 PetscFunctionBegin;
606 PetscCall(PetscFree(ptr));
607 PetscFunctionReturn(PETSC_SUCCESS);
608 }
609
610 template <typename T>
611 template <typename U>
zero(value_type * ptr,size_type n,const device::StreamBase<U> *)612 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::zero(value_type *ptr, size_type n, const device::StreamBase<U> *) noexcept
613 {
614 PetscFunctionBegin;
615 PetscCall(PetscArrayzero(ptr, n));
616 PetscFunctionReturn(PETSC_SUCCESS);
617 }
618
619 template <typename T>
620 template <typename U>
uninitialized_copy(value_type * dest,const value_type * src,size_type n,const device::StreamBase<U> *)621 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::uninitialized_copy(value_type *dest, const value_type *src, size_type n, const device::StreamBase<U> *) noexcept
622 {
623 PetscFunctionBegin;
624 PetscCall(PetscArraycpy(dest, src, n));
625 PetscFunctionReturn(PETSC_SUCCESS);
626 }
627
628 template <typename T>
629 template <typename U>
set_canary(value_type * ptr,size_type n,const device::StreamBase<U> *)630 inline PetscErrorCode SegmentedMemoryPoolAllocatorBase<T>::set_canary(value_type *ptr, size_type n, const device::StreamBase<U> *) noexcept
631 {
632 using limit_type = std::numeric_limits<real_value_type>;
633 constexpr value_type canary = limit_type::has_signaling_NaN ? limit_type::signaling_NaN() : limit_type::max();
634
635 PetscFunctionBegin;
636 for (size_type i = 0; i < n; ++i) ptr[i] = canary;
637 PetscFunctionReturn(PETSC_SUCCESS);
638 }
639
640 } // namespace impl
641
642 // ==========================================================================================
643 // SegmentedMemoryPool
644 //
645 // Stream-aware async memory allocator. Holds a list of memory "blocks" which each control an
646 // allocated buffer. This buffer is further split into memory "chunks" which control
647 // consecutive, non-overlapping regions of the block. Chunks may be in 1 of 2 states:
648 //
649 // 1. Open:
650 // The chunk is free to be claimed by the next suitable allocation request. If the
651 // allocation request is made on the same stream as the chunk was deallocated on, no
652 // serialization needs to occur. If not, the allocating stream must wait for the
653 // event. Claiming the chunk "closes" the chunk.
654 //
655 // 2. Closed:
656 // The chunk has been claimed by an allocation request. It cannot be opened again until it
657 // is deallocated; doing so "opens" the chunk.
658 //
659 // Note that there does not need to be a chunk for every region, chunks are created to satisfy
660 // an allocation request.
661 //
662 // Thus there is usually a region of "unallocated" memory at the end of the buffer, which may
663 // be claimed by a newly created chunk if existing chunks cannot satisfy the allocation
664 // request. This region exists _only_ at the end, as there are no gaps between chunks.
665 //
666 //
667 // |-----------------------------------------------------------------------------------------
668 // | SegmentedMemoryPool
669 // |
670 // | ||-------------||
671 // | || || -------------------------------------------------------------------
672 // | || || | AAAAAAAAAAAAAABBBBBBBCCCCCCCCCCCCCCCCCCCCDDDDDDDDDDDDDXXXXXXXX...
673 // | || || | | | | | |
674 // | || || | x-----x-------x-----xx---------x---------x------x-----x
675 // | || MemoryBlock || -> | ------|-------------|----------|----------------|--------
676 // | || || | | MemoryChunk | MemoryChunk | MemoryChunk | MemoryChunk |
677 // | || || | ---------------------------------------------------------
678 // | || || -------------------------------------------------------------------
679 // | ||-------------||
680 // | || ||
681 // | || ... ||
682 // | || ||
683 // ==========================================================================================
684
685 template <typename MemType, typename StreamType = device::DefaultStream, typename AllocType = impl::SegmentedMemoryPoolAllocatorBase<MemType>, std::size_t DefaultChunkSize = 256>
686 class SegmentedMemoryPool;
687
688 // The actual memory pool class. It is in essence just a wrapper for a list of MemoryBlocks.
689 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
690 class SegmentedMemoryPool : public RegisterFinalizeable<SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>> {
691 public:
692 using value_type = MemType;
693 using stream_type = StreamType;
694 using allocator_type = AllocType;
695 using block_type = impl::MemoryBlock<value_type, allocator_type, stream_type>;
696 using pool_type = std::deque<block_type>;
697 using size_type = typename block_type::size_type;
698
699 explicit SegmentedMemoryPool(AllocType = AllocType{}, std::size_t = DefaultChunkSize) noexcept(std::is_nothrow_default_constructible<pool_type>::value);
700
701 PetscErrorCode allocate(size_t, value_type **, const stream_type *, size_type = std::alignment_of<MemType>::value) noexcept;
702 PetscErrorCode deallocate(value_type **, const stream_type *) noexcept;
703 PetscErrorCode reallocate(size_t, value_type **, const stream_type *) noexcept;
704
705 private:
706 pool_type pool_;
707 allocator_type allocator_;
708 size_type chunk_size_;
709
710 PetscErrorCode make_block_(size_type, const stream_type *) noexcept;
711
712 friend class RegisterFinalizeable<SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>>;
713 PetscErrorCode register_finalize_(const stream_type *) noexcept;
714 PetscErrorCode finalize_() noexcept;
715
716 PetscErrorCode allocate_(size_type, value_type **, const stream_type *) noexcept;
717 };
718
719 // ==========================================================================================
720 // SegmentedMemoryPool - Private API
721 // ==========================================================================================
722
723 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
make_block_(size_type size,const stream_type * stream)724 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::make_block_(size_type size, const stream_type *stream) noexcept
725 {
726 const auto block_size = std::max(size, chunk_size_);
727
728 PetscFunctionBegin;
729 PetscCallCXX(pool_.emplace_back(&allocator_, block_size, stream));
730 PetscCall(PetscInfo(nullptr, "Allocated new block of size %zu, total %zu blocks\n", block_size, pool_.size()));
731 PetscFunctionReturn(PETSC_SUCCESS);
732 }
733
734 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
register_finalize_(const stream_type * stream)735 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::register_finalize_(const stream_type *stream) noexcept
736 {
737 PetscFunctionBegin;
738 PetscCall(make_block_(chunk_size_, stream));
739 PetscFunctionReturn(PETSC_SUCCESS);
740 }
741
742 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
finalize_()743 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::finalize_() noexcept
744 {
745 PetscFunctionBegin;
746 PetscCallCXX(pool_.clear());
747 chunk_size_ = DefaultChunkSize;
748 PetscFunctionReturn(PETSC_SUCCESS);
749 }
750
751 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
allocate_(size_type size,value_type ** ptr,const stream_type * stream)752 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::allocate_(size_type size, value_type **ptr, const stream_type *stream) noexcept
753 {
754 auto found = false;
755
756 PetscFunctionBegin;
757 PetscCall(this->register_finalize(stream));
758 for (auto &block : pool_) {
759 PetscCall(block.try_allocate_chunk(size, ptr, stream, &found));
760 if (PetscLikely(found)) PetscFunctionReturn(PETSC_SUCCESS);
761 }
762
763 PetscCall(PetscInfo(nullptr, "Could not find an open block in the pool (%zu blocks) (requested size %zu), allocating new block\n", pool_.size(), size));
764 // if we are here we couldn't find an open block in the pool, so make a new block
765 PetscCall(make_block_(size, stream));
766 // and assign it
767 PetscCall(pool_.back().try_allocate_chunk(size, ptr, stream, &found));
768 PetscAssert(found, PETSC_COMM_SELF, PETSC_ERR_MEM, "Failed to get a suitable memory chunk (of size %zu) from newly allocated memory block (size %zu)", size, pool_.back().size());
769 PetscFunctionReturn(PETSC_SUCCESS);
770 }
771
772 // ==========================================================================================
773 // SegmentedMemoryPool - Public API
774 // ==========================================================================================
775
// Constructs a pool that takes ownership of `alloc` (moved into allocator_) and records `size`
// as the initial chunk_size_. No other member is named in the initializer list; the noexcept
// specification depends only on whether pool_type is nothrow default constructible.
template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
inline SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::SegmentedMemoryPool(AllocType alloc, std::size_t size) noexcept(std::is_nothrow_default_constructible<pool_type>::value) : allocator_(std::move(alloc)), chunk_size_(size)
{
}
780
781 /*
782 SegmentedMemoryPool::allocate - get an allocation from the memory pool
783
784 Input Parameters:
785 + req_size - size (in elements) to get
786 . ptr - the pointer to hold the allocation
787 - stream - the stream on which to get the allocation
788
789 Output Parameter:
790 . ptr - the pointer holding the allocation
791
792 Notes:
793 `req_size` cannot be negative. If `req_size` if zero, `ptr` is set to `nullptr`
794 */
795 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
allocate(size_t req_size,value_type ** ptr,const stream_type * stream,size_type alignment)796 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::allocate(size_t req_size, value_type **ptr, const stream_type *stream, size_type alignment) noexcept
797 {
798 value_type *ret_ptr = nullptr;
799
800 PetscFunctionBegin;
801 PetscAssertPointer(ptr, 2);
802 PetscAssertPointer(stream, 3);
803 if (req_size) {
804 const auto size = static_cast<size_type>(req_size);
805 auto aligned_size = alignment == alignof(char) ? size : size + alignment;
806 void *vptr = nullptr;
807
808 PetscCall(allocate_(aligned_size, &ret_ptr, stream));
809 vptr = ret_ptr;
810 std::align(alignment, size, vptr, aligned_size);
811 ret_ptr = reinterpret_cast<value_type *>(vptr);
812 // sets memory to infinity or NaN depending on the type to catch out uninitialized memory accesses.
813 if (PetscDefined(USE_DEBUG)) PetscCall(allocator_.set_canary(ret_ptr, size, stream));
814 }
815 *ptr = ret_ptr;
816 PetscFunctionReturn(PETSC_SUCCESS);
817 }
818
819 /*
820 SegmentedMemoryPool::deallocate - release a pointer back to the memory pool
821
822 Input Parameters:
823 + ptr - the pointer to release
824 - stream - the stream to release it on
825
826 Notes:
827 If `ptr` is not owned by the pool it is unchanged.
828 */
829 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
deallocate(value_type ** ptr,const stream_type * stream)830 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::deallocate(value_type **ptr, const stream_type *stream) noexcept
831 {
832 PetscFunctionBegin;
833 PetscAssertPointer(ptr, 1);
834 PetscAssertPointer(stream, 2);
835 // nobody owns a nullptr, and if they do then they have bigger problems
836 if (!*ptr) PetscFunctionReturn(PETSC_SUCCESS);
837 for (auto &block : pool_) {
838 auto found = false;
839
840 PetscCall(block.try_deallocate_chunk(ptr, stream, &found));
841 if (PetscLikely(found)) break;
842 }
843 PetscFunctionReturn(PETSC_SUCCESS);
844 }
845
846 /*
847 SegmentedMemoryPool::reallocate - Resize an allocated buffer
848
849 Input Parameters:
850 + new_req_size - the new buffer size
851 . ptr - pointer to the buffer
852 - stream - stream to resize with
853
854 Output Parameter:
855 . ptr - pointer to the new region
856
857 Notes:
858 `ptr` must have been allocated by the pool.
859
860 It's OK to shrink the buffer, even down to 0 (in which case it is just deallocated).
861 */
862 template <typename MemType, typename StreamType, typename AllocType, std::size_t DefaultChunkSize>
reallocate(size_t new_req_size,value_type ** ptr,const stream_type * stream)863 inline PetscErrorCode SegmentedMemoryPool<MemType, StreamType, AllocType, DefaultChunkSize>::reallocate(size_t new_req_size, value_type **ptr, const stream_type *stream) noexcept
864 {
865 using chunk_type = typename block_type::chunk_type;
866
867 const auto new_size = static_cast<size_type>(new_req_size);
868 const auto old_ptr = *ptr;
869 chunk_type *chunk = nullptr;
870
871 PetscFunctionBegin;
872 PetscAssertPointer(ptr, 2);
873 PetscAssertPointer(stream, 3);
874
875 // if reallocating to zero, just free
876 if (PetscUnlikely(new_size == 0)) {
877 PetscCall(deallocate(ptr, stream));
878 PetscFunctionReturn(PETSC_SUCCESS);
879 }
880
881 // search the blocks for the owning chunk
882 for (auto &block : pool_) {
883 PetscCall(block.try_find_chunk(old_ptr, &chunk));
884 if (chunk) break; // found
885 }
886 PetscAssert(chunk, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Memory pool does not own %p, so cannot reallocate it", *ptr);
887
888 if (chunk->capacity() < new_size) {
889 // chunk does not have enough room, need to grab a fresh chunk and copy to it
890 *ptr = nullptr;
891 PetscCall(chunk->release(stream));
892 PetscCall(allocate(new_size, ptr, stream));
893 PetscCall(allocator_.uninitialized_copy(*ptr, old_ptr, new_size, stream));
894 } else {
895 // chunk had enough room we can simply grow (or shrink) to fit the new size
896 PetscCall(chunk->resize(new_size));
897 }
898 PetscFunctionReturn(PETSC_SUCCESS);
899 }
900
901 } // namespace memory
902
903 } // namespace Petsc
904