#include <petsc/private/deviceimpl.h> /*I <petscdevice.h> I*/

#include <petsc/private/cpp/register_finalize.hpp>
#include <petsc/private/cpp/array.hpp>

#include <unordered_map>
#include <algorithm> // std::find_if
#include <cstring>   // std::memset

// Printable names of the PetscDeviceCopyMode values, listed in enumerator order, followed by
// the enum type name, the common enumerator prefix and a terminating nullptr.
const char *const PetscDeviceCopyModes[] = {"host_to_host", "device_to_host", "host_to_device", "device_to_device", "auto", "PetscDeviceCopyMode", "PETSC_DEVICE_COPY_", nullptr};
// Pin the integral value of every enumerator to the index of its name in the table above, so
// that reordering the enum without updating the table fails at compile time.
static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_HTOH) == 0, "");
static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_DTOH) == 1, "");
static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_HTOD) == 2, "");
static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_DTOD) == 3, "");
static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_AUTO) == 4, "");

// ==========================================================================================
// MemoryMap
//
// Since the pointers allocated via PetscDeviceAllocate() may be device pointers we cannot just
// store meta-data within the pointer itself (as we can't dereference them). So instead we need
// to keep an extra map to keep track of them
//
// Each entry maps pointer -> [PetscMemType, PetscObjectId, size]
// ==========================================================================================

// GCC implementation for std::hash<T*>. LLVM's libc++ is almost 2x slower because they do all
// kinds of complicated murmur hashing, so we make sure to enforce GCC's version.
29 struct PointerHash { 30 std::size_t operator()(const void *ptr) const noexcept { return reinterpret_cast<std::size_t>(ptr); } 31 }; 32 33 class MemoryMap : public Petsc::RegisterFinalizeable<MemoryMap> { 34 public: 35 struct PointerAttributes { 36 PetscMemType mtype{}; // memtype of allocation 37 PetscObjectId id{}; // id of allocation 38 std::size_t size{}; // size of allocation (bytes) 39 40 // even though this is a POD and can be aggregate initialized, the STL uses () constructors 41 // in unordered_map and so we need to provide a trivial contructor... 42 constexpr PointerAttributes(PetscMemType, PetscObjectId, std::size_t) noexcept; 43 constexpr PointerAttributes() noexcept = default; 44 constexpr PointerAttributes(const PointerAttributes &) noexcept = default; 45 PETSC_CONSTEXPR_14 PointerAttributes &operator=(const PointerAttributes &) noexcept = default; 46 constexpr PointerAttributes(PointerAttributes &&) noexcept = default; 47 PETSC_CONSTEXPR_14 PointerAttributes &operator=(PointerAttributes &&) noexcept = default; 48 49 bool operator==(const PointerAttributes &) const noexcept; 50 51 PETSC_NODISCARD bool contains(const void *, const void *) const noexcept; 52 }; 53 54 using map_type = std::unordered_map<void *, PointerAttributes, PointerHash>; 55 56 map_type map; 57 58 // return the iterator of the allocation containing ptr, or map.cend() if not found 59 PETSC_NODISCARD map_type::const_iterator search_for(const void *) const noexcept; 60 61 private: 62 friend class Petsc::RegisterFinalizeable<MemoryMap>; 63 PETSC_NODISCARD PetscErrorCode register_finalize_() noexcept; 64 PETSC_NODISCARD PetscErrorCode finalize_() noexcept; 65 }; 66 67 // ========================================================================================== 68 // PointerAttributes 69 // ========================================================================================== 70 71 constexpr MemoryMap::PointerAttributes::PointerAttributes(PetscMemType mtype_, PetscObjectId id_, 
std::size_t size_) noexcept : mtype(mtype_), id(id_), size(size_) { } 72 73 bool MemoryMap::PointerAttributes::operator==(const PointerAttributes &other) const noexcept { 74 return mtype == other.mtype && id == other.id && size == other.size; 75 } 76 77 bool MemoryMap::PointerAttributes::contains(const void *ptr_begin, const void *ptr) const noexcept { 78 return (ptr >= ptr_begin) && (ptr < (static_cast<const char *>(ptr_begin) + size)); 79 } 80 81 // ========================================================================================== 82 // Memory map functions 83 // ========================================================================================== 84 85 PetscErrorCode MemoryMap::register_finalize_() noexcept { 86 PetscFunctionBegin; 87 // Preallocate, this does give a modest performance bump since unordered_map is so __dog__ 88 // slow if it needs to rehash. Experiments show that users tend not to have more than 5 or 89 // so concurrently live pointers lying around. 10 at most. 90 PetscCallCXX(map.reserve(16)); 91 PetscFunctionReturn(0); 92 } 93 94 PetscErrorCode MemoryMap::finalize_() noexcept { 95 PetscFunctionBegin; 96 PetscCall(PetscInfo(nullptr, "Finalizing memory map\n")); 97 PetscCallCXX(map = map_type{}); 98 PetscFunctionReturn(0); 99 } 100 101 /* 102 MemoryMap::search_for - retrieve an iterator to the key-value pair for a pointer in the map 103 104 Input Parameter: 105 . ptr - pointer to search for 106 107 Notes: 108 Accounts for sub-regions, i.e. if ptr is contained within another pointers region, it returns 109 the iterator to the super-pointers key-value pair. 
110 111 If ptr is not found, returns map.end() 112 */ 113 MemoryMap::map_type::const_iterator MemoryMap::search_for(const void *ptr) const noexcept { 114 const auto end = map.end(); 115 const auto it = map.find(const_cast<map_type::key_type>(ptr)); 116 117 // ptr was found, and points to an entire block 118 if (it != end) return it; 119 // wasn't found, but maybe its part of a block. have to search every block for it 120 // clang-format off 121 return std::find_if(map.begin(), end, [ptr](const map_type::const_iterator::value_type &map_it) { 122 return map_it.second.contains(map_it.first, ptr); 123 }); 124 // clang-format on 125 } 126 127 static MemoryMap memory_map; 128 129 // ========================================================================================== 130 // Utility functions 131 // ========================================================================================== 132 133 static PetscErrorCode PetscDeviceCheckCapable_Private(PetscDeviceContext dctx, bool cond, const char descr[]) { 134 PetscFunctionBegin; 135 PetscCheck(cond, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device context (id: %" PetscInt64_FMT ", name: %s, type: %s) can only handle %s host memory", PetscObjectCast(dctx)->id, PetscObjectCast(dctx)->name, dctx->device ? PetscDeviceTypes[dctx->device->type] : "unknown", descr); 136 PetscFunctionReturn(0); 137 } 138 139 // A helper utility, since register is called from PetscDeviceRegisterMemory() and 140 // PetscDevicAllocate(). 
The latter also needs the generated id, so instead of making it search 141 // the map again we just return it here 142 static PetscErrorCode PetscDeviceRegisterMemory_Private(const void *PETSC_RESTRICT ptr, PetscMemType mtype, std::size_t size, PetscObjectId *PETSC_RESTRICT id = nullptr) { 143 auto &map = memory_map.map; 144 const auto it = memory_map.search_for(ptr); 145 146 PetscFunctionBegin; 147 if (it == map.cend()) { 148 // pointer was never registered with the map, insert it and bail 149 const auto newid = PetscObjectNewId_Internal(); 150 151 if (PetscDefined(USE_DEBUG)) { 152 const auto tmp = MemoryMap::PointerAttributes(mtype, newid, size); 153 154 for (const auto &entry : map) { 155 // REVIEW ME: maybe this should just be handled... 156 PetscCheck(!tmp.contains(ptr, entry.first), PETSC_COMM_SELF, PETSC_ERR_ORDER, "Trying to register pointer %p (memtype %s, size %zu) but it appears you have already registered a sub-region of it (pointer %p, memtype %s, size %zu). Must register the larger region first", ptr, PetscMemTypeToString(mtype), size, 157 entry.first, PetscMemTypeToString(entry.second.mtype), entry.second.size); 158 } 159 } 160 // clang-format off 161 if (id) *id = newid; 162 PetscCallCXX(map.emplace( 163 std::piecewise_construct, 164 std::forward_as_tuple(const_cast<MemoryMap::map_type::key_type>(ptr)), 165 std::forward_as_tuple(mtype, newid, size) 166 )); 167 // clang-format on 168 PetscFunctionReturn(0); 169 } 170 if (PetscDefined(USE_DEBUG)) { 171 const auto &old = it->second; 172 173 PetscCheck(MemoryMap::PointerAttributes(mtype, old.id, size) == old, PETSC_COMM_SELF, PETSC_ERR_LIB, "Pointer %p appears to have been previously allocated with memtype %s, size %zu and assigned id %" PetscInt64_FMT ", which does not match new values: (mtype %s, size %zu, id %" PetscInt64_FMT ")", it->first, 174 PetscMemTypeToString(old.mtype), old.size, old.id, PetscMemTypeToString(mtype), size, old.id); 175 } 176 if (id) *id = it->second.id; 177 
PetscFunctionReturn(0); 178 } 179 180 /*@C 181 PetscDeviceRegisterMemory - Register a pointer for use with device-aware memory system 182 183 Not Collective 184 185 Input Parameters: 186 + ptr - The pointer to register 187 . mtype - The `PetscMemType` of the pointer 188 - size - The size (in bytes) of the memory region 189 190 Notes: 191 `ptr` need not point to the beginning of the memory range, however the user should register 192 the 193 194 It's OK to re-register the same `ptr` repeatedly (subsequent registrations do nothing) 195 however the given `mtype` and `size` must match the original registration. 196 197 `size` may be 0 (in which case this routine does nothing). 198 199 Level: intermediate 200 201 .seealso: `PetscDeviceMalloc()`, `PetscDeviceArrayCopy()`, `PetscDeviceFree()`, 202 `PetscDeviceArrayZero()` 203 @*/ 204 PetscErrorCode PetscDeviceRegisterMemory(const void *PETSC_RESTRICT ptr, PetscMemType mtype, std::size_t size) { 205 PetscFunctionBegin; 206 if (PetscMemTypeHost(mtype)) PetscValidPointer(ptr, 1); 207 if (PetscUnlikely(!size)) PetscFunctionReturn(0); // there is no point registering empty range 208 PetscCall(PetscDeviceRegisterMemory_Private(ptr, mtype, size)); 209 PetscFunctionReturn(0); 210 } 211 212 /*@C 213 PetscDeviceAllocate - Allocate device-aware memory 214 215 Not Collective, Asynchronous, Auto-dependency aware 216 217 Input Parameters: 218 + dctx - The `PetscDeviceContext` used to allocate the memory 219 . clear - Whether or not the memory should be zeroed 220 . mtype - The type of memory to allocate 221 - n - The amount (in bytes) to allocate 222 223 Output Parameter: 224 . ptr - The pointer to store the result in 225 226 Notes: 227 The user should prefer `PetscDeviceMalloc()` over this routine as it automatically computes 228 the size of the allocation based on the size of the datatype. 229 230 Memory allocated with this function must be freed with `PetscDeviceFree()` or 231 `PetscDeviceDeallocate()`. 
232 233 Note result stored `ptr` is immediately valid and the user may freely inspect or manipulate 234 its value on function return, i.e.\: 235 236 .vb 237 PetscInt *ptr; 238 239 PetscDeviceAllocate(dctx, PETSC_FALSE, PETSC_MEMTYPE_DEVICE, 20, (void**)&ptr); 240 241 PetscInt *sub_ptr = ptr + 10; // OK, no need to synchronize 242 243 ptr[0] = 10; // ERROR, directly accessing contents of ptr is undefined until synchronization 244 .ve 245 246 If `n` is zero, then `ptr` is set to `PETSC_NULLPTR`. 247 248 DAG representation: 249 .vb 250 time -> 251 252 -> dctx - |= CALL =| -\- dctx --> 253 \- ptr -> 254 .ve 255 256 Level: intermediate 257 258 .N ASYNC_API 259 260 .seealso: `PetscDeviceMalloc()`, `PetscGetMemType()`, `PetscDeviceFree()`, 261 `PetscDeviceDeallocate()`, `PetscDeviceArrayCopy()`, `PetscDeviceArrayZero()` 262 @*/ 263 PetscErrorCode PetscDeviceAllocate(PetscDeviceContext dctx, PetscBool clear, PetscMemType mtype, size_t n, void **PETSC_RESTRICT ptr) { 264 PetscObjectId id = 0; 265 266 PetscFunctionBegin; 267 PetscValidPointer(ptr, 5); 268 *ptr = nullptr; 269 if (PetscUnlikely(!n)) PetscFunctionReturn(0); 270 PetscCall(memory_map.register_finalize()); 271 PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx)); 272 273 // get our pointer here 274 if (dctx->ops->memalloc) { 275 PetscUseTypeMethod(dctx, memalloc, clear, mtype, n, ptr); 276 } else { 277 PetscCall(PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(mtype), "allocating")); 278 PetscCall(PetscMallocA(1, clear, __LINE__, PETSC_FUNCTION_NAME, __FILE__, n, ptr)); 279 } 280 PetscCall(PetscDeviceRegisterMemory_Private(*ptr, mtype, n, &id)); 281 // Note this is a "write" so that the next dctx to try and read from the pointer has to wait 282 // for the allocation to be ready 283 PetscCall(PetscDeviceContextMarkIntentFromID(dctx, id, PETSC_MEMORY_ACCESS_WRITE, "memory allocation")); 284 PetscFunctionReturn(0); 285 } 286 287 /*@C 288 PetscDeviceDeallocate - Free device-aware memory 289 290 
Not Collective, Asynchronous, Auto-dependency aware 291 292 Input Parameters: 293 + dctx - The `PetscDeviceContext` used to free the memory 294 - ptr - The pointer to free 295 296 Notes: 297 `ptr` must have been allocated using any of `PetscDeviceMalloc()`, `PetscDeviceCalloc()` or 298 `PetscDeviceAllocate()`, or registered with the system via `PetscDeviceRegisterMemory()`. 299 300 The user should prefer `PetscDeviceFree()` over this routine as it automatically sets `ptr` 301 to `PETSC_NULLPTR` on successful deallocation. 302 303 `ptr` may be `NULL`. 304 305 DAG representation: 306 .vb 307 time -> 308 309 -> dctx -/- |= CALL =| - dctx -> 310 -> ptr -/ 311 .ve 312 313 Level: intermediate 314 315 .N ASYNC_API 316 317 .seealso: `PetscDeviceFree()`, `PetscDeviceAllocate()` 318 @*/ 319 PetscErrorCode PetscDeviceDeallocate(PetscDeviceContext dctx, void *PETSC_RESTRICT ptr) { 320 PetscFunctionBegin; 321 if (ptr) { 322 auto &map = memory_map.map; 323 const auto found_it = map.find(const_cast<MemoryMap::map_type::key_type>(ptr)); 324 325 if (PetscUnlikelyDebug(found_it == map.end())) { 326 // OK this is a bad pointer, now determine why 327 const auto it = memory_map.search_for(ptr); 328 329 // if it is map.cend() then no allocation owns it, meaning it was not allocated by us! 
330 PetscCheck(it != map.cend(), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Pointer %p was not allocated via PetscDeviceAllocate()", ptr); 331 // if we are here then we did allocate it but the user has tried to do something along 332 // the lines of: 333 // 334 // allocate(&ptr, size); 335 // deallocate(ptr+5); 336 // 337 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Attempting to deallocate pointer %p which is a suballocation of %p (memtype %s, id %" PetscInt64_FMT ", size %zu bytes)", ptr, it->first, PetscMemTypeToString(it->second.mtype), it->second.id, 338 it->second.size); 339 } 340 PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx)); 341 342 // mark intent BEFORE we free, note we mark as write so that we are made to wait on any 343 // outstanding reads (don't want to kill the pointer before they are done) 344 PetscCall(PetscDeviceContextMarkIntentFromID(dctx, found_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory deallocation")); 345 346 // do free 347 if (dctx->ops->memfree) { 348 PetscUseTypeMethod(dctx, memfree, found_it->second.mtype, (void **)&ptr); 349 } else { 350 PetscCall(PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(found_it->second.mtype), "freeing")); 351 } 352 // if ptr still exists, then the device context could not handle it 353 if (ptr) PetscCall(PetscFree(ptr)); 354 PetscCallCXX(map.erase(found_it)); 355 } 356 PetscFunctionReturn(0); 357 } 358 359 /*@C 360 PetscDeviceMemcpy - Copy memory in a device-aware manner 361 362 Not Collective, Asynchronous, Auto-dependency aware 363 364 Input Parameters: 365 + dctx - The `PetscDeviceContext` used to copy the memory 366 . dest - The pointer to copy to 367 . src - The pointer to copy from 368 - n - The amount (in bytes) to copy 369 370 Notes: 371 Both `dest` and `src` must have been allocated by `PetscDeviceAllocate()` or 372 `PetscDeviceMalloc()` or registered with `PetscDeviceRegisterMemory()`. 373 374 `src` and `dest` cannot overlap. 
375 376 If both `src` and `dest` are on the host this routine is fully synchronous. 377 378 The user should prefer `PetscDeviceArrayCopy()` over this routine as it automatically 379 computes the number of bytes to copy from the size of the pointer types. 380 381 DAG representation: 382 .vb 383 time -> 384 385 -> dctx - |= CALL =| - dctx -> 386 -> dest ---------------------> 387 -> src ----------------------> 388 .ve 389 390 Level: intermediate 391 392 .N ASYNC_API 393 394 .seealso: `PetscDeviceArrayCopy()`, `PetscDeviceMalloc()`, `PetscDeviceFree()` 395 @*/ 396 PetscErrorCode PetscDeviceMemcpy(PetscDeviceContext dctx, void *PETSC_RESTRICT dest, const void *PETSC_RESTRICT src, std::size_t n) { 397 PetscFunctionBegin; 398 if (!n) PetscFunctionReturn(0); 399 PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer"); 400 PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer"); 401 if (dest == src) PetscFunctionReturn(0); 402 PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx)); 403 404 { 405 const auto dest_it = memory_map.search_for(dest); 406 const auto src_it = memory_map.search_for(src); 407 408 PetscAssert(dest_it != memory_map.map.cend(), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Destination pointer %p was not registered with the memory tracker, call PetscDeviceRegisterMemory() on it", dest); 409 PetscAssert(src_it != memory_map.map.cend(), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Source pointer %p was not registered with the memory tracker, call PetscDeviceRegisterMemory() on it", src); 410 411 PetscCall(PetscDeviceContextMarkIntentFromID(dctx, src_it->second.id, PETSC_MEMORY_ACCESS_READ, "memory copy (src)")); 412 PetscCall(PetscDeviceContextMarkIntentFromID(dctx, dest_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory copy (dest)")); 413 414 const auto mode = PetscMemTypeToDeviceCopyMode(dest_it->second.mtype, src_it->second.mtype); 415 // perform the copy 416 if (dctx->ops->memcopy) { 417 
PetscUseTypeMethod(dctx, memcopy, dest, src, n, mode); 418 } else { 419 // REVIEW ME: we might potentially need to sync here if the memory is device-allocated 420 // (pinned) but being copied by a host dctx 421 PetscCall(PetscDeviceCheckCapable_Private(dctx, mode == PETSC_DEVICE_COPY_HTOH, "copying")); 422 PetscCall(PetscMemcpy(dest, src, n)); 423 } 424 425 if (mode == PETSC_DEVICE_COPY_HTOD) { 426 PetscCall(PetscLogCpuToGpu(n)); 427 } else if (mode == PETSC_DEVICE_COPY_DTOH) { 428 PetscCall(PetscLogGpuToCpu(n)); 429 } 430 } 431 PetscFunctionReturn(0); 432 } 433 434 /*@C 435 PetscDeviceMemset - Memset device-aware memory 436 437 Not Collective, Asynchronous, Auto-dependency aware 438 439 Input Parameters: 440 + dctx - The `PetscDeviceContext` used to memset the memory 441 . ptr - The pointer to the memory 442 . v - The value to set 443 - n - The amount (in bytes) to set 444 445 Notes: 446 `ptr` must have been allocated by `PetscDeviceAllocate()` or `PetscDeviceMalloc()` or 447 registered with `PetscDeviceRegisterMemory()`. 448 449 The user should prefer `PetscDeviceArrayZero()` over this routine as it automatically 450 computes the number of bytes to copy from the size of the pointer types, though they should 451 note that it only zeros memory. 452 453 This routine is analogous to `memset()`. That is, this routine copies the value 454 `static_cast<unsigned char>(v)` into each of the first count characters of the object pointed 455 to by `dest`. 456 457 If `dest` is on device, this routine is asynchronous. 
458 459 DAG representation: 460 .vb 461 time -> 462 463 -> dctx - |= CALL =| - dctx -> 464 -> dest ---------------------> 465 .ve 466 467 Level: intermediate 468 469 .N ASYNC_API 470 471 .seealso: `PetscDeviceArrayZero()`, `PetscDeviceMalloc()`, `PetscDeviceFree()` 472 @*/ 473 PetscErrorCode PetscDeviceMemset(PetscDeviceContext dctx, void *ptr, PetscInt v, std::size_t n) { 474 PetscFunctionBegin; 475 if (PetscUnlikely(!n)) PetscFunctionReturn(0); 476 PetscCheck(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer"); 477 PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx)); 478 { 479 const auto ptr_it = memory_map.search_for(ptr); 480 481 // mark 482 PetscAssert(ptr_it != memory_map.map.cend(), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p was not registered with the memory tracker, call PetscDeviceRegisterMemory() on it", ptr); 483 PetscCall(PetscDeviceContextMarkIntentFromID(dctx, ptr_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory set")); 484 485 // set 486 if (dctx->ops->memset) { 487 PetscUseTypeMethod(dctx, memset, ptr_it->second.mtype, ptr, v, n); 488 } else { 489 // REVIEW ME: we might potentially need to sync here if the memory is device-allocated 490 // (pinned) but being memset by a host dctx 491 PetscCall(PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(ptr_it->second.mtype), "memsetting")); 492 std::memset(ptr, static_cast<int>(v), n); 493 } 494 } 495 PetscFunctionReturn(0); 496 } 497