1 #pragma once 2 3 #include <petscdevice_cupm.h> 4 5 #include <petsc/private/cpputil.hpp> 6 #include <petsc/private/petscadvancedmacros.h> 7 8 #include <petsc/private/cpp/array.hpp> 9 10 namespace Petsc 11 { 12 13 namespace device 14 { 15 16 namespace cupm 17 { 18 19 // enum describing available cupm devices, this is used as the template parameter to any 20 // class subclassing the Interface or using it as a member variable 21 enum class DeviceType : int { 22 CUDA, 23 HIP 24 }; 25 26 // clang-format off 27 static constexpr std::array<const char *const, 5> DeviceTypes = { 28 "cuda", 29 "hip", 30 "Petsc::device::cupm::DeviceType", 31 "Petsc::device::cupm::DeviceType::", 32 nullptr 33 }; 34 // clang-format on 35 36 namespace impl 37 { 38 39 #define PetscCallCUPM_(__abort_fn__, __comm__, ...) \ 40 do { \ 41 PetscStackUpdateLine; \ 42 const cupmError_t cerr_p_ = __VA_ARGS__; \ 43 __abort_fn__(cerr_p_ == cupmSuccess, __comm__, PETSC_ERR_GPU, "%s error %d (%s) : %s", cupmName(), static_cast<PetscErrorCode>(cerr_p_), cupmGetErrorName(cerr_p_), cupmGetErrorString(cerr_p_)); \ 44 } while (0) 45 46 // A backend agnostic PetscCallCUPM() function, this will only work inside the member 47 // functions of a class inheriting from CUPM::Interface. Thanks to __VA_ARGS__ templated 48 // functions can also be wrapped inline: 49 // 50 // PetscCallCUPM(foo<int,char,bool>()); 51 #define PetscCallCUPM(...) PetscCallCUPM_(PetscCheck, PETSC_COMM_SELF, __VA_ARGS__) 52 #define PetscCallCUPMAbort(comm_, ...) PetscCallCUPM_(PetscCheckAbort, comm_, __VA_ARGS__) 53 54 // PETSC_CUPM_ALIAS_FUNCTION() - declaration to alias a cuda/hip function 55 // 56 // input params: 57 // our_name - the name of the alias 58 // their_name - the name of the function being aliased 59 // 60 // notes: 61 // see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion 62 // 63 // example usage: 64 // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc) -> 65 // template <typename... T> 66 // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction* 67 // { 68 // return cudaMalloc(std::forward<T>(args)...); 69 // } 70 // 71 // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc) -> 72 // template <typename... T> 73 // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction* 74 // { 75 // return hipMalloc(std::forward<T>(args)...); 76 // } 77 #define PETSC_CUPM_ALIAS_FUNCTION(our_name, their_name) PETSC_ALIAS_FUNCTION(static our_name, their_name) 78 79 // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE() - declaration to alias a cuda/hip function but 80 // discard the last N arguments 81 // 82 // input params: 83 // our_name - the name of the alias 84 // their_name - the name of the function being aliased 85 // N - integer constant [0, INT_MAX) dictating how many arguments to chop off the end 86 // 87 // notes: 88 // see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the expansion 89 // 90 // example use: 91 // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(cupmMallocAsync, cudaMalloc, 1) -> 92 // template <typename... T, typename Tend> 93 // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing 94 // return type deduction* 95 // { 96 // (void)argend; 97 // return cudaMalloc(std::forward<T>(args)...); 98 // } 99 #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(our_name, their_name, N) PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(static our_name, their_name, N) 100 101 // Base class that holds functions and variables that don't require CUDA or HIP to be present 102 // on the system 103 template <DeviceType T> 104 struct InterfaceBase { 105 static const DeviceType type = T; 106 cupmNamePetsc::device::cupm::impl::InterfaceBase107 PETSC_NODISCARD static constexpr const char *cupmName() noexcept 108 { 109 static_assert(util::to_underlying(DeviceType::CUDA) == 0, ""); 110 static_assert(util::to_underlying(DeviceType::HIP) == 1, ""); 111 return std::get<util::to_underlying(T)>(DeviceTypes); 112 } 113 cupmNAMEPetsc::device::cupm::impl::InterfaceBase114 PETSC_NODISCARD static constexpr const char *cupmNAME() noexcept { return T == DeviceType::CUDA ? "CUDA" : "HIP"; } 115 PETSC_DEVICE_CUPMPetsc::device::cupm::impl::InterfaceBase116 PETSC_NODISCARD static constexpr PetscDeviceType PETSC_DEVICE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_DEVICE_CUDA : PETSC_DEVICE_HIP; } 117 PETSC_MEMTYPE_CUPMPetsc::device::cupm::impl::InterfaceBase118 PETSC_NODISCARD static constexpr PetscMemType PETSC_MEMTYPE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_MEMTYPE_CUDA : PETSC_MEMTYPE_HIP; } 119 }; 120 121 // declare the base class static member variables 122 template <DeviceType T> 123 const DeviceType InterfaceBase<T>::type; 124 125 #define PETSC_CUPM_BASE_CLASS_HEADER(T) \ 126 using ::Petsc::device::cupm::impl::InterfaceBase<T>::type; \ 127 using ::Petsc::device::cupm::impl::InterfaceBase<T>::cupmName; \ 128 using ::Petsc::device::cupm::impl::InterfaceBase<T>::cupmNAME; \ 129 using ::Petsc::device::cupm::impl::InterfaceBase<T>::PETSC_DEVICE_CUPM; \ 130 using ::Petsc::device::cupm::impl::InterfaceBase<T>::PETSC_MEMTYPE_CUPM 131 132 // A templated C++ struct that defines the entire CUPM interface. Use of templating vs 133 // preprocessor macros allows us to use both interfaces simultaneously as well as easily 134 // import them into classes. 135 template <DeviceType> 136 struct InterfaceImpl; 137 138 #if PetscDefined(HAVE_CUDA) 139 template <> 140 struct InterfaceImpl<DeviceType::CUDA> : InterfaceBase<DeviceType::CUDA> { 141 PETSC_CUPM_BASE_CLASS_HEADER(DeviceType::CUDA); 142 143 // typedefs 144 using cupmError_t = cudaError_t; 145 using cupmEvent_t = cudaEvent_t; 146 using cupmStream_t = cudaStream_t; 147 using cupmDeviceProp_t = cudaDeviceProp; 148 using cupmMemcpyKind_t = cudaMemcpyKind; 149 using cupmDeviceAttr_t = cudaDeviceAttr; 150 using cupmComplex_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), cuComplex, cuDoubleComplex>; 151 using cupmPointerAttributes_t = cudaPointerAttributes; 152 using cupmMemoryType_t = enum cudaMemoryType; 153 using cupmDim3 = dim3; 154 using cupmHostFn_t = cudaHostFn_t; 155 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0) 156 using cupmMemPool_t = cudaMemPool_t; 157 using cupmMemPoolAttr = cudaMemPoolAttr; 158 #else 159 using cupmMemPool_t = void *; 160 using cupmMemPoolAttr = unsigned int; 161 #endif 162 163 // values 164 static const auto cupmSuccess = cudaSuccess; 165 static const auto cupmErrorNotReady = cudaErrorNotReady; 166 static const auto cupmErrorDeviceAlreadyInUse = cudaErrorDeviceAlreadyInUse; 167 static const auto cupmErrorSetOnActiveProcess = cudaErrorSetOnActiveProcess; 168 static const auto cupmErrorStubLibrary = 169 #if PETSC_PKG_CUDA_VERSION_GE(11, 1, 0) 170 cudaErrorStubLibrary; 171 #else 172 cudaErrorInsufficientDriver; 173 #endif 174 175 static const auto cupmErrorNoDevice = cudaErrorNoDevice; 176 static const auto cupmStreamDefault = cudaStreamDefault; 177 static const auto cupmStreamNonBlocking = cudaStreamNonBlocking; 178 static const auto cupmDeviceMapHost = cudaDeviceMapHost; 179 static const auto cupmMemcpyHostToDevice = cudaMemcpyHostToDevice; 180 static const auto cupmMemcpyDeviceToHost = cudaMemcpyDeviceToHost; 181 static const auto cupmMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice; 182 static const auto cupmMemcpyHostToHost = cudaMemcpyHostToHost; 183 static const auto cupmMemcpyDefault = cudaMemcpyDefault; 184 static const auto cupmMemoryTypeHost = cudaMemoryTypeHost; 185 static const auto cupmMemoryTypeDevice = cudaMemoryTypeDevice; 186 static const auto cupmMemoryTypeManaged = cudaMemoryTypeManaged; 187 static const auto cupmEventDisableTiming = cudaEventDisableTiming; 188 static const auto cupmHostAllocDefault = cudaHostAllocDefault; 189 static const auto cupmHostAllocWriteCombined = cudaHostAllocWriteCombined; 190 static const auto cupmMemPoolAttrReleaseThreshold = 191 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0) 192 cudaMemPoolAttrReleaseThreshold; 193 #else 194 cupmMemPoolAttr{0}; 195 #endif 196 static const auto cupmDevAttrClockRate = cudaDevAttrClockRate; 197 static const auto cupmDevAttrMemoryClockRate = cudaDevAttrMemoryClockRate; 198 199 // error functions PETSC_CUPM_ALIAS_FUNCTIONPetsc::device::cupm::impl::InterfaceImpl200 PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, cudaGetErrorName) 201 PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, cudaGetErrorString) 202 PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, cudaGetLastError) 203 204 // device management 205 PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, cudaGetDeviceCount) 206 PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, cudaGetDeviceProperties) 207 PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetAttribute, cudaDeviceGetAttribute) 208 PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, cudaGetDevice) 209 PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, cudaSetDevice) 210 PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, cudaGetDeviceFlags) 211 PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, cudaSetDeviceFlags) 212 PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, cudaPointerGetAttributes) 213 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0) 214 PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, cudaDeviceGetMemPool) 215 PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, cudaMemPoolSetAttribute) 216 #else 217 PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept 218 { 219 *pool = nullptr; 220 return cupmSuccess; 221 } 222 223 PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; } 224 #endif 225 // CUDA has no cudaInit() to match hipInit() 226 PETSC_NODISCARD static cupmError_t cupmInit(unsigned int) noexcept { return cudaFree(nullptr); } 227 228 // stream management PETSC_CUPM_ALIAS_FUNCTIONPetsc::device::cupm::impl::InterfaceImpl229 PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, cudaEventCreate) 230 PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, cudaEventCreateWithFlags) 231 PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, cudaEventDestroy) 232 PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, cudaEventRecord) 233 PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, cudaEventSynchronize) 234 PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, cudaEventElapsedTime) 235 PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, cudaEventQuery) 236 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, cudaStreamCreate) 237 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, cudaStreamCreateWithFlags) 238 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, cudaStreamGetFlags) 239 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, cudaStreamDestroy) 240 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, cudaStreamWaitEvent) 241 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, cudaStreamQuery) 242 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, cudaStreamSynchronize) 243 PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, cudaDeviceSynchronize) 244 PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, cudaGetSymbolAddress) 245 246 // memory management 247 PETSC_CUPM_ALIAS_FUNCTION(cupmFree, cudaFree) 248 PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc) 249 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0) 250 PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, cudaFreeAsync) 251 PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, cudaMallocAsync) 252 #else 253 PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, cudaFree, 1) 254 PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, cudaMalloc, 1) 255 #endif 256 PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, cudaMemcpy) 257 PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, cudaMemcpyAsync) 258 PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, cudaMallocHost) 259 PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, cudaFreeHost) 260 PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, cudaMemset) 261 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0) 262 PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, cudaMemsetAsync) 263 #else 264 PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemsetAsync, cudaMemset, 1) 265 #endif 266 PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy2D, cudaMemcpy2D) 267 PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemcpy2DAsync, cudaMemcpy2DAsync, 1) 268 PETSC_CUPM_ALIAS_FUNCTION(cupmMemset2D, cudaMemset2D) 269 PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemset2DAsync, cudaMemset2DAsync, 1) 270 271 // launch control 272 PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, cudaLaunchHostFunc) 273 template <typename FunctionT, typename... KernelArgsT> 274 PETSC_NODISCARD static cudaError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, cudaStream_t stream, KernelArgsT &&...kernelArgs) noexcept 275 { 276 static_assert(!std::is_pointer<FunctionT>::value, "kernel function must not be passed by pointer"); 277 void *args[] = {(void *)std::addressof(kernelArgs)...}; 278 279 return cudaLaunchKernel<util::remove_reference_t<FunctionT>>(std::addressof(func), std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream)); 280 } 281 }; 282 #endif // PetscDefined(HAVE_CUDA) 283 284 #if PetscDefined(HAVE_HIP) 285 template <> 286 struct InterfaceImpl<DeviceType::HIP> : InterfaceBase<DeviceType::HIP> { 287 PETSC_CUPM_BASE_CLASS_HEADER(DeviceType::HIP); 288 289 // typedefs 290 using cupmError_t = hipError_t; 291 using cupmEvent_t = hipEvent_t; 292 using cupmStream_t = hipStream_t; 293 using cupmDeviceProp_t = hipDeviceProp_t; 294 using cupmMemcpyKind_t = hipMemcpyKind; 295 using cupmDeviceAttr_t = hipDeviceAttribute_t; 296 using cupmComplex_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), hipComplex, hipDoubleComplex>; 297 using cupmPointerAttributes_t = hipPointerAttribute_t; 298 using cupmMemoryType_t = enum hipMemoryType; 299 using cupmDim3 = dim3; 300 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0) 301 using cupmHostFn_t = hipHostFn_t; 302 using cupmMemPool_t = hipMemPool_t; 303 using cupmMemPoolAttr = hipMemPoolAttr; 304 #else 305 using cupmHostFn_t = void (*)(void *); 306 using cupmMemPool_t = void *; 307 using cupmMemPoolAttr = unsigned int; 308 #endif 309 310 // values 311 static const auto cupmSuccess = hipSuccess; 312 static const auto cupmErrorNotReady = hipErrorNotReady; 313 // see https://github.com/ROCm-Developer-Tools/HIP/blob/develop/bin/hipify-perl 314 static const auto cupmErrorDeviceAlreadyInUse = hipErrorContextAlreadyInUse; 315 static const auto cupmErrorSetOnActiveProcess = hipErrorSetOnActiveProcess; 316 // as of HIP v4.2 cudaErrorStubLibrary has no HIP equivalent 317 static const auto cupmErrorStubLibrary = hipErrorInsufficientDriver; 318 static const auto cupmErrorNoDevice = hipErrorNoDevice; 319 static const auto cupmStreamDefault = hipStreamDefault; 320 static const auto cupmStreamNonBlocking = hipStreamNonBlocking; 321 static const auto cupmDeviceMapHost = hipDeviceMapHost; 322 static const auto cupmMemcpyHostToDevice = hipMemcpyHostToDevice; 323 static const auto cupmMemcpyDeviceToHost = hipMemcpyDeviceToHost; 324 static const auto cupmMemcpyDeviceToDevice = hipMemcpyDeviceToDevice; 325 static const auto cupmMemcpyHostToHost = hipMemcpyHostToHost; 326 static const auto cupmMemcpyDefault = hipMemcpyDefault; 327 static const auto cupmMemoryTypeHost = hipMemoryTypeHost; 328 static const auto cupmMemoryTypeDevice = hipMemoryTypeDevice; 329 // see 330 // https://github.com/ROCm-Developer-Tools/HIP/blob/develop/include/hip/hip_runtime_api.h#L156 331 static const auto cupmMemoryTypeManaged = hipMemoryTypeUnified; 332 static const auto cupmEventDisableTiming = hipEventDisableTiming; 333 static const auto cupmHostAllocDefault = hipHostMallocDefault; 334 static const auto cupmHostAllocWriteCombined = hipHostMallocWriteCombined; 335 static const auto cupmMemPoolAttrReleaseThreshold = 336 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0) 337 hipMemPoolAttrReleaseThreshold; 338 #else 339 cupmMemPoolAttr{0}; 340 #endif 341 static const auto cupmDevAttrClockRate = hipDeviceAttributeClockRate; 342 static const auto cupmDevAttrMemoryClockRate = hipDeviceAttributeMemoryClockRate; 343 344 // error functions PETSC_CUPM_ALIAS_FUNCTIONPetsc::device::cupm::impl::InterfaceImpl345 PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, hipGetErrorName) 346 PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, hipGetErrorString) 347 PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, hipGetLastError) 348 349 // device management 350 PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, hipGetDeviceCount) 351 PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, hipGetDeviceProperties) 352 PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetAttribute, hipDeviceGetAttribute) 353 PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, hipGetDevice) 354 PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, hipSetDevice) 355 PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, hipGetDeviceFlags) 356 PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, hipSetDeviceFlags) 357 PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, hipPointerGetAttributes) 358 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0) 359 PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, hipDeviceGetMemPool) 360 PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, hipMemPoolSetAttribute) 361 #else 362 PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept 363 { 364 *pool = nullptr; 365 return cupmSuccess; 366 } 367 368 PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; } 369 #endif 370 PETSC_CUPM_ALIAS_FUNCTION(cupmInit, hipInit) 371 372 // stream management 373 PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, hipEventCreate) 374 PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, hipEventCreateWithFlags) 375 PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, hipEventDestroy) 376 PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, hipEventRecord) 377 PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, hipEventSynchronize) 378 PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, hipEventElapsedTime) 379 PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, hipEventQuery) 380 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, hipStreamCreate) 381 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, hipStreamCreateWithFlags) 382 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, hipStreamGetFlags) 383 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, hipStreamDestroy) 384 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, hipStreamWaitEvent) 385 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, hipStreamQuery) 386 PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, hipStreamSynchronize) 387 PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, hipDeviceSynchronize) 388 PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, hipGetSymbolAddress) 389 390 // memory management 391 PETSC_CUPM_ALIAS_FUNCTION(cupmFree, hipFree) 392 PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc) 393 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0) 394 PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, hipMallocAsync) 395 PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, hipFreeAsync) 396 #else 397 PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, hipMalloc, 1) 398 PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, hipFree, 1) 399 #endif 400 PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, hipMemcpy) 401 PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, hipMemcpyAsync) 402 // hipMallocHost is deprecated 403 PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, hipHostMalloc) 404 // hipFreeHost is deprecated 405 PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, hipHostFree) 406 PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, hipMemset) 407 PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, hipMemsetAsync) 408 PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy2D, hipMemcpy2D) 409 PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemcpy2DAsync, hipMemcpy2DAsync, 1) 410 PETSC_CUPM_ALIAS_FUNCTION(cupmMemset2D, hipMemset2D) 411 PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemset2DAsync, hipMemset2DAsync, 1) 412 413 // launch control 414 // HIP appears to only have hipLaunchHostFunc from 5.2.0 onwards 415 // https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md#7-execution-control= 416 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0) 417 PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, hipLaunchHostFunc) 418 #else 419 PETSC_NODISCARD static hipError_t cupmLaunchHostFunc(hipStream_t stream, cupmHostFn_t fn, void *ctx) noexcept 420 { 421 // the only correct way to spoof this function is to do it synchronously... 422 auto herr = hipStreamSynchronize(stream); 423 if (PetscUnlikely(herr != hipSuccess)) return herr; 424 fn(ctx); 425 return herr; 426 } 427 #endif 428 429 template <typename FunctionT, typename... KernelArgsT> 430 PETSC_NODISCARD static hipError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, hipStream_t stream, KernelArgsT &&...kernelArgs) noexcept 431 { 432 void *args[] = {(void *)std::addressof(kernelArgs)...}; 433 434 return hipLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream)); 435 } 436 }; 437 #endif // PetscDefined(HAVE_HIP) 438 439 // shorthand for bringing all of the typedefs from the base Interface class into your own, 440 // it's annoying that c++ doesn't have a way to do this automatically 441 #define PETSC_CUPM_IMPL_CLASS_HEADER(T) \ 442 PETSC_CUPM_BASE_CLASS_HEADER(T); \ 443 /* types */ \ 444 using cupmError_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmError_t; \ 445 using cupmEvent_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEvent_t; \ 446 using cupmStream_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStream_t; \ 447 using cupmDeviceProp_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceProp_t; \ 448 using cupmMemcpyKind_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyKind_t; \ 449 using cupmDeviceAttr_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceAttr_t; \ 450 using cupmComplex_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmComplex_t; \ 451 using cupmPointerAttributes_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmPointerAttributes_t; \ 452 using cupmMemoryType_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryType_t; \ 453 using cupmDim3 = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDim3; \ 454 using cupmHostFn_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostFn_t; \ 455 using cupmMemPool_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPool_t; \ 456 using cupmMemPoolAttr = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolAttr; \ 457 /* variables */ \ 458 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSuccess; \ 459 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorNotReady; \ 460 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorDeviceAlreadyInUse; \ 461 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorSetOnActiveProcess; \ 462 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorStubLibrary; \ 463 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorNoDevice; \ 464 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamDefault; \ 465 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamNonBlocking; \ 466 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceMapHost; \ 467 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyHostToDevice; \ 468 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDeviceToHost; \ 469 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDeviceToDevice; \ 470 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyHostToHost; \ 471 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDefault; \ 472 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeHost; \ 473 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeDevice; \ 474 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeManaged; \ 475 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventDisableTiming; \ 476 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostAllocDefault; \ 477 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostAllocWriteCombined; \ 478 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolAttrReleaseThreshold; \ 479 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDevAttrClockRate; \ 480 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDevAttrMemoryClockRate; \ 481 /* functions */ \ 482 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetErrorName; \ 483 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetErrorString; \ 484 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetLastError; \ 485 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceCount; \ 486 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceProperties; \ 487 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceGetAttribute; \ 488 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDevice; \ 489 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSetDevice; \ 490 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceFlags; \ 491 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSetDeviceFlags; \ 492 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmPointerGetAttributes; \ 493 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceGetMemPool; \ 494 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolSetAttribute; \ 495 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmInit; \ 496 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventCreate; \ 497 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventCreateWithFlags; \ 498 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventDestroy; \ 499 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventRecord; \ 500 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventSynchronize; \ 501 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventElapsedTime; \ 502 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventQuery; \ 503 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamCreate; \ 504 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamCreateWithFlags; \ 505 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamGetFlags; \ 506 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamDestroy; \ 507 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamWaitEvent; \ 508 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamQuery; \ 509 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamSynchronize; \ 510 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceSynchronize; \ 511 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetSymbolAddress; \ 512 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMalloc; \ 513 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMallocAsync; \ 514 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy; \ 515 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyAsync; \ 516 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMallocHost; \ 517 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset; \ 518 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemsetAsync; \ 519 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy2D; \ 520 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy2DAsync; \ 521 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset2D; \ 522 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset2DAsync; \ 523 using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmLaunchHostFunc 524 525 #if PetscHasAttribute(always_inline) 526 // https://gcc.gnu.org/bugzilla//show_bug.cgi?id=109464 527 #define PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND __attribute__((always_inline)) 528 #else 529 #define PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND 530 #endif 531 532 // The actual interface class 533 template <DeviceType T> 534 struct Interface : InterfaceImpl<T> { 535 private: 536 using interface_type = InterfaceImpl<T>; 537 538 public: 539 PETSC_CUPM_IMPL_CLASS_HEADER(T); 540 541 using cupmReal_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), float, double>; 542 using cupmScalar_t = util::conditional_t<PetscDefined(USE_COMPLEX), cupmComplex_t, cupmReal_t>; 543 cupmScalarCastPetsc::device::cupm::impl::Interface544 PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr cupmScalar_t cupmScalarCast(PetscScalar s) noexcept 545 { 546 #if PetscDefined(USE_COMPLEX) 547 return cupmComplex_t{PetscRealPart(s), PetscImaginaryPart(s)}; 548 #else 549 return static_cast<cupmScalar_t>(s); 550 #endif 551 } 552 cupmScalarPtrCastPetsc::device::cupm::impl::Interface553 PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr const cupmScalar_t *cupmScalarPtrCast(const PetscScalar *s) noexcept { return reinterpret_cast<const cupmScalar_t *>(s); } 554 cupmScalarPtrCastPetsc::device::cupm::impl::Interface555 PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr cupmScalar_t *cupmScalarPtrCast(PetscScalar *s) noexcept { return reinterpret_cast<cupmScalar_t *>(s); } 556 cupmRealPtrCastPetsc::device::cupm::impl::Interface557 PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr const cupmReal_t *cupmRealPtrCast(const PetscReal *s) noexcept { return reinterpret_cast<const cupmReal_t *>(s); } 558 cupmRealPtrCastPetsc::device::cupm::impl::Interface559 PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr cupmReal_t *cupmRealPtrCast(PetscReal *s) noexcept { return reinterpret_cast<cupmReal_t *>(s); } 560 561 #if !defined(PETSC_PKG_CUDA_VERSION_GE) 562 #define PETSC_PKG_CUDA_VERSION_GE(...) 0 563 #define CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE 564 #endif 565 566 #if !defined(PETSC_PKG_HIP_VERSION_LT) 567 #define PETSC_PKG_HIP_VERSION_LT(...) 0 568 #define CUPM_DEFINED_PETSC_PKG_HIP_VERSION_LT 569 #endif 570 PetscCUPMGetMemTypePetsc::device::cupm::impl::Interface571 static PetscErrorCode PetscCUPMGetMemType(const void *data, PetscMemType *type, PetscBool *registered = nullptr, PetscBool *managed = nullptr) noexcept 572 { 573 cupmPointerAttributes_t attr; 574 cupmError_t cerr; 575 576 PetscFunctionBegin; 577 if (type) PetscAssertPointer(type, 2); 578 if (registered) { 579 PetscAssertPointer(registered, 3); 580 *registered = PETSC_FALSE; 581 } 582 if (managed) { 583 PetscAssertPointer(managed, 4); 584 *managed = PETSC_FALSE; 585 } 586 // Do not check error, instead reset it via GetLastError() since before CUDA 11.0, passing 587 // a host pointer returns cudaErrorInvalidValue 588 cerr = cupmPointerGetAttributes(&attr, data); 589 cerr = cupmGetLastError(); 590 // HIP seems to always have used memoryType though 591 #if (defined(CUDART_VERSION) && (CUDART_VERSION < 10000)) || (defined(__HIP_PLATFORM_HCC__) && PETSC_PKG_HIP_VERSION_LT(5, 5, 0)) 592 const auto mtype = attr.memoryType; 593 if (managed) *managed = static_cast<PetscBool>((cerr == cupmSuccess) && attr.isManaged); 594 #else 595 if (PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) && (T == DeviceType::CUDA)) PetscCallCUPM(cerr); 596 const auto mtype = attr.type; 597 if (managed) *managed = static_cast<PetscBool>(mtype == cupmMemoryTypeManaged); 598 #endif // CUDART_VERSION && CUDART_VERSION < 10000 || (defined(__HIP_PLATFORM_HCC__) && PETSC_PKG_HIP_VERSION_LT(5, 5, 0)) 599 if (type) *type = ((cerr == cupmSuccess) && (mtype == cupmMemoryTypeDevice)) ? PETSC_MEMTYPE_CUPM() : PETSC_MEMTYPE_HOST; 600 if (registered && (cerr == cupmSuccess) && (mtype == cupmMemoryTypeHost)) *registered = PETSC_TRUE; 601 PetscFunctionReturn(PETSC_SUCCESS); 602 } 603 #if defined(CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE) 604 #undef PETSC_PKG_CUDA_VERSION_GE 605 #endif 606 #if defined(CUPM_DEFINED_PETSC_PKG_HIP_VERSION_LT) 607 #undef PETSC_PKG_HIP_VERSION_LT 608 #endif 609 PetscDeviceCopyModeToCUPMMemcpyKindPetsc::device::cupm::impl::Interface610 PETSC_NODISCARD static PETSC_CONSTEXPR_14 cupmMemcpyKind_t PetscDeviceCopyModeToCUPMMemcpyKind(PetscDeviceCopyMode mode) noexcept 611 { 612 switch (mode) { 613 case PETSC_DEVICE_COPY_HTOH: 614 return cupmMemcpyHostToHost; 615 case PETSC_DEVICE_COPY_HTOD: 616 return cupmMemcpyHostToDevice; 617 case PETSC_DEVICE_COPY_DTOD: 618 return cupmMemcpyDeviceToDevice; 619 case PETSC_DEVICE_COPY_DTOH: 620 return cupmMemcpyDeviceToHost; 621 case PETSC_DEVICE_COPY_AUTO: 622 return cupmMemcpyDefault; 623 } 624 PetscUnreachable(); 625 return cupmMemcpyDefault; 626 } 627 628 // these change what the arguments mean, so need to namespace these 629 template <typename M> PetscCUPMMallocAsyncPetsc::device::cupm::impl::Interface630 static PetscErrorCode PetscCUPMMallocAsync(M **ptr, std::size_t n, cupmStream_t stream = nullptr) noexcept 631 { 632 static_assert(!std::is_void<M>::value, ""); 633 634 PetscFunctionBegin; 635 PetscAssertPointer(ptr, 1); 636 *ptr = nullptr; 637 if (n) { 638 const auto bytes = n * sizeof(M); 639 // https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-2/ 640 // 641 // TLD;DR: cudaMallocAsync() does not work with NVIDIA GPUDirect which OPENMPI uses to 642 // underpin its cuda-aware MPI implementation, so we cannot just async allocate 643 // blindly... 644 if (stream) { 645 PetscCallCUPM(cupmMallocAsync(reinterpret_cast<void **>(ptr), bytes, stream)); 646 } else { 647 PetscCallCUPM(cupmMalloc(reinterpret_cast<void **>(ptr), bytes)); 648 } 649 } 650 PetscFunctionReturn(PETSC_SUCCESS); 651 } 652 653 template <typename M> PetscCUPMMallocPetsc::device::cupm::impl::Interface654 static PetscErrorCode PetscCUPMMalloc(M **ptr, std::size_t n) noexcept 655 { 656 PetscFunctionBegin; 657 PetscCall(PetscCUPMMallocAsync(ptr, n)); 658 PetscFunctionReturn(PETSC_SUCCESS); 659 } 660 661 template <typename M> PetscCUPMMallocHostPetsc::device::cupm::impl::Interface662 static PetscErrorCode PetscCUPMMallocHost(M **ptr, std::size_t n, unsigned int flags = cupmHostAllocDefault) noexcept 663 { 664 static_assert(!std::is_void<M>::value, ""); 665 666 PetscFunctionBegin; 667 PetscAssertPointer(ptr, 1); 668 *ptr = nullptr; 669 if (n) PetscCallCUPM(cupmMallocHost(reinterpret_cast<void **>(ptr), n * sizeof(M), flags)); 670 PetscFunctionReturn(PETSC_SUCCESS); 671 } 672 673 template <typename D> PetscCUPMMemcpyAsyncPetsc::device::cupm::impl::Interface674 static PetscErrorCode PetscCUPMMemcpyAsync(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr, bool use_async = false) noexcept 675 { 676 static_assert(!std::is_void<D>::value, ""); 677 const auto size = n * sizeof(D); 678 679 PetscFunctionBegin; 680 if (PetscUnlikely(!n)) PetscFunctionReturn(PETSC_SUCCESS); 681 // cannot dereference (i.e. cannot call PetscAssertPointer() here) 682 PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer"); 683 PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer"); 684 // do early return after nullptr check since we need to check that they are not both nullptrs 685 if (PetscUnlikely(dest == src)) PetscFunctionReturn(PETSC_SUCCESS); 686 if (kind == cupmMemcpyHostToHost) { 687 // If we are HTOH it is cheaper to check if the stream is idle and do a basic mempcy() 688 // than it is to just call the vendor functions. This assumes of course that the stream 689 // accounts for both memory regions being "idle" 690 if (cupmStreamQuery(stream) == cupmSuccess) { 691 PetscCall(PetscMemcpy(dest, src, size)); 692 PetscFunctionReturn(PETSC_SUCCESS); 693 } 694 // need to clear the potential cupmErrorNotReady generated by query above... 695 auto cerr = cupmGetLastError(); 696 697 if (PetscUnlikely(cerr != cupmErrorNotReady)) PetscCallCUPM(cerr); 698 } 699 if (use_async || stream || (kind != cupmMemcpyDeviceToHost)) { 700 PetscCallCUPM(cupmMemcpyAsync(dest, src, size, kind, stream)); 701 } else { 702 PetscCallCUPM(cupmMemcpy(dest, src, size, kind)); 703 } 704 PetscCall(PetscLogCUPMMemcpyTransfer(kind, size)); 705 PetscFunctionReturn(PETSC_SUCCESS); 706 } 707 708 template <typename D> PetscCUPMMemcpyPetsc::device::cupm::impl::Interface709 static PetscErrorCode PetscCUPMMemcpy(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind) noexcept 710 { 711 PetscFunctionBegin; 712 PetscCall(PetscCUPMMemcpyAsync(dest, src, n, kind)); 713 PetscFunctionReturn(PETSC_SUCCESS); 714 } 715 716 template <typename D> PetscCUPMMemcpy2DAsyncPetsc::device::cupm::impl::Interface717 static PetscErrorCode PetscCUPMMemcpy2DAsync(D *dest, std::size_t dest_pitch, const util::type_identity_t<D> *src, std::size_t src_pitch, std::size_t width, std::size_t height, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr) 718 { 719 static_assert(!std::is_void<D>::value, ""); 720 const auto dest_pitch_bytes = dest_pitch * sizeof(D); 721 const auto src_pitch_bytes = src_pitch * sizeof(D); 722 const auto width_bytes = width * sizeof(D); 723 const auto size = height * width_bytes; 724 725 PetscFunctionBegin; 726 if (PetscUnlikely(!size)) PetscFunctionReturn(PETSC_SUCCESS); 727 PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer"); 728 PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer"); 729 if (stream || (kind != cupmMemcpyDeviceToHost)) { 730 PetscCallCUPM(cupmMemcpy2DAsync(dest, dest_pitch_bytes, src, src_pitch_bytes, width_bytes, height, kind, stream)); 731 } else { 732 PetscCallCUPM(cupmMemcpy2D(dest, dest_pitch_bytes, src, src_pitch_bytes, width_bytes, height, kind)); 733 } 734 PetscCall(PetscLogCUPMMemcpyTransfer(kind, size)); 735 PetscFunctionReturn(PETSC_SUCCESS); 736 } 737 738 template <typename D> PetscCUPMMemcpy2DPetsc::device::cupm::impl::Interface739 static PetscErrorCode PetscCUPMMemcpy2D(D *dest, std::size_t dest_pitch, const util::type_identity_t<D> *src, std::size_t src_pitch, std::size_t width, std::size_t height, cupmMemcpyKind_t kind) 740 { 741 PetscFunctionBegin; 742 PetscCall(PetscCUPMMemcpy2DAsync(dest, dest_pitch, src, src_pitch, width, height, kind)); 743 PetscFunctionReturn(PETSC_SUCCESS); 744 } 745 746 template <typename M> PetscCUPMMemsetAsyncPetsc::device::cupm::impl::Interface747 static PetscErrorCode PetscCUPMMemsetAsync(M *ptr, int value, std::size_t n, cupmStream_t stream = nullptr, bool use_async = false) noexcept 748 { 749 static_assert(!std::is_void<M>::value, ""); 750 751 PetscFunctionBegin; 752 if (PetscLikely(n)) { 753 const auto bytes = n * sizeof(M); 754 755 PetscCheck(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", n); 756 if (stream || use_async) { 757 PetscCallCUPM(cupmMemsetAsync(ptr, value, bytes, stream)); 758 } else { 759 PetscCallCUPM(cupmMemset(ptr, value, bytes)); 760 } 761 } 762 PetscFunctionReturn(PETSC_SUCCESS); 763 } 764 765 template <typename M> PetscCUPMMemsetPetsc::device::cupm::impl::Interface766 static PetscErrorCode PetscCUPMMemset(M *ptr, int value, std::size_t n) noexcept 767 { 768 PetscFunctionBegin; 769 PetscCall(PetscCUPMMemsetAsync(ptr, value, n)); 770 PetscFunctionReturn(PETSC_SUCCESS); 771 } 772 773 template <typename D> PetscCUPMMemset2DAsyncPetsc::device::cupm::impl::Interface774 static PetscErrorCode PetscCUPMMemset2DAsync(D *ptr, std::size_t pitch, int value, std::size_t width, std::size_t height, cupmStream_t stream = nullptr) 775 { 776 static_assert(!std::is_void<D>::value, ""); 777 const auto pitch_bytes = pitch * sizeof(D); 778 const auto width_bytes = width * sizeof(D); 779 const auto size = width_bytes * height; 780 781 PetscFunctionBegin; 782 if (PetscUnlikely(!size)) PetscFunctionReturn(PETSC_SUCCESS); 783 PetscAssert(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", size); 784 if (stream) { 785 PetscCallCUPM(cupmMemset2DAsync(ptr, pitch_bytes, value, width_bytes, height, stream)); 786 } else { 787 PetscCallCUPM(cupmMemset2D(ptr, pitch_bytes, value, width_bytes, height)); 788 } 789 PetscFunctionReturn(PETSC_SUCCESS); 790 } 791 792 // these we can transparently wrap, no need to namespace it to Petsc 793 template <typename M> cupmFreeAsyncPetsc::device::cupm::impl::Interface794 PETSC_NODISCARD static cupmError_t cupmFreeAsync(M &ptr, cupmStream_t stream = nullptr) noexcept 795 { 796 static_assert(std::is_pointer<util::decay_t<M>>::value, ""); 797 static_assert(!std::is_const<M>::value, ""); 798 799 if (ptr) { 800 auto cerr = interface_type::cupmFreeAsync(std::forward<M>(ptr), stream); 801 802 ptr = nullptr; 803 if (PetscUnlikely(cerr != cupmSuccess)) return cerr; 804 } 805 return cupmSuccess; 806 } 807 cupmFreeAsyncPetsc::device::cupm::impl::Interface808 PETSC_NODISCARD static cupmError_t cupmFreeAsync(std::nullptr_t ptr, cupmStream_t stream = nullptr) { return interface_type::cupmFreeAsync(ptr, stream); } 809 810 template <typename M> cupmFreePetsc::device::cupm::impl::Interface811 PETSC_NODISCARD static cupmError_t cupmFree(M &ptr) noexcept 812 { 813 return cupmFreeAsync(ptr); 814 } 815 cupmFreePetsc::device::cupm::impl::Interface816 PETSC_NODISCARD static cupmError_t cupmFree(std::nullptr_t ptr) { return cupmFreeAsync(ptr); } 817 818 template <typename M> cupmFreeHostPetsc::device::cupm::impl::Interface819 PETSC_NODISCARD static cupmError_t cupmFreeHost(M &ptr) noexcept 820 { 821 static_assert(std::is_pointer<util::decay_t<M>>::value, ""); 822 const auto cerr = interface_type::cupmFreeHost(std::forward<M>(ptr)); 823 ptr = nullptr; 824 return cerr; 825 } 826 cupmFreeHostPetsc::device::cupm::impl::Interface827 PETSC_NODISCARD static cupmError_t cupmFreeHost(std::nullptr_t ptr) { return interface_type::cupmFreeHost(ptr); } 828 829 // specific wrapper for device launch function, as the real function is a C routine and 830 // doesn't have variable arguments. The actual mechanics of this are a bit complicated but 831 // boils down to the fact that ultimately we pass a 832 // 833 // void *args[] = {(void*)&kernel_args...}; 834 // 835 // to the kernel launcher. Since we pass void* this means implicit conversion does **not** 836 // happen to the kernel arguments so we must do it ourselves here. This function does this in 837 // 3 stages: 838 // 1. Enumerate the kernel arguments (cupmLaunchKernel) 839 // 2. Deduce the signature of func() and static_cast the kernel arguments to the type 840 // expected by func() using the enumeration above (deduceKernelCall) 841 // 3. Form the void* array with the converted arguments and call cuda/hipLaunchKernel with 842 // it. (interface_type::cupmLaunchKernel) 843 template <typename F, typename... Args> cupmLaunchKernelPetsc::device::cupm::impl::Interface844 PETSC_NODISCARD static cupmError_t cupmLaunchKernel(F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept 845 { 846 return deduceKernelCall(util::index_sequence_for<Args...>{}, std::forward<F>(func), std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), std::forward<Args>(kernelArgs)...); 847 } 848 849 template <std::size_t block_size = 256, std::size_t warp_size = 32, typename F, typename... Args> PetscCUPMLaunchKernel1DPetsc::device::cupm::impl::Interface850 static PetscErrorCode PetscCUPMLaunchKernel1D(std::size_t n, std::size_t sharedMem, cupmStream_t stream, F &&func, Args &&...kernelArgs) noexcept 851 { 852 static_assert(block_size > 0, ""); 853 static_assert(warp_size > 0, ""); 854 // want block_size to be a multiple of the warp_size 855 static_assert(block_size % warp_size == 0, ""); 856 const auto nthread = std::min(n, block_size); 857 const auto nblock = (n + block_size - 1) / block_size; 858 859 PetscFunctionBegin; 860 // if n = 0 then nthread = 0, which is not allowed. rather than letting the user try to 861 // decipher cryptic 'cuda/hipErrorLaunchFailure' we explicitly check for zero here 862 PetscAssert(nthread, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Trying to launch kernel with grid/block size 0"); 863 PetscCallCUPM(cupmLaunchKernel(std::forward<F>(func), (unsigned int)nblock, (unsigned int)nthread, sharedMem, stream, std::forward<Args>(kernelArgs)...)); 864 PetscFunctionReturn(PETSC_SUCCESS); 865 } 866 867 private: 868 template <typename S, typename D, typename = void> 869 struct is_static_castable : std::false_type { }; 870 871 template <typename S, typename D> 872 struct is_static_castable<S, D, util::void_t<decltype(static_cast<D>(std::declval<S>()))>> : std::true_type { }; 873 874 template <typename D, typename S> cast_toPetsc::device::cupm::impl::Interface875 static constexpr util::enable_if_t<is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept 876 { 877 return static_cast<D>(std::forward<S>(src)); 878 } 879 880 template <typename D, typename S> cast_toPetsc::device::cupm::impl::Interface881 static constexpr util::enable_if_t<!is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept 882 { 883 return const_cast<D>(std::forward<S>(src)); 884 } 885 886 template <typename F, typename... Args, std::size_t... Idx> deduceKernelCallPetsc::device::cupm::impl::Interface887 PETSC_NODISCARD static cupmError_t deduceKernelCall(util::index_sequence<Idx...>, F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept 888 { 889 // clang-format off 890 return interface_type::cupmLaunchKernel( 891 std::forward<F>(func), 892 std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), 893 // can't static_cast() here since the function argument type may be cv-qualified, in 894 // which case we would need to const_cast(). But you can only const_cast() indirect types 895 // (pointers, references). So we need a SFINAE monster that is a static_cast() if 896 // possible, and a const_cast() if not. We could just use a C-style cast which *would* 897 // work here since it tries the following and uses the first one that succeeds: 898 // 899 // 1. const_cast() 900 // 2. static_cast() 901 // 3. static_cast() then const_cast() 902 // 4. reinterpret_cast()... 903 // 904 // the issue however is the final reinterpret_cast(). We absolutely cannot get there 905 // because doing so would silently hide a ton of bugs, for example casting a PetscScalar 906 // * to double * in complex builds, a PetscInt * to int * in 64idx builds, etc. 907 cast_to<typename util::func_traits<F>::template arg<Idx>::type>(std::forward<Args>(kernelArgs))... 908 ); 909 // clang-format on 910 } 911 PetscLogCUPMMemcpyTransferPetsc::device::cupm::impl::Interface912 static PetscErrorCode PetscLogCUPMMemcpyTransfer(cupmMemcpyKind_t kind, std::size_t size) noexcept 913 { 914 PetscFunctionBegin; 915 // only the explicit HTOD or DTOH are handled, since we either don't log the other cases 916 // (yet) or don't know the direction 917 if (kind == cupmMemcpyDeviceToHost) PetscCall(PetscLogGpuToCpu(static_cast<PetscLogDouble>(size))); 918 else if (kind == cupmMemcpyHostToDevice) PetscCall(PetscLogCpuToGpu(static_cast<PetscLogDouble>(size))); 919 else (void)size; 920 PetscFunctionReturn(PETSC_SUCCESS); 921 } 922 }; 923 924 #undef PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND 925 926 #define PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(T) \ 927 PETSC_CUPM_IMPL_CLASS_HEADER(T); \ 928 using cupmReal_t = typename ::Petsc::device::cupm::impl::Interface<T>::cupmReal_t; \ 929 using cupmScalar_t = typename ::Petsc::device::cupm::impl::Interface<T>::cupmScalar_t; \ 930 using ::Petsc::device::cupm::impl::Interface<T>::cupmScalarCast; \ 931 using ::Petsc::device::cupm::impl::Interface<T>::cupmScalarPtrCast; \ 932 using ::Petsc::device::cupm::impl::Interface<T>::cupmRealPtrCast; \ 933 using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMGetMemType; \ 934 using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemset; \ 935 using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemsetAsync; \ 936 using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMalloc; \ 937 using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMallocAsync; \ 938 using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMallocHost; \ 939 using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy; \ 940 using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpyAsync; \ 941 using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy2D; \ 942 using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy2DAsync; \ 943 using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemset2DAsync; \ 944 using ::Petsc::device::cupm::impl::Interface<T>::cupmFree; \ 945 using ::Petsc::device::cupm::impl::Interface<T>::cupmFreeAsync; \ 946 using ::Petsc::device::cupm::impl::Interface<T>::cupmFreeHost; \ 947 using ::Petsc::device::cupm::impl::Interface<T>::cupmLaunchKernel; \ 948 using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMLaunchKernel1D; \ 949 using ::Petsc::device::cupm::impl::Interface<T>::PetscDeviceCopyModeToCUPMMemcpyKind 950 951 #if PetscDefined(HAVE_CUDA) 952 extern template struct PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL Interface<DeviceType::CUDA>; 953 #endif 954 955 #if PetscDefined(HAVE_HIP) 956 extern template struct PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL Interface<DeviceType::HIP>; 957 #endif 958 959 } // namespace impl 960 961 } // namespace cupm 962 963 } // namespace device 964 965 } // namespace Petsc 966