xref: /petsc/include/petsc/private/cupminterface.hpp (revision cd871708d6ae82bd70cc1a9e2138f9b57839fe75)
1 #pragma once
2 
3 #include <petscdevice_cupm.h>
4 
5 #include <petsc/private/cpputil.hpp>
6 #include <petsc/private/petscadvancedmacros.h>
7 
8 #include <petsc/private/cpp/array.hpp>
9 
10 namespace Petsc
11 {
12 
13 namespace device
14 {
15 
16 namespace cupm
17 {
18 
19 // enum describing available cupm devices, this is used as the template parameter to any
20 // class subclassing the Interface or using it as a member variable
21 enum class DeviceType : int {
22   CUDA,
23   HIP
24 };
25 
26 // clang-format off
27 static constexpr std::array<const char *const, 5> DeviceTypes = {
28   "cuda",
29   "hip",
30   "Petsc::device::cupm::DeviceType",
31   "Petsc::device::cupm::DeviceType::",
32   nullptr
33 };
34 // clang-format on
35 
36 namespace impl
37 {
38 
39 #define PetscCallCUPM_(__abort_fn__, __comm__, ...) \
40   do { \
41     PetscStackUpdateLine; \
42     const cupmError_t cerr_p_ = __VA_ARGS__; \
43     __abort_fn__(cerr_p_ == cupmSuccess, __comm__, PETSC_ERR_GPU, "%s error %d (%s) : %s", cupmName(), static_cast<PetscErrorCode>(cerr_p_), cupmGetErrorName(cerr_p_), cupmGetErrorString(cerr_p_)); \
44   } while (0)
45 
46 // A backend agnostic PetscCallCUPM() function, this will only work inside the member
47 // functions of a class inheriting from CUPM::Interface. Thanks to __VA_ARGS__ templated
48 // functions can also be wrapped inline:
49 //
50 // PetscCallCUPM(foo<int,char,bool>());
51 #define PetscCallCUPM(...)             PetscCallCUPM_(PetscCheck, PETSC_COMM_SELF, __VA_ARGS__)
52 #define PetscCallCUPMAbort(comm_, ...) PetscCallCUPM_(PetscCheckAbort, comm_, __VA_ARGS__)
53 
54 // PETSC_CUPM_ALIAS_FUNCTION() - declaration to alias a cuda/hip function
55 //
56 // input params:
57 // our_name   - the name of the alias
58 // their_name - the name of the function being aliased
59 //
60 // notes:
61 // see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
62 //
63 // example usage:
64 // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc) ->
65 // template <typename... T>
66 // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
67 // {
68 //   return cudaMalloc(std::forward<T>(args)...);
69 // }
70 //
71 // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc) ->
72 // template <typename... T>
73 // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
74 // {
75 //   return hipMalloc(std::forward<T>(args)...);
76 // }
77 #define PETSC_CUPM_ALIAS_FUNCTION(our_name, their_name) PETSC_ALIAS_FUNCTION(static our_name, their_name)
78 
79 // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE() - declaration to alias a cuda/hip function but
80 // discard the last N arguments
81 //
82 // input params:
83 // our_name   - the name of the alias
84 // their_name - the name of the function being aliased
85 // N          - integer constant [0, INT_MAX) dictating how many arguments to chop off the end
86 //
87 // notes:
88 // see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the expansion
89 //
90 // example use:
91 // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(cupmMallocAsync, cudaMalloc, 1) ->
92 // template <typename... T, typename Tend>
93 // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
94 // return type deduction*
95 // {
96 //   (void)argend;
97 //   return cudaMalloc(std::forward<T>(args)...);
98 // }
99 #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(our_name, their_name, N) PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(static our_name, their_name, N)
100 
101 // Base class that holds functions and variables that don't require CUDA or HIP to be present
102 // on the system
103 template <DeviceType T>
104 struct InterfaceBase {
105   static const DeviceType type = T;
106 
cupmNamePetsc::device::cupm::impl::InterfaceBase107   PETSC_NODISCARD static constexpr const char *cupmName() noexcept
108   {
109     static_assert(util::to_underlying(DeviceType::CUDA) == 0, "");
110     static_assert(util::to_underlying(DeviceType::HIP) == 1, "");
111     return std::get<util::to_underlying(T)>(DeviceTypes);
112   }
113 
cupmNAMEPetsc::device::cupm::impl::InterfaceBase114   PETSC_NODISCARD static constexpr const char *cupmNAME() noexcept { return T == DeviceType::CUDA ? "CUDA" : "HIP"; }
115 
PETSC_DEVICE_CUPMPetsc::device::cupm::impl::InterfaceBase116   PETSC_NODISCARD static constexpr PetscDeviceType PETSC_DEVICE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_DEVICE_CUDA : PETSC_DEVICE_HIP; }
117 
PETSC_MEMTYPE_CUPMPetsc::device::cupm::impl::InterfaceBase118   PETSC_NODISCARD static constexpr PetscMemType PETSC_MEMTYPE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_MEMTYPE_CUDA : PETSC_MEMTYPE_HIP; }
119 };
120 
121 // declare the base class static member variables
122 template <DeviceType T>
123 const DeviceType InterfaceBase<T>::type;
124 
125 #define PETSC_CUPM_BASE_CLASS_HEADER(T) \
126   using ::Petsc::device::cupm::impl::InterfaceBase<T>::type; \
127   using ::Petsc::device::cupm::impl::InterfaceBase<T>::cupmName; \
128   using ::Petsc::device::cupm::impl::InterfaceBase<T>::cupmNAME; \
129   using ::Petsc::device::cupm::impl::InterfaceBase<T>::PETSC_DEVICE_CUPM; \
130   using ::Petsc::device::cupm::impl::InterfaceBase<T>::PETSC_MEMTYPE_CUPM
131 
132 // A templated C++ struct that defines the entire CUPM interface. Use of templating vs
133 // preprocessor macros allows us to use both interfaces simultaneously as well as easily
134 // import them into classes.
135 template <DeviceType>
136 struct InterfaceImpl;
137 
138 #if PetscDefined(HAVE_CUDA)
139 template <>
140 struct InterfaceImpl<DeviceType::CUDA> : InterfaceBase<DeviceType::CUDA> {
141   PETSC_CUPM_BASE_CLASS_HEADER(DeviceType::CUDA);
142 
143   // typedefs
144   using cupmError_t             = cudaError_t;
145   using cupmEvent_t             = cudaEvent_t;
146   using cupmStream_t            = cudaStream_t;
147   using cupmDeviceProp_t        = cudaDeviceProp;
148   using cupmMemcpyKind_t        = cudaMemcpyKind;
149   using cupmDeviceAttr_t        = cudaDeviceAttr;
150   using cupmComplex_t           = util::conditional_t<PetscDefined(USE_REAL_SINGLE), cuComplex, cuDoubleComplex>;
151   using cupmPointerAttributes_t = cudaPointerAttributes;
152   using cupmMemoryType_t        = enum cudaMemoryType;
153   using cupmDim3                = dim3;
154   using cupmHostFn_t            = cudaHostFn_t;
155   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
156   using cupmMemPool_t   = cudaMemPool_t;
157   using cupmMemPoolAttr = cudaMemPoolAttr;
158   #else
159   using cupmMemPool_t   = void *;
160   using cupmMemPoolAttr = unsigned int;
161   #endif
162 
163   // values
164   static const auto cupmSuccess                 = cudaSuccess;
165   static const auto cupmErrorNotReady           = cudaErrorNotReady;
166   static const auto cupmErrorDeviceAlreadyInUse = cudaErrorDeviceAlreadyInUse;
167   static const auto cupmErrorSetOnActiveProcess = cudaErrorSetOnActiveProcess;
168   static const auto cupmErrorStubLibrary =
169   #if PETSC_PKG_CUDA_VERSION_GE(11, 1, 0)
170     cudaErrorStubLibrary;
171   #else
172     cudaErrorInsufficientDriver;
173   #endif
174 
175   static const auto cupmErrorNoDevice          = cudaErrorNoDevice;
176   static const auto cupmStreamDefault          = cudaStreamDefault;
177   static const auto cupmStreamNonBlocking      = cudaStreamNonBlocking;
178   static const auto cupmDeviceMapHost          = cudaDeviceMapHost;
179   static const auto cupmMemcpyHostToDevice     = cudaMemcpyHostToDevice;
180   static const auto cupmMemcpyDeviceToHost     = cudaMemcpyDeviceToHost;
181   static const auto cupmMemcpyDeviceToDevice   = cudaMemcpyDeviceToDevice;
182   static const auto cupmMemcpyHostToHost       = cudaMemcpyHostToHost;
183   static const auto cupmMemcpyDefault          = cudaMemcpyDefault;
184   static const auto cupmMemoryTypeHost         = cudaMemoryTypeHost;
185   static const auto cupmMemoryTypeDevice       = cudaMemoryTypeDevice;
186   static const auto cupmMemoryTypeManaged      = cudaMemoryTypeManaged;
187   static const auto cupmEventDisableTiming     = cudaEventDisableTiming;
188   static const auto cupmHostAllocDefault       = cudaHostAllocDefault;
189   static const auto cupmHostAllocWriteCombined = cudaHostAllocWriteCombined;
190   static const auto cupmMemPoolAttrReleaseThreshold =
191   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
192     cudaMemPoolAttrReleaseThreshold;
193   #else
194     cupmMemPoolAttr{0};
195   #endif
196   static const auto cupmDevAttrClockRate       = cudaDevAttrClockRate;
197   static const auto cupmDevAttrMemoryClockRate = cudaDevAttrMemoryClockRate;
198 
199   // error functions
PETSC_CUPM_ALIAS_FUNCTIONPetsc::device::cupm::impl::InterfaceImpl200   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, cudaGetErrorName)
201   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, cudaGetErrorString)
202   PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, cudaGetLastError)
203 
204   // device management
205   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, cudaGetDeviceCount)
206   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, cudaGetDeviceProperties)
207   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetAttribute, cudaDeviceGetAttribute)
208   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, cudaGetDevice)
209   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, cudaSetDevice)
210   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, cudaGetDeviceFlags)
211   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, cudaSetDeviceFlags)
212   PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, cudaPointerGetAttributes)
213   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
214   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, cudaDeviceGetMemPool)
215   PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, cudaMemPoolSetAttribute)
216   #else
217   PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
218   {
219     *pool = nullptr;
220     return cupmSuccess;
221   }
222 
223   PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
224   #endif
225   // CUDA has no cudaInit() to match hipInit()
226   PETSC_NODISCARD static cupmError_t cupmInit(unsigned int) noexcept { return cudaFree(nullptr); }
227 
228   // stream management
PETSC_CUPM_ALIAS_FUNCTIONPetsc::device::cupm::impl::InterfaceImpl229   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, cudaEventCreate)
230   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, cudaEventCreateWithFlags)
231   PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, cudaEventDestroy)
232   PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, cudaEventRecord)
233   PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, cudaEventSynchronize)
234   PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, cudaEventElapsedTime)
235   PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, cudaEventQuery)
236   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, cudaStreamCreate)
237   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, cudaStreamCreateWithFlags)
238   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, cudaStreamGetFlags)
239   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, cudaStreamDestroy)
240   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, cudaStreamWaitEvent)
241   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, cudaStreamQuery)
242   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, cudaStreamSynchronize)
243   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, cudaDeviceSynchronize)
244   PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, cudaGetSymbolAddress)
245 
246   // memory management
247   PETSC_CUPM_ALIAS_FUNCTION(cupmFree, cudaFree)
248   PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc)
249   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
250   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, cudaFreeAsync)
251   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, cudaMallocAsync)
252   #else
253   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, cudaFree, 1)
254   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, cudaMalloc, 1)
255   #endif
256   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, cudaMemcpy)
257   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, cudaMemcpyAsync)
258   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, cudaMallocHost)
259   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, cudaFreeHost)
260   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, cudaMemset)
261   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
262   PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, cudaMemsetAsync)
263   #else
264   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemsetAsync, cudaMemset, 1)
265   #endif
266   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy2D, cudaMemcpy2D)
267   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemcpy2DAsync, cudaMemcpy2DAsync, 1)
268   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset2D, cudaMemset2D)
269   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemset2DAsync, cudaMemset2DAsync, 1)
270 
271   // launch control
272   PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, cudaLaunchHostFunc)
273   template <typename FunctionT, typename... KernelArgsT>
274   PETSC_NODISCARD static cudaError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, cudaStream_t stream, KernelArgsT &&...kernelArgs) noexcept
275   {
276     static_assert(!std::is_pointer<FunctionT>::value, "kernel function must not be passed by pointer");
277     void *args[] = {(void *)std::addressof(kernelArgs)...};
278 
279     return cudaLaunchKernel<util::remove_reference_t<FunctionT>>(std::addressof(func), std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
280   }
281 };
282 #endif // PetscDefined(HAVE_CUDA)
283 
284 #if PetscDefined(HAVE_HIP)
285 template <>
286 struct InterfaceImpl<DeviceType::HIP> : InterfaceBase<DeviceType::HIP> {
287   PETSC_CUPM_BASE_CLASS_HEADER(DeviceType::HIP);
288 
289   // typedefs
290   using cupmError_t             = hipError_t;
291   using cupmEvent_t             = hipEvent_t;
292   using cupmStream_t            = hipStream_t;
293   using cupmDeviceProp_t        = hipDeviceProp_t;
294   using cupmMemcpyKind_t        = hipMemcpyKind;
295   using cupmDeviceAttr_t        = hipDeviceAttribute_t;
296   using cupmComplex_t           = util::conditional_t<PetscDefined(USE_REAL_SINGLE), hipComplex, hipDoubleComplex>;
297   using cupmPointerAttributes_t = hipPointerAttribute_t;
298   using cupmMemoryType_t        = enum hipMemoryType;
299   using cupmDim3                = dim3;
300   #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
301   using cupmHostFn_t    = hipHostFn_t;
302   using cupmMemPool_t   = hipMemPool_t;
303   using cupmMemPoolAttr = hipMemPoolAttr;
304   #else
305   using cupmHostFn_t    = void (*)(void *);
306   using cupmMemPool_t   = void *;
307   using cupmMemPoolAttr = unsigned int;
308   #endif
309 
310   // values
311   static const auto cupmSuccess       = hipSuccess;
312   static const auto cupmErrorNotReady = hipErrorNotReady;
313   // see https://github.com/ROCm-Developer-Tools/HIP/blob/develop/bin/hipify-perl
314   static const auto cupmErrorDeviceAlreadyInUse = hipErrorContextAlreadyInUse;
315   static const auto cupmErrorSetOnActiveProcess = hipErrorSetOnActiveProcess;
316   // as of HIP v4.2 cudaErrorStubLibrary has no HIP equivalent
317   static const auto cupmErrorStubLibrary     = hipErrorInsufficientDriver;
318   static const auto cupmErrorNoDevice        = hipErrorNoDevice;
319   static const auto cupmStreamDefault        = hipStreamDefault;
320   static const auto cupmStreamNonBlocking    = hipStreamNonBlocking;
321   static const auto cupmDeviceMapHost        = hipDeviceMapHost;
322   static const auto cupmMemcpyHostToDevice   = hipMemcpyHostToDevice;
323   static const auto cupmMemcpyDeviceToHost   = hipMemcpyDeviceToHost;
324   static const auto cupmMemcpyDeviceToDevice = hipMemcpyDeviceToDevice;
325   static const auto cupmMemcpyHostToHost     = hipMemcpyHostToHost;
326   static const auto cupmMemcpyDefault        = hipMemcpyDefault;
327   static const auto cupmMemoryTypeHost       = hipMemoryTypeHost;
328   static const auto cupmMemoryTypeDevice     = hipMemoryTypeDevice;
329   // see
330   // https://github.com/ROCm-Developer-Tools/HIP/blob/develop/include/hip/hip_runtime_api.h#L156
331   static const auto cupmMemoryTypeManaged      = hipMemoryTypeUnified;
332   static const auto cupmEventDisableTiming     = hipEventDisableTiming;
333   static const auto cupmHostAllocDefault       = hipHostMallocDefault;
334   static const auto cupmHostAllocWriteCombined = hipHostMallocWriteCombined;
335   static const auto cupmMemPoolAttrReleaseThreshold =
336   #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
337     hipMemPoolAttrReleaseThreshold;
338   #else
339     cupmMemPoolAttr{0};
340   #endif
341   static const auto cupmDevAttrClockRate       = hipDeviceAttributeClockRate;
342   static const auto cupmDevAttrMemoryClockRate = hipDeviceAttributeMemoryClockRate;
343 
344   // error functions
PETSC_CUPM_ALIAS_FUNCTIONPetsc::device::cupm::impl::InterfaceImpl345   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, hipGetErrorName)
346   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, hipGetErrorString)
347   PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, hipGetLastError)
348 
349   // device management
350   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, hipGetDeviceCount)
351   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, hipGetDeviceProperties)
352   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetAttribute, hipDeviceGetAttribute)
353   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, hipGetDevice)
354   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, hipSetDevice)
355   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, hipGetDeviceFlags)
356   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, hipSetDeviceFlags)
357   PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, hipPointerGetAttributes)
358   #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
359   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, hipDeviceGetMemPool)
360   PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, hipMemPoolSetAttribute)
361   #else
362   PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
363   {
364     *pool = nullptr;
365     return cupmSuccess;
366   }
367 
368   PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
369   #endif
370   PETSC_CUPM_ALIAS_FUNCTION(cupmInit, hipInit)
371 
372   // stream management
373   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, hipEventCreate)
374   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, hipEventCreateWithFlags)
375   PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, hipEventDestroy)
376   PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, hipEventRecord)
377   PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, hipEventSynchronize)
378   PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, hipEventElapsedTime)
379   PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, hipEventQuery)
380   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, hipStreamCreate)
381   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, hipStreamCreateWithFlags)
382   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, hipStreamGetFlags)
383   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, hipStreamDestroy)
384   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, hipStreamWaitEvent)
385   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, hipStreamQuery)
386   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, hipStreamSynchronize)
387   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, hipDeviceSynchronize)
388   PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, hipGetSymbolAddress)
389 
390   // memory management
391   PETSC_CUPM_ALIAS_FUNCTION(cupmFree, hipFree)
392   PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc)
393   #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
394   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, hipMallocAsync)
395   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, hipFreeAsync)
396   #else
397   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, hipMalloc, 1)
398   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, hipFree, 1)
399   #endif
400   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, hipMemcpy)
401   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, hipMemcpyAsync)
402   // hipMallocHost is deprecated
403   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, hipHostMalloc)
404   // hipFreeHost is deprecated
405   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, hipHostFree)
406   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, hipMemset)
407   PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, hipMemsetAsync)
408   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy2D, hipMemcpy2D)
409   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemcpy2DAsync, hipMemcpy2DAsync, 1)
410   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset2D, hipMemset2D)
411   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemset2DAsync, hipMemset2DAsync, 1)
412 
413   // launch control
414   // HIP appears to only have hipLaunchHostFunc from 5.2.0 onwards
415   // https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md#7-execution-control=
416   #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
417   PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, hipLaunchHostFunc)
418   #else
419   PETSC_NODISCARD static hipError_t cupmLaunchHostFunc(hipStream_t stream, cupmHostFn_t fn, void *ctx) noexcept
420   {
421     // the only correct way to spoof this function is to do it synchronously...
422     auto herr = hipStreamSynchronize(stream);
423     if (PetscUnlikely(herr != hipSuccess)) return herr;
424     fn(ctx);
425     return herr;
426   }
427   #endif
428 
429   template <typename FunctionT, typename... KernelArgsT>
430   PETSC_NODISCARD static hipError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, hipStream_t stream, KernelArgsT &&...kernelArgs) noexcept
431   {
432     void *args[] = {(void *)std::addressof(kernelArgs)...};
433 
434     return hipLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
435   }
436 };
437 #endif // PetscDefined(HAVE_HIP)
438 
439 // shorthand for bringing all of the typedefs from the base Interface class into your own,
440 // it's annoying that c++ doesn't have a way to do this automatically
441 #define PETSC_CUPM_IMPL_CLASS_HEADER(T) \
442   PETSC_CUPM_BASE_CLASS_HEADER(T); \
443   /* types */ \
444   using cupmError_t             = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmError_t; \
445   using cupmEvent_t             = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEvent_t; \
446   using cupmStream_t            = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStream_t; \
447   using cupmDeviceProp_t        = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceProp_t; \
448   using cupmMemcpyKind_t        = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyKind_t; \
449   using cupmDeviceAttr_t        = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceAttr_t; \
450   using cupmComplex_t           = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmComplex_t; \
451   using cupmPointerAttributes_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmPointerAttributes_t; \
452   using cupmMemoryType_t        = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryType_t; \
453   using cupmDim3                = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDim3; \
454   using cupmHostFn_t            = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostFn_t; \
455   using cupmMemPool_t           = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPool_t; \
456   using cupmMemPoolAttr         = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolAttr; \
457   /* variables */ \
458   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSuccess; \
459   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorNotReady; \
460   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorDeviceAlreadyInUse; \
461   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorSetOnActiveProcess; \
462   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorStubLibrary; \
463   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorNoDevice; \
464   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamDefault; \
465   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamNonBlocking; \
466   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceMapHost; \
467   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyHostToDevice; \
468   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDeviceToHost; \
469   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDeviceToDevice; \
470   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyHostToHost; \
471   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDefault; \
472   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeHost; \
473   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeDevice; \
474   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeManaged; \
475   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventDisableTiming; \
476   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostAllocDefault; \
477   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostAllocWriteCombined; \
478   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolAttrReleaseThreshold; \
479   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDevAttrClockRate; \
480   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDevAttrMemoryClockRate; \
481   /* functions */ \
482   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetErrorName; \
483   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetErrorString; \
484   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetLastError; \
485   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceCount; \
486   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceProperties; \
487   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceGetAttribute; \
488   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDevice; \
489   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSetDevice; \
490   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceFlags; \
491   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSetDeviceFlags; \
492   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmPointerGetAttributes; \
493   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceGetMemPool; \
494   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolSetAttribute; \
495   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmInit; \
496   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventCreate; \
497   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventCreateWithFlags; \
498   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventDestroy; \
499   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventRecord; \
500   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventSynchronize; \
501   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventElapsedTime; \
502   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventQuery; \
503   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamCreate; \
504   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamCreateWithFlags; \
505   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamGetFlags; \
506   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamDestroy; \
507   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamWaitEvent; \
508   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamQuery; \
509   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamSynchronize; \
510   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceSynchronize; \
511   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetSymbolAddress; \
512   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMalloc; \
513   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMallocAsync; \
514   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy; \
515   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyAsync; \
516   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMallocHost; \
517   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset; \
518   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemsetAsync; \
519   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy2D; \
520   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy2DAsync; \
521   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset2D; \
522   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset2DAsync; \
523   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmLaunchHostFunc
524 
525 #if PetscHasAttribute(always_inline)
526   // https://gcc.gnu.org/bugzilla//show_bug.cgi?id=109464
527   #define PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND __attribute__((always_inline))
528 #else
529   #define PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND
530 #endif
531 
532 // The actual interface class
533 template <DeviceType T>
534 struct Interface : InterfaceImpl<T> {
535 private:
536   using interface_type = InterfaceImpl<T>;
537 
538 public:
539   PETSC_CUPM_IMPL_CLASS_HEADER(T);
540 
541   using cupmReal_t   = util::conditional_t<PetscDefined(USE_REAL_SINGLE), float, double>;
542   using cupmScalar_t = util::conditional_t<PetscDefined(USE_COMPLEX), cupmComplex_t, cupmReal_t>;
543 
cupmScalarCastPetsc::device::cupm::impl::Interface544   PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr cupmScalar_t cupmScalarCast(PetscScalar s) noexcept
545   {
546 #if PetscDefined(USE_COMPLEX)
547     return cupmComplex_t{PetscRealPart(s), PetscImaginaryPart(s)};
548 #else
549     return static_cast<cupmScalar_t>(s);
550 #endif
551   }
552 
cupmScalarPtrCastPetsc::device::cupm::impl::Interface553   PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr const cupmScalar_t *cupmScalarPtrCast(const PetscScalar *s) noexcept { return reinterpret_cast<const cupmScalar_t *>(s); }
554 
cupmScalarPtrCastPetsc::device::cupm::impl::Interface555   PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr cupmScalar_t *cupmScalarPtrCast(PetscScalar *s) noexcept { return reinterpret_cast<cupmScalar_t *>(s); }
556 
cupmRealPtrCastPetsc::device::cupm::impl::Interface557   PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr const cupmReal_t *cupmRealPtrCast(const PetscReal *s) noexcept { return reinterpret_cast<const cupmReal_t *>(s); }
558 
cupmRealPtrCastPetsc::device::cupm::impl::Interface559   PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr cupmReal_t *cupmRealPtrCast(PetscReal *s) noexcept { return reinterpret_cast<cupmReal_t *>(s); }
560 
561 #if !defined(PETSC_PKG_CUDA_VERSION_GE)
562   #define PETSC_PKG_CUDA_VERSION_GE(...) 0
563   #define CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE
564 #endif
565 
566 #if !defined(PETSC_PKG_HIP_VERSION_LT)
567   #define PETSC_PKG_HIP_VERSION_LT(...) 0
568   #define CUPM_DEFINED_PETSC_PKG_HIP_VERSION_LT
569 #endif
570 
PetscCUPMGetMemTypePetsc::device::cupm::impl::Interface571   static PetscErrorCode PetscCUPMGetMemType(const void *data, PetscMemType *type, PetscBool *registered = nullptr, PetscBool *managed = nullptr) noexcept
572   {
573     cupmPointerAttributes_t attr;
574     cupmError_t             cerr;
575 
576     PetscFunctionBegin;
577     if (type) PetscAssertPointer(type, 2);
578     if (registered) {
579       PetscAssertPointer(registered, 3);
580       *registered = PETSC_FALSE;
581     }
582     if (managed) {
583       PetscAssertPointer(managed, 4);
584       *managed = PETSC_FALSE;
585     }
586     // Do not check error, instead reset it via GetLastError() since before CUDA 11.0, passing
587     // a host pointer returns cudaErrorInvalidValue
588     cerr = cupmPointerGetAttributes(&attr, data);
589     cerr = cupmGetLastError();
590     // HIP seems to always have used memoryType though
591 #if (defined(CUDART_VERSION) && (CUDART_VERSION < 10000)) || (defined(__HIP_PLATFORM_HCC__) && PETSC_PKG_HIP_VERSION_LT(5, 5, 0))
592     const auto mtype = attr.memoryType;
593     if (managed) *managed = static_cast<PetscBool>((cerr == cupmSuccess) && attr.isManaged);
594 #else
595     if (PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) && (T == DeviceType::CUDA)) PetscCallCUPM(cerr);
596     const auto mtype = attr.type;
597     if (managed) *managed = static_cast<PetscBool>(mtype == cupmMemoryTypeManaged);
598 #endif // CUDART_VERSION && CUDART_VERSION < 10000 || (defined(__HIP_PLATFORM_HCC__) && PETSC_PKG_HIP_VERSION_LT(5, 5, 0))
599     if (type) *type = ((cerr == cupmSuccess) && (mtype == cupmMemoryTypeDevice)) ? PETSC_MEMTYPE_CUPM() : PETSC_MEMTYPE_HOST;
600     if (registered && (cerr == cupmSuccess) && (mtype == cupmMemoryTypeHost)) *registered = PETSC_TRUE;
601     PetscFunctionReturn(PETSC_SUCCESS);
602   }
603 #if defined(CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE)
604   #undef PETSC_PKG_CUDA_VERSION_GE
605 #endif
606 #if defined(CUPM_DEFINED_PETSC_PKG_HIP_VERSION_LT)
607   #undef PETSC_PKG_HIP_VERSION_LT
608 #endif
609 
PetscDeviceCopyModeToCUPMMemcpyKindPetsc::device::cupm::impl::Interface610   PETSC_NODISCARD static PETSC_CONSTEXPR_14 cupmMemcpyKind_t PetscDeviceCopyModeToCUPMMemcpyKind(PetscDeviceCopyMode mode) noexcept
611   {
612     switch (mode) {
613     case PETSC_DEVICE_COPY_HTOH:
614       return cupmMemcpyHostToHost;
615     case PETSC_DEVICE_COPY_HTOD:
616       return cupmMemcpyHostToDevice;
617     case PETSC_DEVICE_COPY_DTOD:
618       return cupmMemcpyDeviceToDevice;
619     case PETSC_DEVICE_COPY_DTOH:
620       return cupmMemcpyDeviceToHost;
621     case PETSC_DEVICE_COPY_AUTO:
622       return cupmMemcpyDefault;
623     }
624     PetscUnreachable();
625     return cupmMemcpyDefault;
626   }
627 
628   // these change what the arguments mean, so need to namespace these
629   template <typename M>
PetscCUPMMallocAsyncPetsc::device::cupm::impl::Interface630   static PetscErrorCode PetscCUPMMallocAsync(M **ptr, std::size_t n, cupmStream_t stream = nullptr) noexcept
631   {
632     static_assert(!std::is_void<M>::value, "");
633 
634     PetscFunctionBegin;
635     PetscAssertPointer(ptr, 1);
636     *ptr = nullptr;
637     if (n) {
638       const auto bytes = n * sizeof(M);
639       // https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-2/
640       //
641       // TLD;DR: cudaMallocAsync() does not work with NVIDIA GPUDirect which OPENMPI uses to
642       // underpin its cuda-aware MPI implementation, so we cannot just async allocate
643       // blindly...
644       if (stream) {
645         PetscCallCUPM(cupmMallocAsync(reinterpret_cast<void **>(ptr), bytes, stream));
646       } else {
647         PetscCallCUPM(cupmMalloc(reinterpret_cast<void **>(ptr), bytes));
648       }
649     }
650     PetscFunctionReturn(PETSC_SUCCESS);
651   }
652 
653   template <typename M>
PetscCUPMMallocPetsc::device::cupm::impl::Interface654   static PetscErrorCode PetscCUPMMalloc(M **ptr, std::size_t n) noexcept
655   {
656     PetscFunctionBegin;
657     PetscCall(PetscCUPMMallocAsync(ptr, n));
658     PetscFunctionReturn(PETSC_SUCCESS);
659   }
660 
661   template <typename M>
PetscCUPMMallocHostPetsc::device::cupm::impl::Interface662   static PetscErrorCode PetscCUPMMallocHost(M **ptr, std::size_t n, unsigned int flags = cupmHostAllocDefault) noexcept
663   {
664     static_assert(!std::is_void<M>::value, "");
665 
666     PetscFunctionBegin;
667     PetscAssertPointer(ptr, 1);
668     *ptr = nullptr;
669     if (n) PetscCallCUPM(cupmMallocHost(reinterpret_cast<void **>(ptr), n * sizeof(M), flags));
670     PetscFunctionReturn(PETSC_SUCCESS);
671   }
672 
673   template <typename D>
PetscCUPMMemcpyAsyncPetsc::device::cupm::impl::Interface674   static PetscErrorCode PetscCUPMMemcpyAsync(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr, bool use_async = false) noexcept
675   {
676     static_assert(!std::is_void<D>::value, "");
677     const auto size = n * sizeof(D);
678 
679     PetscFunctionBegin;
680     if (PetscUnlikely(!n)) PetscFunctionReturn(PETSC_SUCCESS);
681     // cannot dereference (i.e. cannot call PetscAssertPointer() here)
682     PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer");
683     PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer");
684     // do early return after nullptr check since we need to check that they are not both nullptrs
685     if (PetscUnlikely(dest == src)) PetscFunctionReturn(PETSC_SUCCESS);
686     if (kind == cupmMemcpyHostToHost) {
687       // If we are HTOH it is cheaper to check if the stream is idle and do a basic mempcy()
688       // than it is to just call the vendor functions. This assumes of course that the stream
689       // accounts for both memory regions being "idle"
690       if (cupmStreamQuery(stream) == cupmSuccess) {
691         PetscCall(PetscMemcpy(dest, src, size));
692         PetscFunctionReturn(PETSC_SUCCESS);
693       }
694       // need to clear the potential cupmErrorNotReady generated by query above...
695       auto cerr = cupmGetLastError();
696 
697       if (PetscUnlikely(cerr != cupmErrorNotReady)) PetscCallCUPM(cerr);
698     }
699     if (use_async || stream || (kind != cupmMemcpyDeviceToHost)) {
700       PetscCallCUPM(cupmMemcpyAsync(dest, src, size, kind, stream));
701     } else {
702       PetscCallCUPM(cupmMemcpy(dest, src, size, kind));
703     }
704     PetscCall(PetscLogCUPMMemcpyTransfer(kind, size));
705     PetscFunctionReturn(PETSC_SUCCESS);
706   }
707 
708   template <typename D>
PetscCUPMMemcpyPetsc::device::cupm::impl::Interface709   static PetscErrorCode PetscCUPMMemcpy(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind) noexcept
710   {
711     PetscFunctionBegin;
712     PetscCall(PetscCUPMMemcpyAsync(dest, src, n, kind));
713     PetscFunctionReturn(PETSC_SUCCESS);
714   }
715 
716   template <typename D>
PetscCUPMMemcpy2DAsyncPetsc::device::cupm::impl::Interface717   static PetscErrorCode PetscCUPMMemcpy2DAsync(D *dest, std::size_t dest_pitch, const util::type_identity_t<D> *src, std::size_t src_pitch, std::size_t width, std::size_t height, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr)
718   {
719     static_assert(!std::is_void<D>::value, "");
720     const auto dest_pitch_bytes = dest_pitch * sizeof(D);
721     const auto src_pitch_bytes  = src_pitch * sizeof(D);
722     const auto width_bytes      = width * sizeof(D);
723     const auto size             = height * width_bytes;
724 
725     PetscFunctionBegin;
726     if (PetscUnlikely(!size)) PetscFunctionReturn(PETSC_SUCCESS);
727     PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer");
728     PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer");
729     if (stream || (kind != cupmMemcpyDeviceToHost)) {
730       PetscCallCUPM(cupmMemcpy2DAsync(dest, dest_pitch_bytes, src, src_pitch_bytes, width_bytes, height, kind, stream));
731     } else {
732       PetscCallCUPM(cupmMemcpy2D(dest, dest_pitch_bytes, src, src_pitch_bytes, width_bytes, height, kind));
733     }
734     PetscCall(PetscLogCUPMMemcpyTransfer(kind, size));
735     PetscFunctionReturn(PETSC_SUCCESS);
736   }
737 
738   template <typename D>
PetscCUPMMemcpy2DPetsc::device::cupm::impl::Interface739   static PetscErrorCode PetscCUPMMemcpy2D(D *dest, std::size_t dest_pitch, const util::type_identity_t<D> *src, std::size_t src_pitch, std::size_t width, std::size_t height, cupmMemcpyKind_t kind)
740   {
741     PetscFunctionBegin;
742     PetscCall(PetscCUPMMemcpy2DAsync(dest, dest_pitch, src, src_pitch, width, height, kind));
743     PetscFunctionReturn(PETSC_SUCCESS);
744   }
745 
746   template <typename M>
PetscCUPMMemsetAsyncPetsc::device::cupm::impl::Interface747   static PetscErrorCode PetscCUPMMemsetAsync(M *ptr, int value, std::size_t n, cupmStream_t stream = nullptr, bool use_async = false) noexcept
748   {
749     static_assert(!std::is_void<M>::value, "");
750 
751     PetscFunctionBegin;
752     if (PetscLikely(n)) {
753       const auto bytes = n * sizeof(M);
754 
755       PetscCheck(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", n);
756       if (stream || use_async) {
757         PetscCallCUPM(cupmMemsetAsync(ptr, value, bytes, stream));
758       } else {
759         PetscCallCUPM(cupmMemset(ptr, value, bytes));
760       }
761     }
762     PetscFunctionReturn(PETSC_SUCCESS);
763   }
764 
765   template <typename M>
PetscCUPMMemsetPetsc::device::cupm::impl::Interface766   static PetscErrorCode PetscCUPMMemset(M *ptr, int value, std::size_t n) noexcept
767   {
768     PetscFunctionBegin;
769     PetscCall(PetscCUPMMemsetAsync(ptr, value, n));
770     PetscFunctionReturn(PETSC_SUCCESS);
771   }
772 
773   template <typename D>
PetscCUPMMemset2DAsyncPetsc::device::cupm::impl::Interface774   static PetscErrorCode PetscCUPMMemset2DAsync(D *ptr, std::size_t pitch, int value, std::size_t width, std::size_t height, cupmStream_t stream = nullptr)
775   {
776     static_assert(!std::is_void<D>::value, "");
777     const auto pitch_bytes = pitch * sizeof(D);
778     const auto width_bytes = width * sizeof(D);
779     const auto size        = width_bytes * height;
780 
781     PetscFunctionBegin;
782     if (PetscUnlikely(!size)) PetscFunctionReturn(PETSC_SUCCESS);
783     PetscAssert(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", size);
784     if (stream) {
785       PetscCallCUPM(cupmMemset2DAsync(ptr, pitch_bytes, value, width_bytes, height, stream));
786     } else {
787       PetscCallCUPM(cupmMemset2D(ptr, pitch_bytes, value, width_bytes, height));
788     }
789     PetscFunctionReturn(PETSC_SUCCESS);
790   }
791 
792   // these we can transparently wrap, no need to namespace it to Petsc
793   template <typename M>
cupmFreeAsyncPetsc::device::cupm::impl::Interface794   PETSC_NODISCARD static cupmError_t cupmFreeAsync(M &ptr, cupmStream_t stream = nullptr) noexcept
795   {
796     static_assert(std::is_pointer<util::decay_t<M>>::value, "");
797     static_assert(!std::is_const<M>::value, "");
798 
799     if (ptr) {
800       auto cerr = interface_type::cupmFreeAsync(std::forward<M>(ptr), stream);
801 
802       ptr = nullptr;
803       if (PetscUnlikely(cerr != cupmSuccess)) return cerr;
804     }
805     return cupmSuccess;
806   }
807 
cupmFreeAsyncPetsc::device::cupm::impl::Interface808   PETSC_NODISCARD static cupmError_t cupmFreeAsync(std::nullptr_t ptr, cupmStream_t stream = nullptr) { return interface_type::cupmFreeAsync(ptr, stream); }
809 
810   template <typename M>
cupmFreePetsc::device::cupm::impl::Interface811   PETSC_NODISCARD static cupmError_t cupmFree(M &ptr) noexcept
812   {
813     return cupmFreeAsync(ptr);
814   }
815 
cupmFreePetsc::device::cupm::impl::Interface816   PETSC_NODISCARD static cupmError_t cupmFree(std::nullptr_t ptr) { return cupmFreeAsync(ptr); }
817 
818   template <typename M>
cupmFreeHostPetsc::device::cupm::impl::Interface819   PETSC_NODISCARD static cupmError_t cupmFreeHost(M &ptr) noexcept
820   {
821     static_assert(std::is_pointer<util::decay_t<M>>::value, "");
822     const auto cerr = interface_type::cupmFreeHost(std::forward<M>(ptr));
823     ptr             = nullptr;
824     return cerr;
825   }
826 
cupmFreeHostPetsc::device::cupm::impl::Interface827   PETSC_NODISCARD static cupmError_t cupmFreeHost(std::nullptr_t ptr) { return interface_type::cupmFreeHost(ptr); }
828 
829   // specific wrapper for device launch function, as the real function is a C routine and
830   // doesn't have variable arguments. The actual mechanics of this are a bit complicated but
831   // boils down to the fact that ultimately we pass a
832   //
833   // void *args[] = {(void*)&kernel_args...};
834   //
835   // to the kernel launcher. Since we pass void* this means implicit conversion does **not**
836   // happen to the kernel arguments so we must do it ourselves here. This function does this in
837   // 3 stages:
838   // 1. Enumerate the kernel arguments (cupmLaunchKernel)
839   // 2. Deduce the signature of func() and static_cast the kernel arguments to the type
840   //    expected by func() using the enumeration above (deduceKernelCall)
841   // 3. Form the void* array with the converted arguments and call cuda/hipLaunchKernel with
842   //    it. (interface_type::cupmLaunchKernel)
843   template <typename F, typename... Args>
cupmLaunchKernelPetsc::device::cupm::impl::Interface844   PETSC_NODISCARD static cupmError_t cupmLaunchKernel(F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
845   {
846     return deduceKernelCall(util::index_sequence_for<Args...>{}, std::forward<F>(func), std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), std::forward<Args>(kernelArgs)...);
847   }
848 
849   template <std::size_t block_size = 256, std::size_t warp_size = 32, typename F, typename... Args>
PetscCUPMLaunchKernel1DPetsc::device::cupm::impl::Interface850   static PetscErrorCode PetscCUPMLaunchKernel1D(std::size_t n, std::size_t sharedMem, cupmStream_t stream, F &&func, Args &&...kernelArgs) noexcept
851   {
852     static_assert(block_size > 0, "");
853     static_assert(warp_size > 0, "");
854     // want block_size to be a multiple of the warp_size
855     static_assert(block_size % warp_size == 0, "");
856     const auto nthread = std::min(n, block_size);
857     const auto nblock  = (n + block_size - 1) / block_size;
858 
859     PetscFunctionBegin;
860     // if n = 0 then nthread = 0, which is not allowed. rather than letting the user try to
861     // decipher cryptic 'cuda/hipErrorLaunchFailure' we explicitly check for zero here
862     PetscAssert(nthread, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Trying to launch kernel with grid/block size 0");
863     PetscCallCUPM(cupmLaunchKernel(std::forward<F>(func), (unsigned int)nblock, (unsigned int)nthread, sharedMem, stream, std::forward<Args>(kernelArgs)...));
864     PetscFunctionReturn(PETSC_SUCCESS);
865   }
866 
867 private:
868   template <typename S, typename D, typename = void>
869   struct is_static_castable : std::false_type { };
870 
871   template <typename S, typename D>
872   struct is_static_castable<S, D, util::void_t<decltype(static_cast<D>(std::declval<S>()))>> : std::true_type { };
873 
874   template <typename D, typename S>
cast_toPetsc::device::cupm::impl::Interface875   static constexpr util::enable_if_t<is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
876   {
877     return static_cast<D>(std::forward<S>(src));
878   }
879 
880   template <typename D, typename S>
cast_toPetsc::device::cupm::impl::Interface881   static constexpr util::enable_if_t<!is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
882   {
883     return const_cast<D>(std::forward<S>(src));
884   }
885 
886   template <typename F, typename... Args, std::size_t... Idx>
deduceKernelCallPetsc::device::cupm::impl::Interface887   PETSC_NODISCARD static cupmError_t deduceKernelCall(util::index_sequence<Idx...>, F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
888   {
889     // clang-format off
890     return interface_type::cupmLaunchKernel(
891       std::forward<F>(func),
892       std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream),
893       // can't static_cast() here since the function argument type may be cv-qualified, in
894       // which case we would need to const_cast(). But you can only const_cast() indirect types
895       // (pointers, references). So we need a SFINAE monster that is a static_cast() if
896       // possible, and a const_cast() if not. We could just use a C-style cast which *would*
897       // work here since it tries the following and uses the first one that succeeds:
898       //
899       // 1. const_cast()
900       // 2. static_cast()
901       // 3. static_cast() then const_cast()
902       // 4. reinterpret_cast()...
903       //
904       // the issue however is the final reinterpret_cast(). We absolutely cannot get there
905       // because doing so would silently hide a ton of bugs, for example casting a PetscScalar
906       // * to double * in complex builds, a PetscInt * to int * in 64idx builds, etc.
907       cast_to<typename util::func_traits<F>::template arg<Idx>::type>(std::forward<Args>(kernelArgs))...
908     );
909     // clang-format on
910   }
911 
PetscLogCUPMMemcpyTransferPetsc::device::cupm::impl::Interface912   static PetscErrorCode PetscLogCUPMMemcpyTransfer(cupmMemcpyKind_t kind, std::size_t size) noexcept
913   {
914     PetscFunctionBegin;
915     // only the explicit HTOD or DTOH are handled, since we either don't log the other cases
916     // (yet) or don't know the direction
917     if (kind == cupmMemcpyDeviceToHost) PetscCall(PetscLogGpuToCpu(static_cast<PetscLogDouble>(size)));
918     else if (kind == cupmMemcpyHostToDevice) PetscCall(PetscLogCpuToGpu(static_cast<PetscLogDouble>(size)));
919     else (void)size;
920     PetscFunctionReturn(PETSC_SUCCESS);
921   }
922 };
923 
924 #undef PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND
925 
926 #define PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(T) \
927   PETSC_CUPM_IMPL_CLASS_HEADER(T); \
928   using cupmReal_t   = typename ::Petsc::device::cupm::impl::Interface<T>::cupmReal_t; \
929   using cupmScalar_t = typename ::Petsc::device::cupm::impl::Interface<T>::cupmScalar_t; \
930   using ::Petsc::device::cupm::impl::Interface<T>::cupmScalarCast; \
931   using ::Petsc::device::cupm::impl::Interface<T>::cupmScalarPtrCast; \
932   using ::Petsc::device::cupm::impl::Interface<T>::cupmRealPtrCast; \
933   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMGetMemType; \
934   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemset; \
935   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemsetAsync; \
936   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMalloc; \
937   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMallocAsync; \
938   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMallocHost; \
939   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy; \
940   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpyAsync; \
941   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy2D; \
942   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy2DAsync; \
943   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemset2DAsync; \
944   using ::Petsc::device::cupm::impl::Interface<T>::cupmFree; \
945   using ::Petsc::device::cupm::impl::Interface<T>::cupmFreeAsync; \
946   using ::Petsc::device::cupm::impl::Interface<T>::cupmFreeHost; \
947   using ::Petsc::device::cupm::impl::Interface<T>::cupmLaunchKernel; \
948   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMLaunchKernel1D; \
949   using ::Petsc::device::cupm::impl::Interface<T>::PetscDeviceCopyModeToCUPMMemcpyKind
950 
951 #if PetscDefined(HAVE_CUDA)
952 extern template struct PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL Interface<DeviceType::CUDA>;
953 #endif
954 
955 #if PetscDefined(HAVE_HIP)
956 extern template struct PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL Interface<DeviceType::HIP>;
957 #endif
958 
959 } // namespace impl
960 
961 } // namespace cupm
962 
963 } // namespace device
964 
965 } // namespace Petsc
966