xref: /petsc/include/petsc/private/veccupmimpl.h (revision ae51ba108530e1a8d2aeb8bc3771a24c2fcd801e)
1 #ifndef PETSCVECCUPMIMPL_H
2 #define PETSCVECCUPMIMPL_H
3 
4 #include <petsc/private/vecimpl.h>
5 #include <../src/vec/vec/impls/dvecimpl.h> // for Vec_Seq
6 
// NVSHMEM support routines. The declarations below only exist when PETSc was configured with
// NVSHMEM; otherwise only a stub PetscNvshmemFree() is provided so callers still compile.
#if PetscDefined(HAVE_NVSHMEM)
PETSC_INTERN PetscErrorCode PetscNvshmemInitializeCheck(void);
PETSC_INTERN PetscErrorCode PetscNvshmemMalloc(size_t, void **);
PETSC_INTERN PetscErrorCode PetscNvshmemCalloc(size_t, void **);
PETSC_INTERN PetscErrorCode PetscNvshmemFree_Private(void *);
  // PetscNvshmemFree(ptr): does nothing when ptr is null. Otherwise it calls
  // PetscNvshmemFree_Private(ptr) and, only if that returns zero, resets ptr to
  // PETSC_NULLPTR (so ptr must be a modifiable lvalue). Note that the result is the truth
  // value of the expression cast to PetscErrorCode: a failing free therefore always yields
  // the value 1 rather than the error code returned by PetscNvshmemFree_Private().
  #define PetscNvshmemFree(ptr) ((PetscErrorCode)((ptr) && (PetscNvshmemFree_Private(ptr) || ((ptr) = PETSC_NULLPTR, PETSC_SUCCESS))))
PETSC_INTERN PetscErrorCode PetscNvshmemSum(PetscInt, PetscScalar *, const PetscScalar *);
PETSC_INTERN PetscErrorCode PetscNvshmemMax(PetscInt, PetscReal *, const PetscReal *);
PETSC_INTERN PetscErrorCode VecNormAsync_NVSHMEM(Vec, NormType, PetscReal *);
PETSC_INTERN PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec);
#else
  // Without NVSHMEM the macro ignores its argument (it is not evaluated) and reports success
  #define PetscNvshmemFree(ptr) PETSC_SUCCESS
#endif
20 
21 #if defined(__cplusplus) && PetscDefined(HAVE_DEVICE)
22   #include <petsc/private/deviceimpl.h>
23   #include <petsc/private/cupmblasinterface.hpp>
24 
25   #include <petsc/private/cpp/functional.hpp>
26 
27   #include <limits>  // std::numeric_limits
28   #include <cstring> // std::memset
29 
30 namespace Petsc
31 {
32 
33 namespace vec
34 {
35 
36 namespace cupm
37 {
38 
39 namespace impl
40 {
41 
42 namespace
43 {
44 
45 // ==========================================================================================
46 // UseCUPMHostAlloc_
47 //
48 // A simple RAII helper for PetscMallocSet[CUDA|HIP]Host(). it exists because integrating the
49 // regular versions would be an enormous pain to square with the templated types...
50 // ==========================================================================================
51 template <device::cupm::DeviceType T>
52 class UseCUPMHostAlloc_ : device::cupm::impl::Interface<T> {
53 public:
54   PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(interface_type, T);
55 
56   UseCUPMHostAlloc_(bool) noexcept;
57   ~UseCUPMHostAlloc_() noexcept;
58 
59   PETSC_NODISCARD bool value() const noexcept;
60 
61 private:
62     // would have loved to just do
63     //
64     // const auto oldmalloc = PetscTrMalloc;
65     //
66     // but in order to use auto the member needs to be static; in order to be static it must
67     // also be constexpr -- which in turn requires an initializer (also implicitly required by
68     // auto). But constexpr needs a constant expression initializer, so we can't initialize it
69     // with global (mutable) variables...
70   #define DECLTYPE_AUTO(left, right) decltype(right) left = right
71   const DECLTYPE_AUTO(oldmalloc_, PetscTrMalloc);
72   const DECLTYPE_AUTO(oldfree_, PetscTrFree);
73   const DECLTYPE_AUTO(oldrealloc_, PetscTrRealloc);
74   #undef DECLTYPE_AUTO
75   bool v_;
76 };
77 
78 template <device::cupm::DeviceType T>
79 inline UseCUPMHostAlloc_<T>::UseCUPMHostAlloc_(bool useit) noexcept : v_(useit)
80 {
81   PetscFunctionBegin;
82   if (useit) {
83     // all unused arguments are un-named, this saves having to add PETSC_UNUSED to them all
84     PetscTrMalloc = [](std::size_t sz, PetscBool clear, int, const char *, const char *, void **ptr) {
85       PetscFunctionBegin;
86       PetscCallCUPM(cupmMallocHost(ptr, sz));
87       if (clear) std::memset(*ptr, 0, sz);
88       PetscFunctionReturn(PETSC_SUCCESS);
89     };
90     PetscTrFree = [](void *ptr, int, const char *, const char *) {
91       PetscFunctionBegin;
92       PetscCallCUPM(cupmFreeHost(ptr));
93       PetscFunctionReturn(PETSC_SUCCESS);
94     };
95     PetscTrRealloc = [](std::size_t, int, const char *, const char *, void **) {
96       // REVIEW ME: can be implemented by malloc->copy->free?
97       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "%s has no realloc()", cupmName());
98     };
99   }
100   PetscFunctionReturnVoid();
101 }
102 
// Returns the flag given to the constructor, i.e. whether this object replaced the global
// allocation hooks (and therefore whether the destructor will put them back).
template <device::cupm::DeviceType T>
inline bool UseCUPMHostAlloc_<T>::value() const noexcept
{
  return v_;
}
108 
109 template <device::cupm::DeviceType T>
110 inline UseCUPMHostAlloc_<T>::~UseCUPMHostAlloc_() noexcept
111 {
112   PetscFunctionBegin;
113   if (value()) {
114     PetscTrMalloc  = oldmalloc_;
115     PetscTrFree    = oldfree_;
116     PetscTrRealloc = oldrealloc_;
117   }
118   PetscFunctionReturnVoid();
119 }
120 
121 struct no_op {
122   template <typename... T>
123   constexpr PetscErrorCode operator()(T &&...) const noexcept
124   {
125     return PETSC_SUCCESS;
126   }
127 };
128 
// Bundles references to a device pointer and a host pointer of the same element type
// together with a count. Both pointers are held by reference, so writes through a CooPair
// modify the caller's pointer variables, and the referenced pointers must outlive the pair.
template <typename T>
struct CooPair {
  using value_type = T;
  using size_type  = PetscCount;

  value_type *&device; // reference to the caller's device pointer
  value_type *&host;   // reference to the caller's host pointer
  size_type    size;   // count associated with the two pointers
};
138 
139 template <typename U>
140 static constexpr CooPair<U> make_coo_pair(U *&device, U *&host, PetscCount size) noexcept
141 {
142   return {device, host, size};
143 }
144 
145 } // anonymous namespace
146 
147 // forward declarations
148 template <device::cupm::DeviceType>
149 class VecSeq_CUPM;
150 template <device::cupm::DeviceType>
151 class VecMPI_CUPM;
152 
153 // ==========================================================================================
154 // Vec_CUPMBase
155 //
156 // Base class for the VecSeq and VecMPI CUPM implementations. On top of the usual DeviceType
157 // template parameter it also uses CRTP to be able to use values/calls specific to either
158 // VecSeq or VecMPI. This is in effect "inside-out" polymorphism.
159 // ==========================================================================================
160 template <device::cupm::DeviceType T, typename Derived>
161 class Vec_CUPMBase : device::cupm::impl::BlasInterface<T> {
162 public:
163   PETSC_CUPMBLAS_INHERIT_INTERFACE_TYPEDEFS_USING(cupmBlasInterface_t, T);
164   // ==========================================================================================
165   // Vec_CUPMBase::vector_array
166   //
167   // RAII versions of the get/restore array routines. Determines constness of the pointer type,
168   // holds the pointer itself provides the implicit conversion operator
169   // ==========================================================================================
170   template <PetscMemType, PetscMemoryAccessMode>
171   class vector_array;
172 
173 private:
174   // A debug check to ensure that a given pointer-memtype pairing taken from user-land is
175   // actually correct. Errors on mismatch
176   static PetscErrorCode CheckPointerMatchesMemType_(const void *ptr, PetscMemType mtype) noexcept
177   {
178     PetscFunctionBegin;
179     if (PetscDefined(USE_DEBUG) && ptr) {
180       PetscMemType ptr_mtype;
181 
182       PetscCall(PetscCUPMGetMemType(ptr, &ptr_mtype));
183       if (mtype == PETSC_MEMTYPE_HOST) {
184         PetscCheck(PetscMemTypeHost(ptr_mtype), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p declared as %s does not match actual memtype %s", ptr, PetscMemTypeToString(mtype), PetscMemTypeToString(ptr_mtype));
185       } else if (mtype == PETSC_MEMTYPE_DEVICE) {
186         // generic "device" memory should only care if the actual memtype is also generically
187         // "device"
188         PetscCheck(PetscMemTypeDevice(ptr_mtype), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p declared as %s does not match actual memtype %s", ptr, PetscMemTypeToString(mtype), PetscMemTypeToString(ptr_mtype));
189       } else {
190         PetscCheck(mtype == ptr_mtype, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p declared as %s does not match actual memtype %s", ptr, PetscMemTypeToString(mtype), PetscMemTypeToString(ptr_mtype));
191       }
192     }
193     PetscFunctionReturn(PETSC_SUCCESS);
194   }
195 
196   // The final stop in the GetHandles_/GetFromHandles_ chain. This retrieves the various
197   // compute handles and ensure the given PetscDeviceContext is of the right type
198   static PetscErrorCode GetFromHandleDispatch_(PetscDeviceContext, cupmBlasHandle_t *, cupmStream_t *) noexcept;
199   static PetscErrorCode GetHandleDispatch_(PetscDeviceContext *, cupmBlasHandle_t *, cupmStream_t *) noexcept;
200 
201 protected:
202   static PetscErrorCode VecView_Debug(Vec v, const char *message = "") noexcept
203   {
204     const auto   pobj  = PetscObjectCast(v);
205     const auto   vimpl = VecIMPLCast(v);
206     const auto   vcu   = VecCUPMCast(v);
207     PetscMemType mtype;
208     MPI_Comm     comm;
209 
210     PetscFunctionBegin;
211     PetscValidPointer(vimpl, 1);
212     PetscValidPointer(vcu, 1);
213     PetscCall(PetscObjectGetComm(pobj, &comm));
214     PetscCall(PetscPrintf(comm, "---------- %s ----------\n", message));
215     PetscCall(PetscObjectPrintClassNamePrefixType(pobj, PETSC_VIEWER_STDOUT_(comm)));
216     PetscCall(PetscPrintf(comm, "Address:             %p\n", v));
217     PetscCall(PetscPrintf(comm, "Size:                %" PetscInt_FMT "\n", v->map->n));
218     PetscCall(PetscPrintf(comm, "Offload mask:        %s\n", PetscOffloadMaskToString(v->offloadmask)));
219     PetscCall(PetscPrintf(comm, "Host ptr:            %p\n", vimpl->array));
220     PetscCall(PetscPrintf(comm, "Device ptr:          %p\n", vcu->array_d));
221     PetscCall(PetscPrintf(comm, "Device alloced ptr:  %p\n", vcu->array_allocated_d));
222     PetscCall(PetscCUPMGetMemType(vcu->array_d, &mtype));
223     PetscCall(PetscPrintf(comm, "dptr is device mem?  %s\n", PetscBools[static_cast<PetscBool>(PetscMemTypeDevice(mtype))]));
224     PetscFunctionReturn(PETSC_SUCCESS);
225   }
226 
227   // Helper routines to retrieve various combinations of handles. The first set (GetHandles_)
228   // gets a PetscDeviceContext along with it, while the second set (GetHandlesFrom_) assumes
229   // you've gotten the PetscDeviceContext already, and retrieves the handles from it. All of
230   // them check that the PetscDeviceContext is of the appropriate type
231   static PetscErrorCode GetHandles_(PetscDeviceContext *, cupmBlasHandle_t * = nullptr, cupmStream_t * = nullptr) noexcept;
232   static PetscErrorCode GetHandles_(PetscDeviceContext *, cupmStream_t *) noexcept;
233   static PetscErrorCode GetHandles_(cupmStream_t *) noexcept;
234   static PetscErrorCode GetHandles_(cupmBlasHandle_t *) noexcept;
235 
236   static PetscErrorCode GetHandlesFrom_(PetscDeviceContext, cupmBlasHandle_t *, cupmStream_t * = nullptr) noexcept;
237   static PetscErrorCode GetHandlesFrom_(PetscDeviceContext, cupmStream_t *) noexcept;
238 
239   // Delete the allocated device array if required and replace it with the given array
240   static PetscErrorCode ResetAllocatedDevicePtr_(PetscDeviceContext, Vec, PetscScalar * = nullptr) noexcept;
241   // Check either the host or device impl pointer is allocated and allocate it if
242   // isn't. CastFunctionType casts the Vec to the required type and returns the pointer
243   template <typename CastFunctionType>
244   static PetscErrorCode VecAllocateCheck_(Vec, void *&, CastFunctionType &&) noexcept;
245   // Check the CUPM part (v->spptr) is allocated, otherwise allocate it
246   static PetscErrorCode VecCUPMAllocateCheck_(Vec) noexcept;
247   // Check the Host part (v->data) is allocated, otherwise allocate it
248   static PetscErrorCode VecIMPLAllocateCheck_(Vec) noexcept;
249   // Check the Host array is allocated, otherwise allocate it
250   static PetscErrorCode HostAllocateCheck_(PetscDeviceContext, Vec) noexcept;
251   // Check the CUPM array is allocated, otherwise allocate it
252   static PetscErrorCode DeviceAllocateCheck_(PetscDeviceContext, Vec) noexcept;
253   // Copy HTOD, allocating device if necessary
254   static PetscErrorCode CopyToDevice_(PetscDeviceContext, Vec, bool = false) noexcept;
255   // Copy DTOH, allocating host if necessary
256   static PetscErrorCode CopyToHost_(PetscDeviceContext, Vec, bool = false) noexcept;
257 
258 public:
259   struct Vec_CUPM {
260     PetscScalar *array_d;           // gpu data
261     PetscScalar *array_allocated_d; // does PETSc own the array ptr?
262     PetscBool    nvshmem;           // is array allocated in nvshmem? It is used to allocate
263                                     // Mvctx->lvec in nvshmem
264 
265     // COO stuff
266     PetscCount *jmap1_d; // [m+1]: i-th entry of the vector has jmap1[i+1]-jmap1[i] repeats
267                          // in COO arrays
268     PetscCount *perm1_d; // [tot1]: permutation array for local entries
269     PetscCount *imap2_d; // [nnz2]: i-th unique entry in recvbuf is imap2[i]-th entry in
270                          // the vector
271     PetscCount *jmap2_d; // [nnz2+1]
272     PetscCount *perm2_d; // [recvlen]
273     PetscCount *Cperm_d; // [sendlen]: permutation array to fill sendbuf[]. 'C' for
274                          // communication
275 
276     // Buffers for remote values in VecSetValuesCOO()
277     PetscScalar *sendbuf_d;
278     PetscScalar *recvbuf_d;
279   };
280 
281   // Cast the Vec to its Vec_CUPM struct, i.e. return the result of (Vec_CUPM *)v->spptr
282   PETSC_NODISCARD static Vec_CUPM *VecCUPMCast(Vec) noexcept;
283   // Cast the Vec to its host struct, i.e. return the result of (Vec_Seq *)v->data
284   template <typename U = Derived>
285   PETSC_NODISCARD static constexpr auto VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v));
286   // Get the PetscLogEvents for HTOD and DTOH
287   PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyToGPU() noexcept;
288   PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyFromGPU() noexcept;
289   // Get the VecTypes
290   PETSC_NODISCARD static constexpr VecType VECSEQCUPM() noexcept;
291   PETSC_NODISCARD static constexpr VecType VECMPICUPM() noexcept;
292   // Get the VecType of the calling vector
293   template <typename U = Derived>
294   PETSC_NODISCARD static constexpr VecType         VECIMPLCUPM() noexcept;
295   PETSC_NODISCARD static constexpr PetscRandomType PETSCDEVICERAND() noexcept;
296 
297   // Call the host destroy function, i.e. VecDestroy_Seq()
298   static PetscErrorCode VecDestroy_IMPL(Vec) noexcept;
299   // Call the host reset function, i.e. VecResetArray_Seq()
300   static PetscErrorCode VecResetArray_IMPL(Vec) noexcept;
301   // ... you get the idea
302   static PetscErrorCode VecPlaceArray_IMPL(Vec, const PetscScalar *) noexcept;
303   // Call the host creation function, i.e. VecCreate_Seq(), and also initialize the CUPM part
304   // along with it if needed
305   static PetscErrorCode VecCreate_IMPL_Private(Vec, PetscBool *, PetscInt = 0, PetscScalar * = nullptr) noexcept;
306 
307   // Shorthand for creating vector_array's. Need functions to create them, otherwise using them
308   // as an unnamed temporary leads to most vexing parse
309   PETSC_NODISCARD static auto DeviceArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>{dctx, v});
310   PETSC_NODISCARD static auto DeviceArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
311   PETSC_NODISCARD static auto DeviceArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});
312   PETSC_NODISCARD static auto HostArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>{dctx, v});
313   PETSC_NODISCARD static auto HostArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
314   PETSC_NODISCARD static auto HostArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});
315 
316   // disallow implicit conversion
317   template <typename U>
318   PETSC_NODISCARD static UseCUPMHostAlloc_<T> UseCUPMHostAlloc(U) noexcept = delete;
319   // utility for using cupmHostAlloc()
320   PETSC_NODISCARD static UseCUPMHostAlloc_<T> UseCUPMHostAlloc(bool) noexcept;
321   PETSC_NODISCARD static UseCUPMHostAlloc_<T> UseCUPMHostAlloc(PetscBool) noexcept;
322 
323   // ops-table functions
324   static PetscErrorCode create(Vec) noexcept;
325   static PetscErrorCode destroy(Vec) noexcept;
326   template <PetscMemType, PetscMemoryAccessMode, bool = false>
327   static PetscErrorCode getarray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
328   template <PetscMemType, PetscMemoryAccessMode, bool = false>
329   static PetscErrorCode getarray(Vec, PetscScalar **) noexcept;
330   template <PetscMemType, PetscMemoryAccessMode>
331   static PetscErrorCode restorearray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
332   template <PetscMemType, PetscMemoryAccessMode>
333   static PetscErrorCode restorearray(Vec, PetscScalar **) noexcept;
334   template <PetscMemoryAccessMode>
335   static PetscErrorCode getarrayandmemtype(Vec, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
336   template <PetscMemoryAccessMode>
337   static PetscErrorCode getarrayandmemtype(Vec, PetscScalar **, PetscMemType *) noexcept;
338   template <PetscMemoryAccessMode>
339   static PetscErrorCode restorearrayandmemtype(Vec, PetscScalar **, PetscDeviceContext) noexcept;
340   template <PetscMemoryAccessMode>
341   static PetscErrorCode restorearrayandmemtype(Vec, PetscScalar **) noexcept;
342   template <PetscMemType>
343   static PetscErrorCode replacearray(Vec, const PetscScalar *) noexcept;
344   template <PetscMemType>
345   static PetscErrorCode resetarray(Vec) noexcept;
346   template <PetscMemType>
347   static PetscErrorCode placearray(Vec, const PetscScalar *) noexcept;
348 
349   // common ops shared between Seq and MPI
350   static PetscErrorCode Create_CUPM(Vec) noexcept;
351   static PetscErrorCode Create_CUPMBase(MPI_Comm, PetscInt, PetscInt, PetscInt, Vec *, PetscBool, PetscLayout /*reference*/ = nullptr) noexcept;
352   static PetscErrorCode Initialize_CUPMBase(Vec, PetscBool, PetscScalar *, PetscScalar *, PetscDeviceContext) noexcept;
353   template <typename SetupFunctionT = no_op>
354   static PetscErrorCode Duplicate_CUPMBase(Vec, Vec *, PetscDeviceContext, SetupFunctionT && = SetupFunctionT{}) noexcept;
355   static PetscErrorCode BindToCPU_CUPMBase(Vec, PetscBool, PetscDeviceContext) noexcept;
356   static PetscErrorCode GetArrays_CUPMBase(Vec, const PetscScalar **, const PetscScalar **, PetscOffloadMask *, PetscDeviceContext) noexcept;
357   static PetscErrorCode ResetPreallocationCOO_CUPMBase(Vec, PetscDeviceContext) noexcept;
358   template <std::size_t NCount = 0, std::size_t NScal = 0>
359   static PetscErrorCode SetPreallocationCOO_CUPMBase(Vec, PetscCount, const PetscInt[], PetscDeviceContext, const std::array<CooPair<PetscCount>, NCount> & = {}, const std::array<CooPair<PetscScalar>, NScal> & = {}) noexcept;
360 };
361 
362 // ==========================================================================================
363 // Vec_CUPMBase::vector_array
364 //
365 // RAII versions of the get/restore array routines. Determines constness of the pointer type,
366 // holds the pointer itself and provides the implicit conversion operator.
367 //
368 // On construction this calls the moral equivalent of Vec[CUPM]GetArray[Read|Write]()
369 // (depending on PetscMemoryAccessMode) and on destruction automatically restores the array
370 // for you
371 // ==========================================================================================
372 template <device::cupm::DeviceType T, typename D>
373 template <PetscMemType MT, PetscMemoryAccessMode MA>
374 class Vec_CUPMBase<T, D>::vector_array {
375 public:
376   static const auto memory_type = MT;
377   static const auto access_type = MA;
378 
379   using value_type        = PetscScalar;
380   using pointer_type      = value_type *;
381   using cupm_pointer_type = cupmScalar_t *;
382 
383   vector_array(PetscDeviceContext, Vec) noexcept;
384   ~vector_array() noexcept;
385 
386   constexpr vector_array(vector_array &&) noexcept            = default;
387   constexpr vector_array &operator=(vector_array &&) noexcept = default;
388 
389   pointer_type      data() const noexcept;
390   cupm_pointer_type cupmdata() const noexcept;
391 
392   operator pointer_type() const noexcept;
393   // in case pointer_type == cupmscalar_pointer_type we don't want this overload to exist, so
394   // we make a dummy template parameter to allow SFINAE to nix it for us
395   template <typename U = pointer_type, typename = util::enable_if_t<!std::is_same<U, cupm_pointer_type>::value>>
396   operator cupm_pointer_type() const noexcept;
397 
398 private:
399   pointer_type       ptr_  = nullptr;
400   PetscDeviceContext dctx_ = nullptr;
401   Vec                v_    = nullptr;
402 };
403 
404 // ==========================================================================================
405 // Vec_CUPMBase::vector_array - Static Variables
406 // ==========================================================================================
407 
408 template <device::cupm::DeviceType T, typename D>
409 template <PetscMemType MT, PetscMemoryAccessMode MA>
410 const PetscMemType Vec_CUPMBase<T, D>::vector_array<MT, MA>::memory_type;
411 
412 template <device::cupm::DeviceType T, typename D>
413 template <PetscMemType MT, PetscMemoryAccessMode MA>
414 const PetscMemoryAccessMode Vec_CUPMBase<T, D>::vector_array<MT, MA>::access_type;
415 
416 // ==========================================================================================
417 // Vec_CUPMBase::vector_array - Public API
418 // ==========================================================================================
419 
420 template <device::cupm::DeviceType T, typename D>
421 template <PetscMemType MT, PetscMemoryAccessMode MA>
422 inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::vector_array(PetscDeviceContext dctx, Vec v) noexcept : dctx_(dctx), v_(v)
423 {
424   PetscFunctionBegin;
425   PetscCallAbort(PETSC_COMM_SELF, getarray<MT, MA, true>(v, &ptr_, dctx));
426   PetscFunctionReturnVoid();
427 }
428 
429 template <device::cupm::DeviceType T, typename D>
430 template <PetscMemType MT, PetscMemoryAccessMode MA>
431 inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::~vector_array() noexcept
432 {
433   PetscFunctionBegin;
434   PetscCallAbort(PETSC_COMM_SELF, restorearray<MT, MA>(v_, &ptr_, dctx_));
435   PetscFunctionReturnVoid();
436 }
437 
// Returns the stored array pointer as a PetscScalar pointer.
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
inline typename Vec_CUPMBase<T, D>::template vector_array<MT, MA>::pointer_type Vec_CUPMBase<T, D>::vector_array<MT, MA>::data() const noexcept
{
  return ptr_;
}
444 
// Returns the stored array pointer converted to cupmScalar_t * via cupmScalarPtrCast().
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
inline typename Vec_CUPMBase<T, D>::template vector_array<MT, MA>::cupm_pointer_type Vec_CUPMBase<T, D>::vector_array<MT, MA>::cupmdata() const noexcept
{
  return cupmScalarPtrCast(data());
}
451 
// Implicit conversion to PetscScalar *, equivalent to calling data().
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::operator pointer_type() const noexcept
{
  return data();
}
458 
// Implicit conversion to cupmScalar_t *, equivalent to calling cupmdata().
//
// In case pointer_type == cupm_pointer_type we don't want this overload to exist (it would
// duplicate the conversion to pointer_type), so the declaration carries a dummy template
// parameter that allows SFINAE to nix it for us.
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
template <typename U, typename>
inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::operator cupm_pointer_type() const noexcept
{
  return cupmdata();
}
468 
469 // ==========================================================================================
470 // Vec_CUPMBase - Private API
471 // ==========================================================================================
472 
// Retrieve the BLAS handle and/or stream from an existing device context.
//
// dctx   - the device context to query, validated on entry
// handle - if non-null, receives the BLAS handle of dctx
// stream - if non-null, receives the stream handle of dctx
//
// Either output may be null, in which case that handle is simply not retrieved. In debug
// builds the device type of dctx is additionally checked for compatibility with
// PETSC_DEVICE_CUPM().
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetFromHandleDispatch_(PetscDeviceContext dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
{
  PetscFunctionBegin;
  PetscValidDeviceContext(dctx, 1);
  if (handle) PetscValidPointer(handle, 2);
  if (stream) PetscValidPointer(stream, 3);
  if (PetscDefined(USE_DEBUG)) {
    PetscDeviceType dtype;

    PetscCall(PetscDeviceContextGetDeviceType(dctx, &dtype));
    PetscCheckCompatibleDeviceTypes(PETSC_DEVICE_CUPM(), -1, dtype, 1);
  }
  if (handle) PetscCall(PetscDeviceContextGetBLASHandle_Internal(dctx, handle));
  if (stream) PetscCall(PetscDeviceContextGetStreamHandle_Internal(dctx, stream));
  PetscFunctionReturn(PETSC_SUCCESS);
}
490 
// Look up the current device context and retrieve the requested handles from it.
//
// dctx   - if non-null, receives the current device context; it is only written (beyond the
//          initial null) after the handles were retrieved successfully
// handle - if non-null, receives the BLAS handle
// stream - if non-null, receives the stream handle
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandleDispatch_(PetscDeviceContext *dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
{
  PetscDeviceContext dctx_loc = nullptr;

  PetscFunctionBegin;
  // silence uninitialized variable warnings
  if (dctx) *dctx = nullptr;
  PetscCall(PetscDeviceContextGetCurrentContext(&dctx_loc));
  PetscCall(GetFromHandleDispatch_(dctx_loc, handle, stream));
  if (dctx) *dctx = dctx_loc;
  PetscFunctionReturn(PETSC_SUCCESS);
}
504 
505 // ==========================================================================================
506 // Vec_CUPMBase - Protected API
507 // ==========================================================================================
508 
// Most general GetHandles_() overload: forwards all three (individually optional) outputs
// to GetHandleDispatch_(), which uses the current device context.
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(PetscDeviceContext *dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
{
  return GetHandleDispatch_(dctx, handle, stream);
}
514 
// Get the device context and stream only; no BLAS handle is requested.
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(PetscDeviceContext *dctx, cupmStream_t *stream) noexcept
{
  return GetHandles_(dctx, nullptr, stream);
}
520 
// Get only the stream; the device context output is passed as null.
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(cupmStream_t *stream) noexcept
{
  return GetHandles_(nullptr, stream);
}
526 
// Get only the BLAS handle; the device context output is passed as null.
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(cupmBlasHandle_t *handle) noexcept
{
  return GetHandles_(nullptr, handle);
}
532 
// Retrieve the BLAS handle and/or stream from a device context the caller already holds;
// forwards directly to GetFromHandleDispatch_().
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandlesFrom_(PetscDeviceContext dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
{
  return GetFromHandleDispatch_(dctx, handle, stream);
}
538 
// Retrieve only the stream from a device context the caller already holds; no BLAS handle
// is requested.
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandlesFrom_(PetscDeviceContext dctx, cupmStream_t *stream) noexcept
{
  return GetHandlesFrom_(dctx, nullptr, stream);
}
544 
545 template <device::cupm::DeviceType T, typename D>
546 inline PetscErrorCode Vec_CUPMBase<T, D>::ResetAllocatedDevicePtr_(PetscDeviceContext dctx, Vec v, PetscScalar *new_value) noexcept
547 {
548   auto &device_array = VecCUPMCast(v)->array_allocated_d;
549 
550   PetscFunctionBegin;
551   if (device_array) {
552     if (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) {
553       PetscCall(PetscNvshmemFree(device_array));
554     } else {
555       cupmStream_t stream;
556 
557       PetscCall(GetHandlesFrom_(dctx, &stream));
558       PetscCallCUPM(cupmFreeAsync(device_array, stream));
559     }
560   }
561   device_array = new_value;
562   PetscFunctionReturn(PETSC_SUCCESS);
563 }
564 
565 namespace
566 {
567 
// Query the options database for -vec_pinned_memory_min on the vector's options object and,
// if the option was given, store the value in v->minimum_bytes_pinned_memory. The accepted
// range is [0, max of the local integer type]; the current field value serves as default.
inline PetscErrorCode VecCUPMCheckMinimumPinnedMemory_Internal(Vec v) noexcept
{
  // NOTE(review): if the field is wider than PetscInt this cast can truncate -- confirm that
  // such values cannot occur
  auto      mem = static_cast<PetscInt>(v->minimum_bytes_pinned_memory);
  PetscBool flg;

  PetscFunctionBegin;
  PetscObjectOptionsBegin(PetscObjectCast(v));
  PetscCall(PetscOptionsRangeInt("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", mem, &mem, &flg, 0, std::numeric_limits<decltype(mem)>::max()));
  // only overwrite the stored threshold when the user actually supplied the option
  if (flg) v->minimum_bytes_pinned_memory = mem;
  PetscOptionsEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
580 
581 } // anonymous namespace
582 
583 template <device::cupm::DeviceType T, typename D>
584 template <typename CastFunctionType>
585 inline PetscErrorCode Vec_CUPMBase<T, D>::VecAllocateCheck_(Vec v, void *&dest, CastFunctionType &&cast) noexcept
586 {
587   PetscFunctionBegin;
588   if (PetscLikely(dest)) PetscFunctionReturn(PETSC_SUCCESS);
589   // do the check here so we don't have to do it in every function
590   PetscCall(checkCupmBlasIntCast(v->map->n));
591   {
592     auto impl = cast(v);
593 
594     PetscCall(PetscNew(&impl));
595     dest = impl;
596   }
597   PetscFunctionReturn(PETSC_SUCCESS);
598 }
599 
// Make sure the host implementation struct (v->data) exists, allocating it via
// VecAllocateCheck_() with the derived class' VecIMPLCast() if it does not.
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::VecIMPLAllocateCheck_(Vec v) noexcept
{
  PetscFunctionBegin;
  PetscCall(VecAllocateCheck_(v, v->data, VecIMPLCast<D>));
  PetscFunctionReturn(PETSC_SUCCESS);
}
607 
// Allocate the Vec_CUPM struct (v->spptr) if it does not exist yet. This is normally done
// through DeviceAllocateCheck_(), but in certain circumstances (such as when the user places
// the device array) we do not want to do the full DeviceAllocateCheck_() as it also
// allocates the array.
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::VecCUPMAllocateCheck_(Vec v) noexcept
{
  PetscFunctionBegin;
  PetscCall(VecAllocateCheck_(v, v->spptr, VecCUPMCast));
  PetscFunctionReturn(PETSC_SUCCESS);
}
618 
619 template <device::cupm::DeviceType T, typename D>
620 inline PetscErrorCode Vec_CUPMBase<T, D>::HostAllocateCheck_(PetscDeviceContext, Vec v) noexcept
621 {
622   PetscFunctionBegin;
623   PetscCall(VecIMPLAllocateCheck_(v));
624   if (auto &alloc = VecIMPLCast(v)->array_allocated) PetscFunctionReturn(PETSC_SUCCESS);
625   else {
626     PetscCall(VecCUPMCheckMinimumPinnedMemory_Internal(v));
627     {
628       const auto n     = v->map->n;
629       const auto useit = UseCUPMHostAlloc((n * sizeof(*alloc)) > v->minimum_bytes_pinned_memory);
630 
631       v->pinned_memory = static_cast<decltype(v->pinned_memory)>(useit.value());
632       PetscCall(PetscMalloc1(n, &alloc));
633     }
634     if (!VecIMPLCast(v)->array) VecIMPLCast(v)->array = alloc;
635     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) v->offloadmask = PETSC_OFFLOAD_CPU;
636   }
637   PetscFunctionReturn(PETSC_SUCCESS);
638 }
639 
640 template <device::cupm::DeviceType T, typename D>
641 inline PetscErrorCode Vec_CUPMBase<T, D>::DeviceAllocateCheck_(PetscDeviceContext dctx, Vec v) noexcept
642 {
643   PetscFunctionBegin;
644   PetscCall(VecCUPMAllocateCheck_(v));
645   if (auto &alloc = VecCUPMCast(v)->array_d) PetscFunctionReturn(PETSC_SUCCESS);
646   else {
647     const auto   n                 = v->map->n;
648     auto        &array_allocated_d = VecCUPMCast(v)->array_allocated_d;
649     cupmStream_t stream;
650 
651     PetscCall(GetHandlesFrom_(dctx, &stream));
652     PetscCall(PetscCUPMMallocAsync(&array_allocated_d, n, stream));
653     alloc = array_allocated_d;
654     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
655       const auto vimp = VecIMPLCast(v);
656       v->offloadmask  = (vimp && vimp->array) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
657     }
658   }
659   PetscFunctionReturn(PETSC_SUCCESS);
660 }
661 
662 template <device::cupm::DeviceType T, typename D>
663 inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToDevice_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
664 {
665   PetscFunctionBegin;
666   PetscCall(DeviceAllocateCheck_(dctx, v));
667   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
668     cupmStream_t stream;
669 
670     v->offloadmask = PETSC_OFFLOAD_BOTH;
671     PetscCall(GetHandlesFrom_(dctx, &stream));
672     PetscCall(PetscLogEventBegin(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
673     PetscCall(PetscCUPMMemcpyAsync(VecCUPMCast(v)->array_d, VecIMPLCast(v)->array, v->map->n, cupmMemcpyHostToDevice, stream, forceasync));
674     PetscCall(PetscLogEventEnd(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
675   }
676   PetscFunctionReturn(PETSC_SUCCESS);
677 }
678 
679 template <device::cupm::DeviceType T, typename D>
680 inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToHost_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
681 {
682   PetscFunctionBegin;
683   PetscCall(HostAllocateCheck_(dctx, v));
684   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
685     cupmStream_t stream;
686 
687     v->offloadmask = PETSC_OFFLOAD_BOTH;
688     PetscCall(GetHandlesFrom_(dctx, &stream));
689     PetscCall(PetscLogEventBegin(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
690     PetscCall(PetscCUPMMemcpyAsync(VecIMPLCast(v)->array, VecCUPMCast(v)->array_d, v->map->n, cupmMemcpyDeviceToHost, stream, forceasync));
691     PetscCall(PetscLogEventEnd(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
692   }
693   PetscFunctionReturn(PETSC_SUCCESS);
694 }
695 
696 // ==========================================================================================
697 // Vec_CUPMBase - Public API
698 // ==========================================================================================
699 
// Returns v->spptr converted (via static_cast) to a pointer to the device-side Vec_CUPM
// struct. No null check is done here, so the result is null whenever spptr is.
700 template <device::cupm::DeviceType T, typename D>
701 inline typename Vec_CUPMBase<T, D>::Vec_CUPM *Vec_CUPMBase<T, D>::VecCUPMCast(Vec v) noexcept
702 {
703   return static_cast<Vec_CUPM *>(v->spptr);
704 }
705 
706 // This is a trick to get around the fact that in CRTP the derived class is not yet fully
707 // defined because Base<Derived> must necessarily be instantiated before Derived is
708 // complete. By using a dummy template parameter we make the type "dependent" and so will
709 // only be determined when the derived class is instantiated (and therefore fully defined)
// Returns whatever U::VecIMPLCast_(v) yields; the return type is deduced from that very call.
// NOTE(review): U is presumably defaulted to the derived class D in the declaration, which is
// not visible here - confirm.
710 template <device::cupm::DeviceType T, typename D>
711 template <typename U>
712 inline constexpr auto Vec_CUPMBase<T, D>::VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v))
713 {
714   return U::VecIMPLCast_(v);
715 }
716 
// CRTP dispatch: forwards to the derived class' D::VecDestroy_IMPL_() and returns its error
// code unchanged.
717 template <device::cupm::DeviceType T, typename D>
718 inline PetscErrorCode Vec_CUPMBase<T, D>::VecDestroy_IMPL(Vec v) noexcept
719 {
720   return D::VecDestroy_IMPL_(v);
721 }
722 
// CRTP dispatch: forwards to the derived class' D::VecResetArray_IMPL_() and returns its
// error code unchanged.
723 template <device::cupm::DeviceType T, typename D>
724 inline PetscErrorCode Vec_CUPMBase<T, D>::VecResetArray_IMPL(Vec v) noexcept
725 {
726   return D::VecResetArray_IMPL_(v);
727 }
728 
// CRTP dispatch: forwards v and the array pointer a to the derived class'
// D::VecPlaceArray_IMPL_() and returns its error code unchanged.
729 template <device::cupm::DeviceType T, typename D>
730 inline PetscErrorCode Vec_CUPMBase<T, D>::VecPlaceArray_IMPL(Vec v, const PetscScalar *a) noexcept
731 {
732   return D::VecPlaceArray_IMPL_(v, a);
733 }
734 
// CRTP dispatch: forwards all arguments verbatim to the derived class'
// D::VecCreate_IMPL_Private_() and returns its error code unchanged.
735 template <device::cupm::DeviceType T, typename D>
736 inline PetscErrorCode Vec_CUPMBase<T, D>::VecCreate_IMPL_Private(Vec v, PetscBool *alloc_missing, PetscInt nghost, PetscScalar *host_array) noexcept
737 {
738   return D::VecCreate_IMPL_Private_(v, alloc_missing, nghost, host_array);
739 }
740 
741 template <device::cupm::DeviceType T, typename D>
742 inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyToGPU() noexcept
743 {
744   return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyToGPU : VEC_HIPCopyToGPU;
745 }
746 
747 template <device::cupm::DeviceType T, typename D>
748 inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyFromGPU() noexcept
749 {
750   return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyFromGPU : VEC_HIPCopyFromGPU;
751 }
752 
753 template <device::cupm::DeviceType T, typename D>
754 inline constexpr VecType Vec_CUPMBase<T, D>::VECSEQCUPM() noexcept
755 {
756   return T == device::cupm::DeviceType::CUDA ? VECSEQCUDA : VECSEQHIP;
757 }
758 
759 template <device::cupm::DeviceType T, typename D>
760 inline constexpr VecType Vec_CUPMBase<T, D>::VECMPICUPM() noexcept
761 {
762   return T == device::cupm::DeviceType::CUDA ? VECMPICUDA : VECMPIHIP;
763 }
764 
// Returns the vector type name reported by U::VECIMPLCUPM_().
// NOTE(review): U is presumably a dummy parameter defaulted to the derived class D so that
// the lookup is deferred until instantiation - the declaration is not visible here, confirm.
765 template <device::cupm::DeviceType T, typename D>
766 template <typename U>
767 inline constexpr VecType Vec_CUPMBase<T, D>::VECIMPLCUPM() noexcept
768 {
769   return U::VECIMPLCUPM_();
770 }
771 
772 template <device::cupm::DeviceType T, typename D>
773 inline constexpr PetscRandomType Vec_CUPMBase<T, D>::PETSCDEVICERAND() noexcept
774 {
775   // REVIEW ME: HIP default rng?
776   return T == device::cupm::DeviceType::CUDA ? PETSCCURAND : PETSCRANDER48;
777 }
778 
779 // utility for using cupmHostAlloc()
// Builds the UseCUPMHostAlloc_<T> guard directly in the return slot from b
// (copy-list-initialization, so no copy or move of the guard is involved).
780 template <device::cupm::DeviceType T, typename D>
781 inline UseCUPMHostAlloc_<T> Vec_CUPMBase<T, D>::UseCUPMHostAlloc(bool b) noexcept
782 {
783   return {b};
784 }
785 
// Convenience overload for PetscBool: converts b to bool and delegates to the bool overload.
786 template <device::cupm::DeviceType T, typename D>
787 inline UseCUPMHostAlloc_<T> Vec_CUPMBase<T, D>::UseCUPMHostAlloc(PetscBool b) noexcept
788 {
789   return UseCUPMHostAlloc(static_cast<bool>(b));
790 }
791 
792 // private version that takes a PetscDeviceContext, called by the public variant
793 template <device::cupm::DeviceType T, typename D>
794 template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
795 inline PetscErrorCode Vec_CUPMBase<T, D>::getarray(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
796 {
797   constexpr auto hostmem     = PetscMemTypeHost(mtype);
798   const auto     oldmask     = v->offloadmask;
799   auto          &mask        = v->offloadmask;
800   auto           should_sync = false;
801 
802   PetscFunctionBegin;
803   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
804   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
805   if (PetscMemoryAccessRead(access)) {
806     // READ or READ_WRITE
807     if (((oldmask == PETSC_OFFLOAD_GPU) && hostmem) || ((oldmask == PETSC_OFFLOAD_CPU) && !hostmem)) {
808       // if we move the data we should set the flag to synchronize later on
809       should_sync = true;
810     }
811     PetscCall((hostmem ? CopyToHost_ : CopyToDevice_)(dctx, v, force));
812   } else {
813     // WRITE only
814     PetscCall((hostmem ? HostAllocateCheck_ : DeviceAllocateCheck_)(dctx, v));
815   }
816   *a = hostmem ? VecIMPLCast(v)->array : VecCUPMCast(v)->array_d;
817   // if unallocated previously we should zero things out if we intend to read
818   if (PetscMemoryAccessRead(access) && (oldmask == PETSC_OFFLOAD_UNALLOCATED)) {
819     const auto n = v->map->n;
820 
821     if (hostmem) {
822       PetscCall(PetscArrayzero(*a, n));
823     } else {
824       cupmStream_t stream;
825 
826       PetscCall(GetHandlesFrom_(dctx, &stream));
827       PetscCall(PetscCUPMMemsetAsync(*a, 0, n, stream, force));
828       should_sync = true;
829     }
830   }
831   // update the offloadmask if we intend to write, since we assume immediately modified
832   if (PetscMemoryAccessWrite(access)) {
833     PetscCall(VecSetErrorIfLocked(v, 1));
834     // REVIEW ME: this should probably also call PetscObjectStateIncrease() since we assume it
835     // is immediately modified
836     mask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
837   }
838   // if we are a globally blocking stream and we have MOVED data then we should synchronize,
839   // since even doing async calls on the NULL stream is not synchronous
840   if (!force && should_sync) PetscCall(PetscDeviceContextSynchronize(dctx));
841   PetscFunctionReturn(PETSC_SUCCESS);
842 }
843 
844 // v->ops->getarray[read|write] or VecCUPMGetArray[Read|Write]()
845 template <device::cupm::DeviceType T, typename D>
846 template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
847 inline PetscErrorCode Vec_CUPMBase<T, D>::getarray(Vec v, PetscScalar **a) noexcept
848 {
849   PetscDeviceContext dctx;
850 
851   PetscFunctionBegin;
852   PetscCall(GetHandles_(&dctx));
853   PetscCall(getarray<mtype, access, force>(v, a, dctx));
854   PetscFunctionReturn(PETSC_SUCCESS);
855 }
856 
857 // private version that takes a PetscDeviceContext, called by the public variant
858 template <device::cupm::DeviceType T, typename D>
859 template <PetscMemType mtype, PetscMemoryAccessMode access>
860 inline PetscErrorCode Vec_CUPMBase<T, D>::restorearray(Vec v, PetscScalar **a, PetscDeviceContext) noexcept
861 {
862   PetscFunctionBegin;
863   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
864   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
865   if (PetscMemoryAccessWrite(access)) {
866     // WRITE or READ_WRITE
867     PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
868     v->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
869   }
870   if (a) {
871     PetscCall(CheckPointerMatchesMemType_(*a, mtype));
872     *a = nullptr;
873   }
874   PetscFunctionReturn(PETSC_SUCCESS);
875 }
876 
877 // v->ops->restorearray[read|write] or VecCUPMRestoreArray[Read|Write]()
878 template <device::cupm::DeviceType T, typename D>
879 template <PetscMemType mtype, PetscMemoryAccessMode access>
880 inline PetscErrorCode Vec_CUPMBase<T, D>::restorearray(Vec v, PetscScalar **a) noexcept
881 {
882   PetscDeviceContext dctx;
883 
884   PetscFunctionBegin;
885   PetscCall(GetHandles_(&dctx));
886   PetscCall(restorearray<mtype, access>(v, a, dctx));
887   PetscFunctionReturn(PETSC_SUCCESS);
888 }
889 
890 template <device::cupm::DeviceType T, typename D>
891 template <PetscMemoryAccessMode access>
892 inline PetscErrorCode Vec_CUPMBase<T, D>::getarrayandmemtype(Vec v, PetscScalar **a, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
893 {
894   PetscFunctionBegin;
895   PetscCall(getarray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx));
896   if (mtype) *mtype = (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUPM();
897   PetscFunctionReturn(PETSC_SUCCESS);
898 }
899 
900 // v->ops->getarrayandmemtype
901 template <device::cupm::DeviceType T, typename D>
902 template <PetscMemoryAccessMode access>
903 inline PetscErrorCode Vec_CUPMBase<T, D>::getarrayandmemtype(Vec v, PetscScalar **a, PetscMemType *mtype) noexcept
904 {
905   PetscDeviceContext dctx;
906 
907   PetscFunctionBegin;
908   PetscCall(GetHandles_(&dctx));
909   PetscCall(getarrayandmemtype<access>(v, a, mtype, dctx));
910   PetscFunctionReturn(PETSC_SUCCESS);
911 }
912 
// Counterpart of getarrayandmemtype() taking an explicit device context: simply forwards to
// restorearray<PETSC_MEMTYPE_DEVICE, access>() with the same arguments.
913 template <device::cupm::DeviceType T, typename D>
914 template <PetscMemoryAccessMode access>
915 inline PetscErrorCode Vec_CUPMBase<T, D>::restorearrayandmemtype(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
916 {
917   PetscFunctionBegin;
918   PetscCall(restorearray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx));
919   PetscFunctionReturn(PETSC_SUCCESS);
920 }
921 
922 // v->ops->restorearrayandmemtype
923 template <device::cupm::DeviceType T, typename D>
924 template <PetscMemoryAccessMode access>
925 inline PetscErrorCode Vec_CUPMBase<T, D>::restorearrayandmemtype(Vec v, PetscScalar **a) noexcept
926 {
927   PetscDeviceContext dctx;
928 
929   PetscFunctionBegin;
930   PetscCall(GetHandles_(&dctx));
931   PetscCall(restorearrayandmemtype<access>(v, a, dctx));
932   PetscFunctionReturn(PETSC_SUCCESS);
933 }
934 
935 // v->ops->placearray or VecCUPMPlaceArray()
936 template <device::cupm::DeviceType T, typename D>
937 template <PetscMemType mtype>
938 inline PetscErrorCode Vec_CUPMBase<T, D>::placearray(Vec v, const PetscScalar *a) noexcept
939 {
940   PetscDeviceContext dctx;
941 
942   PetscFunctionBegin;
943   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
944   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
945   PetscCall(CheckPointerMatchesMemType_(a, mtype));
946   PetscCall(GetHandles_(&dctx));
947   if (PetscMemTypeHost(mtype)) {
948     PetscCall(CopyToHost_(dctx, v));
949     PetscCall(VecPlaceArray_IMPL(v, a));
950     v->offloadmask = PETSC_OFFLOAD_CPU;
951   } else {
952     PetscCall(VecIMPLAllocateCheck_(v));
953     {
954       auto &backup_array = VecIMPLCast(v)->unplacedarray;
955 
956       PetscCheck(!backup_array, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "VecPlaceArray() was already called on this vector, without a call to VecResetArray()");
957       PetscCall(CopyToDevice_(dctx, v));
958       PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
959       backup_array = util::exchange(VecCUPMCast(v)->array_d, const_cast<PetscScalar *>(a));
960       // only update the offload mask if we actually assign a pointer
961       if (a) v->offloadmask = PETSC_OFFLOAD_GPU;
962     }
963   }
964   PetscFunctionReturn(PETSC_SUCCESS);
965 }
966 
967 // v->ops->replacearray or VecCUPMReplaceArray()
968 template <device::cupm::DeviceType T, typename D>
969 template <PetscMemType mtype>
970 inline PetscErrorCode Vec_CUPMBase<T, D>::replacearray(Vec v, const PetscScalar *a) noexcept
971 {
972   const auto         aptr = const_cast<PetscScalar *>(a);
973   PetscDeviceContext dctx;
974 
975   PetscFunctionBegin;
976   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
977   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
978   PetscCall(CheckPointerMatchesMemType_(a, mtype));
979   PetscCall(GetHandles_(&dctx));
980   if (PetscMemTypeHost(mtype)) {
981     PetscCall(VecIMPLAllocateCheck_(v));
982     {
983       const auto vimpl      = VecIMPLCast(v);
984       auto      &host_array = vimpl->array_allocated;
985 
986       // make sure the users array has the latest values.
987       // REVIEW ME: why? we're about to free it
988       if (host_array != vimpl->array) PetscCall(CopyToHost_(dctx, v));
989       if (host_array) {
990         const auto useit = UseCUPMHostAlloc(v->pinned_memory);
991 
992         PetscCall(PetscFree(host_array));
993       }
994       host_array       = aptr;
995       vimpl->array     = host_array;
996       v->pinned_memory = PETSC_FALSE; // REVIEW ME: we can determine this
997       v->offloadmask   = PETSC_OFFLOAD_CPU;
998     }
999   } else {
1000     PetscCall(VecCUPMAllocateCheck_(v));
1001     {
1002       const auto vcu = VecCUPMCast(v);
1003 
1004       PetscCall(ResetAllocatedDevicePtr_(dctx, v, aptr));
1005       // don't update the offloadmask if placed pointer is NULL
1006       vcu->array_d = vcu->array_allocated_d /* = aptr */;
1007       if (aptr) v->offloadmask = PETSC_OFFLOAD_GPU;
1008     }
1009   }
1010   PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
1011   PetscFunctionReturn(PETSC_SUCCESS);
1012 }
1013 
1014 // v->ops->resetarray or VecCUPMResetArray()
1015 template <device::cupm::DeviceType T, typename D>
1016 template <PetscMemType mtype>
1017 inline PetscErrorCode Vec_CUPMBase<T, D>::resetarray(Vec v) noexcept
1018 {
1019   PetscDeviceContext dctx;
1020 
1021   PetscFunctionBegin;
1022   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1023   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
1024   PetscCall(GetHandles_(&dctx));
1025   // REVIEW ME:
1026   // this is wildly inefficient but must be done if we assume that the placed array must have
1027   // correct values
1028   if (PetscMemTypeHost(mtype)) {
1029     PetscCall(CopyToHost_(dctx, v));
1030     PetscCall(VecResetArray_IMPL(v));
1031     v->offloadmask = PETSC_OFFLOAD_CPU;
1032   } else {
1033     PetscCall(VecIMPLAllocateCheck_(v));
1034     PetscCall(VecCUPMAllocateCheck_(v));
1035     {
1036       const auto vcu        = VecCUPMCast(v);
1037       const auto vimpl      = VecIMPLCast(v);
1038       auto      &host_array = vimpl->unplacedarray;
1039 
1040       PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_DEVICE));
1041       PetscCall(CopyToDevice_(dctx, v));
1042       PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
1043       // Need to reset the offloadmask. If we had a stashed pointer we are on the GPU,
1044       // otherwise check if the host has a valid pointer. If neither, then we are not
1045       // allocated.
1046       vcu->array_d = host_array;
1047       if (host_array) {
1048         host_array     = nullptr;
1049         v->offloadmask = PETSC_OFFLOAD_GPU;
1050       } else if (vimpl->array) {
1051         v->offloadmask = PETSC_OFFLOAD_CPU;
1052       } else {
1053         v->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1054       }
1055     }
1056   }
1057   PetscFunctionReturn(PETSC_SUCCESS);
1058 }
1059 
1060 // v->ops->create
1061 template <device::cupm::DeviceType T, typename D>
1062 inline PetscErrorCode Vec_CUPMBase<T, D>::create(Vec v) noexcept
1063 {
1064   PetscBool          alloc_missing;
1065   PetscDeviceContext dctx;
1066 
1067   PetscFunctionBegin;
1068   PetscCall(VecCreate_IMPL_Private(v, &alloc_missing));
1069   PetscCall(GetHandles_(&dctx));
1070   PetscCall(Initialize_CUPMBase(v, alloc_missing, nullptr, nullptr, dctx));
1071   PetscFunctionReturn(PETSC_SUCCESS);
1072 }
1073 
1074 // v->ops->destroy
1075 template <device::cupm::DeviceType T, typename D>
1076 inline PetscErrorCode Vec_CUPMBase<T, D>::destroy(Vec v) noexcept
1077 {
1078   PetscFunctionBegin;
1079   if (const auto vcu = VecCUPMCast(v)) {
1080     PetscDeviceContext dctx;
1081 
1082     PetscCall(GetHandles_(&dctx));
1083     PetscCall(ResetAllocatedDevicePtr_(dctx, v));
1084     PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
1085     PetscCall(PetscFree(v->spptr));
1086   }
1087   PetscCall(PetscObjectSAWsViewOff(PetscObjectCast(v)));
1088   if (const auto vimpl = VecIMPLCast(v)) {
1089     if (auto &array_allocated = vimpl->array_allocated) {
1090       const auto useit = UseCUPMHostAlloc(v->pinned_memory);
1091 
1092       // do this ourselves since we may want to use the cupm functions
1093       PetscCall(PetscFree(array_allocated));
1094     }
1095   }
1096   v->pinned_memory = PETSC_FALSE;
1097   PetscCall(VecDestroy_IMPL(v));
1098   PetscFunctionReturn(PETSC_SUCCESS);
1099 }
1100 
1101 // ================================================================================== //
1102 //                      Common core between Seq and MPI                               //
1103 
1104 // VecCreate_CUPM()
1105 template <device::cupm::DeviceType T, typename D>
1106 inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPM(Vec v) noexcept
1107 {
1108   PetscMPIInt size;
1109 
1110   PetscFunctionBegin;
1111   PetscCallMPI(MPI_Comm_size(PetscObjectComm(PetscObjectCast(v)), &size));
1112   PetscCall(VecSetType(v, size > 1 ? VECMPICUPM() : VECSEQCUPM()));
1113   PetscFunctionReturn(PETSC_SUCCESS);
1114 }
1115 
1116 // VecCreateCUPM()
1117 template <device::cupm::DeviceType T, typename D>
1118 inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPMBase(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, Vec *v, PetscBool call_set_type, PetscLayout reference) noexcept
1119 {
1120   PetscFunctionBegin;
1121   PetscCall(VecCreate(comm, v));
1122   if (reference) PetscCall(PetscLayoutReference(reference, &(*v)->map));
1123   PetscCall(VecSetSizes(*v, n, N));
1124   if (bs) PetscCall(VecSetBlockSize(*v, bs));
1125   if (call_set_type) PetscCall(VecSetType(*v, VECIMPLCUPM()));
1126   PetscFunctionReturn(PETSC_SUCCESS);
1127 }
1128 
1129 // VecCreateIMPL_CUPM(), called through v->ops->create
1130 template <device::cupm::DeviceType T, typename D>
1131 inline PetscErrorCode Vec_CUPMBase<T, D>::Initialize_CUPMBase(Vec v, PetscBool allocate_missing, PetscScalar *host_array, PetscScalar *device_array, PetscDeviceContext dctx) noexcept
1132 {
1133   PetscFunctionBegin;
1134   // REVIEW ME: perhaps not needed
1135   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
1136   PetscCall(PetscObjectChangeTypeName(PetscObjectCast(v), VECIMPLCUPM()));
1137   PetscCall(D::bindtocpu(v, PETSC_FALSE));
1138   if (device_array) {
1139     PetscCall(CheckPointerMatchesMemType_(device_array, PETSC_MEMTYPE_CUPM()));
1140     PetscCall(VecCUPMAllocateCheck_(v));
1141     VecCUPMCast(v)->array_d = device_array;
1142   }
1143   if (host_array) {
1144     PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_HOST));
1145     VecIMPLCast(v)->array = host_array;
1146   }
1147   if (allocate_missing) {
1148     PetscCall(DeviceAllocateCheck_(dctx, v));
1149     PetscCall(HostAllocateCheck_(dctx, v));
1150     // REVIEW ME: junchao, is this needed with new calloc() branch? VecSet() will call
1151     // set() for reference
1152     // calls device-version
1153     PetscCall(VecSet(v, 0));
1154     // zero the host while device is underway
1155     PetscCall(PetscArrayzero(VecIMPLCast(v)->array, v->map->n));
1156     v->offloadmask = PETSC_OFFLOAD_BOTH;
1157   } else {
1158     if (host_array) {
1159       v->offloadmask = device_array ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
1160     } else {
1161       v->offloadmask = device_array ? PETSC_OFFLOAD_GPU : PETSC_OFFLOAD_UNALLOCATED;
1162     }
1163   }
1164   PetscFunctionReturn(PETSC_SUCCESS);
1165 }
1166 
1167 // v->ops->duplicate
1168 template <device::cupm::DeviceType T, typename D>
1169 template <typename SetupFunctionT>
1170 inline PetscErrorCode Vec_CUPMBase<T, D>::Duplicate_CUPMBase(Vec v, Vec *y, PetscDeviceContext dctx, SetupFunctionT &&DerivedCreateIMPLCUPM_Async) noexcept
1171 {
1172   // if the derived setup is the default no_op then we should call VecSetType()
1173   constexpr auto call_set_type = static_cast<PetscBool>(std::is_same<SetupFunctionT, no_op>::value);
1174   const auto     vobj          = PetscObjectCast(v);
1175   const auto     map           = v->map;
1176   PetscInt       bs;
1177 
1178   PetscFunctionBegin;
1179   PetscCall(VecGetBlockSize(v, &bs));
1180   PetscCall(Create_CUPMBase(PetscObjectComm(vobj), bs, map->n, map->N, y, call_set_type, map));
1181   // Derived class can set up the remainder of the data structures here
1182   PetscCall(DerivedCreateIMPLCUPM_Async(*y));
1183   // If the other vector is bound to CPU then the memcpy of the ops struct will give the
1184   // duplicated vector the host "getarray" function which does not lazily allocate the array
1185   // (as it is assumed to always exist). So we force allocation here, before we overwrite the
1186   // ops
1187   if (v->boundtocpu) PetscCall(HostAllocateCheck_(dctx, *y));
1188   // in case the user has done some VecSetOps() tomfoolery
1189   PetscCall(PetscArraycpy((*y)->ops, v->ops, 1));
1190   {
1191     const auto yobj = PetscObjectCast(*y);
1192 
1193     PetscCall(PetscObjectListDuplicate(vobj->olist, &yobj->olist));
1194     PetscCall(PetscFunctionListDuplicate(vobj->qlist, &yobj->qlist));
1195   }
1196   (*y)->stash.donotstash   = v->stash.donotstash;
1197   (*y)->stash.ignorenegidx = v->stash.ignorenegidx;
1198   (*y)->map->bs            = std::abs(v->map->bs);
1199   (*y)->bstash.bs          = v->bstash.bs;
1200   PetscFunctionReturn(PETSC_SUCCESS);
1201 }
1202 
// Assigns v->ops->op_name: the variadic (device) implementation unless usehost is set, in
// which case op_host is used. Expects variables named v and usehost at the expansion site.
// Both assignments are spelled out as separate statements rather than merged into one
// conditional expression.
  #define VecSetOp_CUPM(op_name, op_host, ...) \
    do { \
      if (!usehost) { \
        v->ops->op_name = __VA_ARGS__; \
      } else { \
        v->ops->op_name = op_host; \
      } \
    } while (0)
1211 
1212 // v->ops->bindtocpu
1213 template <device::cupm::DeviceType T, typename D>
1214 inline PetscErrorCode Vec_CUPMBase<T, D>::BindToCPU_CUPMBase(Vec v, PetscBool usehost, PetscDeviceContext dctx) noexcept
1215 {
1216   const auto change_default_rand_type = [](PetscRandomType target, char **ptr) {
1217     PetscFunctionBegin;
1218     PetscValidPointer(ptr, 2);
1219     PetscValidCharPointer(*ptr, 2);
1220     if (std::strcmp(target, *ptr)) {
1221       PetscCall(PetscFree(*ptr));
1222       PetscCall(PetscStrallocpy(target, ptr));
1223     }
1224     PetscFunctionReturn(PETSC_SUCCESS);
1225   };
1226 
1227   PetscFunctionBegin;
1228   v->boundtocpu = usehost;
1229   if (usehost) PetscCall(CopyToHost_(dctx, v));
1230   PetscCall(change_default_rand_type(usehost ? PETSCRANDER48 : PETSCDEVICERAND(), &v->defaultrandtype));
1231 
1232   // set the base functions that are guaranteed to be the same for both
1233   v->ops->duplicate = D::duplicate;
1234   v->ops->create    = create;
1235   v->ops->destroy   = destroy;
1236   v->ops->bindtocpu = D::bindtocpu;
1237   // Note that setting these to NULL on host breaks convergence in certain areas. I don't know
1238   // why, and I don't know how, but it is IMPERATIVE these are set as such!
1239   v->ops->replacearray = replacearray<PETSC_MEMTYPE_HOST>;
1240   v->ops->restorearray = restorearray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>;
1241 
1242   // set device-only common functions
1243   VecSetOp_CUPM(dotnorm2, nullptr, D::dotnorm2);
1244   VecSetOp_CUPM(getarray, nullptr, getarray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1245   VecSetOp_CUPM(getarraywrite, nullptr, getarray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1246   VecSetOp_CUPM(restorearraywrite, nullptr, restorearray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1247 
1248   VecSetOp_CUPM(getarrayread, nullptr, [](Vec v, const PetscScalar **a) { return getarray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
1249   VecSetOp_CUPM(restorearrayread, nullptr, [](Vec v, const PetscScalar **a) { return restorearray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
1250 
1251   VecSetOp_CUPM(getarrayandmemtype, nullptr, getarrayandmemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);
1252   VecSetOp_CUPM(restorearrayandmemtype, nullptr, restorearrayandmemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);
1253 
1254   VecSetOp_CUPM(getarraywriteandmemtype, nullptr, getarrayandmemtype<PETSC_MEMORY_ACCESS_WRITE>);
1255   VecSetOp_CUPM(restorearraywriteandmemtype, nullptr, [](Vec v, PetscScalar **a, PetscMemType *) { return restorearrayandmemtype<PETSC_MEMORY_ACCESS_WRITE>(v, a); });
1256 
1257   VecSetOp_CUPM(getarrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a, PetscMemType *m) { return getarrayandmemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a), m); });
1258   VecSetOp_CUPM(restorearrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a) { return restorearrayandmemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
1259 
1260   // set the functions that are always sequential
1261   using VecSeq_T = VecSeq_CUPM<T>;
1262   VecSetOp_CUPM(scale, VecScale_Seq, VecSeq_T::scale);
1263   VecSetOp_CUPM(copy, VecCopy_Seq, VecSeq_T::copy);
1264   VecSetOp_CUPM(set, VecSet_Seq, VecSeq_T::set);
1265   VecSetOp_CUPM(swap, VecSwap_Seq, VecSeq_T::swap);
1266   VecSetOp_CUPM(axpy, VecAXPY_Seq, VecSeq_T::axpy);
1267   VecSetOp_CUPM(axpby, VecAXPBY_Seq, VecSeq_T::axpby);
1268   VecSetOp_CUPM(maxpy, VecMAXPY_Seq, VecSeq_T::maxpy);
1269   VecSetOp_CUPM(aypx, VecAYPX_Seq, VecSeq_T::aypx);
1270   VecSetOp_CUPM(waxpy, VecWAXPY_Seq, VecSeq_T::waxpy);
1271   VecSetOp_CUPM(axpbypcz, VecAXPBYPCZ_Seq, VecSeq_T::axpbypcz);
1272   VecSetOp_CUPM(pointwisemult, VecPointwiseMult_Seq, VecSeq_T::pointwisemult);
1273   VecSetOp_CUPM(pointwisedivide, VecPointwiseDivide_Seq, VecSeq_T::pointwisedivide);
1274   VecSetOp_CUPM(setrandom, VecSetRandom_Seq, VecSeq_T::setrandom);
1275   VecSetOp_CUPM(dot_local, VecDot_Seq, VecSeq_T::dot);
1276   VecSetOp_CUPM(tdot_local, VecTDot_Seq, VecSeq_T::tdot);
1277   VecSetOp_CUPM(norm_local, VecNorm_Seq, VecSeq_T::norm);
1278   VecSetOp_CUPM(mdot_local, VecMDot_Seq, VecSeq_T::mdot);
1279   VecSetOp_CUPM(reciprocal, VecReciprocal_Default, VecSeq_T::reciprocal);
1280   VecSetOp_CUPM(shift, nullptr, VecSeq_T::shift);
1281   VecSetOp_CUPM(getlocalvector, nullptr, VecSeq_T::template getlocalvector<PETSC_MEMORY_ACCESS_READ_WRITE>);
1282   VecSetOp_CUPM(restorelocalvector, nullptr, VecSeq_T::template restorelocalvector<PETSC_MEMORY_ACCESS_READ_WRITE>);
1283   VecSetOp_CUPM(getlocalvectorread, nullptr, VecSeq_T::template getlocalvector<PETSC_MEMORY_ACCESS_READ>);
1284   VecSetOp_CUPM(restorelocalvectorread, nullptr, VecSeq_T::template restorelocalvector<PETSC_MEMORY_ACCESS_READ>);
1285   VecSetOp_CUPM(sum, nullptr, VecSeq_T::sum);
1286   PetscFunctionReturn(PETSC_SUCCESS);
1287 }
1288 
1289 // Called from VecGetSubVector()
// Hands out the raw host and/or device arrays of v without any data movement.
//
// Each output is optional (skipped when its pointer is null):
//   host_array   - the host buffer is allocated on demand, then the active host array is
//                  returned
//   device_array - the device buffer is allocated on demand, then the active device array
//                  is returned
//   mask         - the offload mask as it stands after the allocation checks above
1290 template <device::cupm::DeviceType T, typename D>
1291 inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrays_CUPMBase(Vec v, const PetscScalar **host_array, const PetscScalar **device_array, PetscOffloadMask *mask, PetscDeviceContext dctx) noexcept
1292 {
1293   PetscFunctionBegin;
1294   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
1295   if (host_array) {
1296     PetscCall(HostAllocateCheck_(dctx, v));
1297     *host_array = VecIMPLCast(v)->array;
1298   }
1299   if (device_array) {
1300     PetscCall(DeviceAllocateCheck_(dctx, v));
1301     *device_array = VecCUPMCast(v)->array_d;
1302   }
1303   if (mask) *mask = v->offloadmask;
1304   PetscFunctionReturn(PETSC_SUCCESS);
1305 }
1306 
1307 template <device::cupm::DeviceType T, typename D>
1308 inline PetscErrorCode Vec_CUPMBase<T, D>::ResetPreallocationCOO_CUPMBase(Vec v, PetscDeviceContext dctx) noexcept
1309 {
1310   PetscFunctionBegin;
1311   if (const auto vcu = VecCUPMCast(v)) {
1312     cupmStream_t stream;
1313     // clang-format off
1314     const auto   cntptrs = util::make_array(
1315       std::ref(vcu->jmap1_d),
1316       std::ref(vcu->perm1_d),
1317       std::ref(vcu->imap2_d),
1318       std::ref(vcu->jmap2_d),
1319       std::ref(vcu->perm2_d),
1320       std::ref(vcu->Cperm_d)
1321     );
1322     // clang-format on
1323 
1324     PetscCall(GetHandlesFrom_(dctx, &stream));
1325     for (auto &&ptr : cntptrs) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
1326     for (auto &&ptr : util::make_array(std::ref(vcu->sendbuf_d), std::ref(vcu->recvbuf_d))) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
1327   }
1328   PetscFunctionReturn(PETSC_SUCCESS);
1329 }
1330 
1331 template <device::cupm::DeviceType T, typename D>
1332 template <std::size_t NCount, std::size_t NScal>
1333 inline PetscErrorCode Vec_CUPMBase<T, D>::SetPreallocationCOO_CUPMBase(Vec v, PetscCount, const PetscInt[], PetscDeviceContext dctx, const std::array<CooPair<PetscCount>, NCount> &extra_cntptrs, const std::array<CooPair<PetscScalar>, NScal> &bufptrs) noexcept
1334 {
1335   const auto vimpl = VecIMPLCast(v);
1336 
1337   PetscFunctionBegin;
1338   PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
1339   // need to instantiate the private pointer if not already
1340   PetscCall(VecCUPMAllocateCheck_(v));
1341   {
1342     const auto vcu = VecCUPMCast(v);
1343     // clang-fomat off
1344     const auto cntptrs = util::concat_array(util::make_array(make_coo_pair(vcu->jmap1_d, vimpl->jmap1, v->map->n + 1), make_coo_pair(vcu->perm1_d, vimpl->perm1, vimpl->tot1)), extra_cntptrs);
1345     // clang-format on
1346     cupmStream_t stream;
1347 
1348     PetscCall(GetHandlesFrom_(dctx, &stream));
1349     // allocate
1350     for (auto &elem : cntptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
1351     for (auto &elem : bufptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
1352     // copy
1353     for (const auto &elem : cntptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
1354     for (const auto &elem : bufptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
1355   }
1356   PetscFunctionReturn(PETSC_SUCCESS);
1357 }
1358 
// Boilerplate to be expanded inside a class deriving from Vec_CUPMBase.
//
// Declares `name` as an alias for Vec_CUPMBase<Tp, __VA_ARGS__>, makes that base a friend and
// pulls the listed base members into the class via using-declarations. The expansion ends
// with PETSC_CUPMBLAS_INHERIT_INTERFACE_TYPEDEFS_USING(cupmBlasInterface_t, Tp).
1359   #define PETSC_VEC_CUPM_BASE_CLASS_HEADER(name, Tp, ...) \
1360     using name = ::Petsc::vec::cupm::impl::Vec_CUPMBase<Tp, __VA_ARGS__>; \
1361     friend name; \
1362     /* introspection */ \
1363     using name::VecCUPMCast; \
1364     using name::VecIMPLCast; \
1365     using name::VECIMPLCUPM; \
1366     using name::VECSEQCUPM; \
1367     using name::VECMPICUPM; \
1368     using name::VecView_Debug; \
1369     /* utility */ \
1370     using typename name::Vec_CUPM; \
1371     using name::UseCUPMHostAlloc; \
1372     using name::GetHandles_; \
1373     using name::GetHandlesFrom_; \
1374     using name::VecCUPMAllocateCheck_; \
1375     using name::VecIMPLAllocateCheck_; \
1376     using name::HostAllocateCheck_; \
1377     using name::DeviceAllocateCheck_; \
1378     using name::CopyToDevice_; \
1379     using name::CopyToHost_; \
1380     using name::create; \
1381     using name::destroy; \
1382     using name::getarray; \
1383     using name::restorearray; \
1384     using name::getarrayandmemtype; \
1385     using name::restorearrayandmemtype; \
1386     using name::placearray; \
1387     using name::replacearray; \
1388     using name::resetarray; \
1389     /* base functions */ \
1390     using name::Create_CUPMBase; \
1391     using name::Initialize_CUPMBase; \
1392     using name::Duplicate_CUPMBase; \
1393     using name::BindToCPU_CUPMBase; \
1394     using name::Create_CUPM; \
1395     using name::DeviceArrayRead; \
1396     using name::DeviceArrayWrite; \
1397     using name::DeviceArrayReadWrite; \
1398     using name::HostArrayRead; \
1399     using name::HostArrayWrite; \
1400     using name::HostArrayReadWrite; \
1401     using name::ResetPreallocationCOO_CUPMBase; \
1402     using name::SetPreallocationCOO_CUPMBase; \
1403     /* blas interface */ \
1404     PETSC_CUPMBLAS_INHERIT_INTERFACE_TYPEDEFS_USING(cupmBlasInterface_t, Tp)
1405 
1406 } // namespace impl
1407 
1408 } // namespace cupm
1409 
1410 } // namespace vec
1411 
1412 } // namespace Petsc
1413 
1414 #endif // __cplusplus && PetscDefined(HAVE_DEVICE)
1415 
1416 #endif // PETSCVECCUPMIMPL_H
1417