1 #ifndef PETSCVECCUPMIMPL_H 2 #define PETSCVECCUPMIMPL_H 3 4 #include <petsc/private/vecimpl.h> 5 #include <../src/vec/vec/impls/dvecimpl.h> // for Vec_Seq 6 7 #if PetscDefined(HAVE_NVSHMEM) 8 PETSC_INTERN PetscErrorCode PetscNvshmemInitializeCheck(void); 9 PETSC_INTERN PetscErrorCode PetscNvshmemMalloc(size_t, void **); 10 PETSC_INTERN PetscErrorCode PetscNvshmemCalloc(size_t, void **); 11 PETSC_INTERN PetscErrorCode PetscNvshmemFree_Private(void *); 12 #define PetscNvshmemFree(ptr) ((PetscErrorCode)((ptr) && (PetscNvshmemFree_Private(ptr) || ((ptr) = PETSC_NULLPTR, PETSC_SUCCESS)))) 13 PETSC_INTERN PetscErrorCode PetscNvshmemSum(PetscInt, PetscScalar *, const PetscScalar *); 14 PETSC_INTERN PetscErrorCode PetscNvshmemMax(PetscInt, PetscReal *, const PetscReal *); 15 PETSC_INTERN PetscErrorCode VecNormAsync_NVSHMEM(Vec, NormType, PetscReal *); 16 PETSC_INTERN PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec); 17 #else 18 #define PetscNvshmemFree(ptr) PETSC_SUCCESS 19 #endif 20 21 #if defined(__cplusplus) && PetscDefined(HAVE_DEVICE) 22 #include <petsc/private/deviceimpl.h> 23 #include <petsc/private/cupmblasinterface.hpp> 24 25 #include <petsc/private/cpp/functional.hpp> 26 27 #include <limits> // std::numeric_limits 28 #include <cstring> // std::memset 29 30 namespace Petsc 31 { 32 33 namespace vec 34 { 35 36 namespace cupm 37 { 38 39 namespace impl 40 { 41 42 namespace 43 { 44 45 // ========================================================================================== 46 // UseCUPMHostAlloc_ 47 // 48 // A simple RAII helper for PetscMallocSet[CUDA|HIP]Host(). it exists because integrating the 49 // regular versions would be an enormous pain to square with the templated types... 
// ==========================================================================================
template <device::cupm::DeviceType T>
class UseCUPMHostAlloc_ : device::cupm::impl::Interface<T> {
public:
  PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(interface_type, T);

  // if the bool is true, swap PETSc's tracing allocators for the pinned cupm host
  // allocators; restored automatically on destruction
  UseCUPMHostAlloc_(bool) noexcept;
  ~UseCUPMHostAlloc_() noexcept;

  // returns the flag passed to the constructor, i.e. whether the allocators were swapped
  PETSC_NODISCARD bool value() const noexcept;

private:
  // would have loved to just do
  //
  // const auto oldmalloc = PetscTrMalloc;
  //
  // but in order to use auto the member needs to be static; in order to be static it must
  // also be constexpr -- which in turn requires an initializer (also implicitly required by
  // auto). But constexpr needs a constant expression initializer, so we can't initialize it
  // with global (mutable) variables...
#define DECLTYPE_AUTO(left, right) decltype(right) left = right
  const DECLTYPE_AUTO(oldmalloc_, PetscTrMalloc);
  const DECLTYPE_AUTO(oldfree_, PetscTrFree);
  const DECLTYPE_AUTO(oldrealloc_, PetscTrRealloc);
#undef DECLTYPE_AUTO
  bool v_;
};

// Constructor: captures the current (global) PETSc allocator function pointers in the
// members above, then -- only if useit is true -- replaces them with cupm pinned-host
// versions. NOTE: this mutates global state, so the helper is not safe to use from
// concurrent threads -- TODO confirm against PETSc's threading model
template <device::cupm::DeviceType T>
inline UseCUPMHostAlloc_<T>::UseCUPMHostAlloc_(bool useit) noexcept : v_(useit)
{
  PetscFunctionBegin;
  if (useit) {
    // all unused arguments are un-named, this saves having to add PETSC_UNUSED to them all
    PetscTrMalloc = [](std::size_t sz, PetscBool clear, int, const char *, const char *, void **ptr) {
      PetscFunctionBegin;
      PetscCallCUPM(cupmMallocHost(ptr, sz));
      // cupmMallocHost() has no calloc() analogue, so zero the buffer by hand if requested
      if (clear) std::memset(*ptr, 0, sz);
      PetscFunctionReturn(PETSC_SUCCESS);
    };
    PetscTrFree = [](void *ptr, int, const char *, const char *) {
      PetscFunctionBegin;
      PetscCallCUPM(cupmFreeHost(ptr));
      PetscFunctionReturn(PETSC_SUCCESS);
    };
    PetscTrRealloc = [](std::size_t, int, const char *, const char *, void **) {
      // REVIEW ME: can be implemented by malloc->copy->free?
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "%s has no realloc()", cupmName());
    };
  }
  PetscFunctionReturnVoid();
}

template <device::cupm::DeviceType T>
inline bool UseCUPMHostAlloc_<T>::value() const noexcept
{
  return v_;
}

// Destructor: if the allocators were swapped in the constructor, restore the previously
// captured function pointers
template <device::cupm::DeviceType T>
inline UseCUPMHostAlloc_<T>::~UseCUPMHostAlloc_() noexcept
{
  PetscFunctionBegin;
  if (value()) {
    PetscTrMalloc  = oldmalloc_;
    PetscTrFree    = oldfree_;
    PetscTrRealloc = oldrealloc_;
  }
  PetscFunctionReturnVoid();
}

// A callable that accepts any arguments, does nothing, and reports success. Used as the
// default "setup function" parameter for e.g. Duplicate_CUPMBase() below
struct no_op {
  template <typename... T>
  constexpr PetscErrorCode operator()(T &&...) const noexcept
  {
    return PETSC_SUCCESS;
  }
};

// A (device pointer, host pointer, size) triple describing one COO mapping array. The
// pointers are held by reference so the pair always reflects -- and can update -- the
// owning struct's members
template <typename T>
struct CooPair {
  using value_type = T;
  using size_type  = PetscCount;

  value_type *&device; // device-side array (reference to the owner's member)
  value_type *&host;   // host-side array (reference to the owner's member)
  size_type    size;   // number of entries
};

// helper to deduce the value_type when constructing a CooPair
template <typename U>
static constexpr CooPair<U> make_coo_pair(U *&device, U *&host, PetscCount size) noexcept
{
  return {device, host, size};
}

} // anonymous namespace

// forward declarations
template <device::cupm::DeviceType>
class VecSeq_CUPM;
template <device::cupm::DeviceType>
class VecMPI_CUPM;

// ==========================================================================================
// Vec_CUPMBase
//
// Base class for the VecSeq and VecMPI CUPM implementations. On top of the usual DeviceType
// template parameter it also uses CRTP to be able to use values/calls specific to either
// VecSeq or VecMPI. This is in effect "inside-out" polymorphism.
// ==========================================================================================
template <device::cupm::DeviceType T, typename Derived>
class Vec_CUPMBase : device::cupm::impl::BlasInterface<T> {
public:
  PETSC_CUPMBLAS_INHERIT_INTERFACE_TYPEDEFS_USING(cupmBlasInterface_t, T);
  // ==========================================================================================
  // Vec_CUPMBase::vector_array
  //
  // RAII versions of the get/restore array routines. Determines constness of the pointer type,
  // holds the pointer itself and provides the implicit conversion operator
  // ==========================================================================================
  template <PetscMemType, PetscMemoryAccessMode>
  class vector_array;

private:
  // A debug check to ensure that a given pointer-memtype pairing taken from user-land is
  // actually correct. Errors on mismatch. Compiled to a no-op (apart from the null check)
  // unless PETSc was configured with debugging
  static PetscErrorCode CheckPointerMatchesMemType_(const void *ptr, PetscMemType mtype) noexcept
  {
    PetscFunctionBegin;
    if (PetscDefined(USE_DEBUG) && ptr) {
      PetscMemType ptr_mtype;

      PetscCall(PetscCUPMGetMemType(ptr, &ptr_mtype));
      if (mtype == PETSC_MEMTYPE_HOST) {
        PetscCheck(PetscMemTypeHost(ptr_mtype), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p declared as %s does not match actual memtype %s", ptr, PetscMemTypeToString(mtype), PetscMemTypeToString(ptr_mtype));
      } else if (mtype == PETSC_MEMTYPE_DEVICE) {
        // generic "device" memory should only care if the actual memtype is also generically
        // "device"
        PetscCheck(PetscMemTypeDevice(ptr_mtype), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p declared as %s does not match actual memtype %s", ptr, PetscMemTypeToString(mtype), PetscMemTypeToString(ptr_mtype));
      } else {
        // any other declared memtype must match the detected one exactly
        PetscCheck(mtype == ptr_mtype, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p declared as %s does not match actual memtype %s", ptr, PetscMemTypeToString(mtype), PetscMemTypeToString(ptr_mtype));
      }
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // The final stop in the GetHandles_/GetFromHandles_ chain. This retrieves the various
  // compute handles and ensure the given PetscDeviceContext is of the right type
  static PetscErrorCode GetFromHandleDispatch_(PetscDeviceContext, cupmBlasHandle_t *, cupmStream_t *) noexcept;
  static PetscErrorCode GetHandleDispatch_(PetscDeviceContext *, cupmBlasHandle_t *, cupmStream_t *) noexcept;

protected:
  // Debug-print the vector's addresses, sizes, and offload state to the vector's
  // communicator; "message" is printed as a banner above the dump
  static PetscErrorCode VecView_Debug(Vec v, const char *message = "") noexcept
  {
    const auto   pobj  = PetscObjectCast(v);
    const auto   vimpl = VecIMPLCast(v);
    const auto   vcu   = VecCUPMCast(v);
    PetscMemType mtype;
    MPI_Comm     comm;

    PetscFunctionBegin;
    PetscValidPointer(vimpl, 1);
    PetscValidPointer(vcu, 1);
    PetscCall(PetscObjectGetComm(pobj, &comm));
    PetscCall(PetscPrintf(comm, "---------- %s ----------\n", message));
    PetscCall(PetscObjectPrintClassNamePrefixType(pobj, PETSC_VIEWER_STDOUT_(comm)));
    PetscCall(PetscPrintf(comm, "Address:             %p\n", v));
    PetscCall(PetscPrintf(comm, "Size:                %" PetscInt_FMT "\n", v->map->n));
    PetscCall(PetscPrintf(comm, "Offload mask:        %s\n", PetscOffloadMaskToString(v->offloadmask)));
    PetscCall(PetscPrintf(comm, "Host ptr:            %p\n", vimpl->array));
    PetscCall(PetscPrintf(comm, "Device ptr:          %p\n", vcu->array_d));
    PetscCall(PetscPrintf(comm, "Device alloced ptr:  %p\n", vcu->array_allocated_d));
    PetscCall(PetscCUPMGetMemType(vcu->array_d, &mtype));
    PetscCall(PetscPrintf(comm, "dptr is device mem?  %s\n", PetscBools[static_cast<PetscBool>(PetscMemTypeDevice(mtype))]));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Helper routines to retrieve various combinations of handles. The first set (GetHandles_)
  // gets a PetscDeviceContext along with it, while the second set (GetHandlesFrom_) assumes
  // you've gotten the PetscDeviceContext already, and retrieves the handles from it. All of
  // them check that the PetscDeviceContext is of the appropriate type
  static PetscErrorCode GetHandles_(PetscDeviceContext *, cupmBlasHandle_t * = nullptr, cupmStream_t * = nullptr) noexcept;
  static PetscErrorCode GetHandles_(PetscDeviceContext *, cupmStream_t *) noexcept;
  static PetscErrorCode GetHandles_(cupmStream_t *) noexcept;
  static PetscErrorCode GetHandles_(cupmBlasHandle_t *) noexcept;

  static PetscErrorCode GetHandlesFrom_(PetscDeviceContext, cupmBlasHandle_t *, cupmStream_t * = nullptr) noexcept;
  static PetscErrorCode GetHandlesFrom_(PetscDeviceContext, cupmStream_t *) noexcept;

  // Delete the allocated device array if required and replace it with the given array
  static PetscErrorCode ResetAllocatedDevicePtr_(PetscDeviceContext, Vec, PetscScalar * = nullptr) noexcept;
  // Check either the host or device impl pointer is allocated and allocate it if
  // isn't. CastFunctionType casts the Vec to the required type and returns the pointer
  template <typename CastFunctionType>
  static PetscErrorCode VecAllocateCheck_(Vec, void *&, CastFunctionType &&) noexcept;
  // Check the CUPM part (v->spptr) is allocated, otherwise allocate it
  static PetscErrorCode VecCUPMAllocateCheck_(Vec) noexcept;
  // Check the Host part (v->data) is allocated, otherwise allocate it
  static PetscErrorCode VecIMPLAllocateCheck_(Vec) noexcept;
  // Check the Host array is allocated, otherwise allocate it
  static PetscErrorCode HostAllocateCheck_(PetscDeviceContext, Vec) noexcept;
  // Check the CUPM array is allocated, otherwise allocate it
  static PetscErrorCode DeviceAllocateCheck_(PetscDeviceContext, Vec) noexcept;
  // Copy HTOD, allocating device if necessary
  static PetscErrorCode CopyToDevice_(PetscDeviceContext, Vec, bool = false) noexcept;
  // Copy DTOH, allocating host if necessary
  static PetscErrorCode CopyToHost_(PetscDeviceContext, Vec, bool = false) noexcept;

public:
  // The device-side payload stored in v->spptr
  struct Vec_CUPM {
    PetscScalar *array_d;           // gpu data
    PetscScalar *array_allocated_d; // does PETSc own the array ptr?
    PetscBool    nvshmem;           // is array allocated in nvshmem? It is used to allocate
                                    // Mvctx->lvec in nvshmem

    // COO stuff
    PetscCount *jmap1_d; // [m+1]: i-th entry of the vector has jmap1[i+1]-jmap1[i] repeats
                         // in COO arrays
    PetscCount *perm1_d; // [tot1]: permutation array for local entries
    PetscCount *imap2_d; // [nnz2]: i-th unique entry in recvbuf is imap2[i]-th entry in
                         // the vector
    PetscCount *jmap2_d; // [nnz2+1]
    PetscCount *perm2_d; // [recvlen]
    PetscCount *Cperm_d; // [sendlen]: permutation array to fill sendbuf[]. 'C' for
                         // communication

    // Buffers for remote values in VecSetValuesCOO()
    PetscScalar *sendbuf_d;
    PetscScalar *recvbuf_d;
  };

  // Cast the Vec to its Vec_CUPM struct, i.e. return the result of (Vec_CUPM *)v->spptr
  PETSC_NODISCARD static Vec_CUPM *VecCUPMCast(Vec) noexcept;
  // Cast the Vec to its host struct, i.e. return the result of (Vec_Seq *)v->data
  template <typename U = Derived>
  PETSC_NODISCARD static constexpr auto VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v));
  // Get the PetscLogEvents for HTOD and DTOH
  PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyToGPU() noexcept;
  PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyFromGPU() noexcept;
  // Get the VecTypes
  PETSC_NODISCARD static constexpr VecType VECSEQCUPM() noexcept;
  PETSC_NODISCARD static constexpr VecType VECMPICUPM() noexcept;
  // Get the VecType of the calling vector
  template <typename U = Derived>
  PETSC_NODISCARD static constexpr VecType VECIMPLCUPM() noexcept;
  // Get the default device PetscRandomType for this backend
  PETSC_NODISCARD static constexpr PetscRandomType PETSCDEVICERAND() noexcept;

  // Call the host destroy function, i.e. VecDestroy_Seq()
  static PetscErrorCode VecDestroy_IMPL(Vec) noexcept;
  // Call the host reset function, i.e. VecResetArray_Seq()
  static PetscErrorCode VecResetArray_IMPL(Vec) noexcept;
  // ... you get the idea
  static PetscErrorCode VecPlaceArray_IMPL(Vec, const PetscScalar *) noexcept;
  // Call the host creation function, i.e. VecCreate_Seq(), and also initialize the CUPM part
  // along with it if needed
  static PetscErrorCode VecCreate_IMPL_Private(Vec, PetscBool *, PetscInt = 0, PetscScalar * = nullptr) noexcept;

  // Shorthand for creating vector_array's. Need functions to create them, otherwise using them
  // as an unnamed temporary leads to most vexing parse
  PETSC_NODISCARD static auto DeviceArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>{dctx, v});
  PETSC_NODISCARD static auto DeviceArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
  PETSC_NODISCARD static auto DeviceArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});
  PETSC_NODISCARD static auto HostArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>{dctx, v});
  PETSC_NODISCARD static auto HostArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
  PETSC_NODISCARD static auto HostArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});

  // disallow implicit conversion
  template <typename U>
  PETSC_NODISCARD static UseCUPMHostAlloc_<T> UseCUPMHostAlloc(U) noexcept = delete;
  // utility for using cupmHostAlloc()
  PETSC_NODISCARD static UseCUPMHostAlloc_<T> UseCUPMHostAlloc(bool) noexcept;
  PETSC_NODISCARD static UseCUPMHostAlloc_<T> UseCUPMHostAlloc(PetscBool) noexcept;

  // ops-table functions
  static PetscErrorCode create(Vec) noexcept;
  static PetscErrorCode destroy(Vec) noexcept;
  template <PetscMemType, PetscMemoryAccessMode, bool = false>
  static PetscErrorCode getarray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
  template <PetscMemType, PetscMemoryAccessMode, bool = false>
  static PetscErrorCode getarray(Vec, PetscScalar **) noexcept;
  template <PetscMemType, PetscMemoryAccessMode>
  static PetscErrorCode restorearray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
  template <PetscMemType, PetscMemoryAccessMode>
  static PetscErrorCode restorearray(Vec, PetscScalar **) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode getarrayandmemtype(Vec, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode getarrayandmemtype(Vec, PetscScalar **, PetscMemType *) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode restorearrayandmemtype(Vec, PetscScalar **, PetscDeviceContext) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode restorearrayandmemtype(Vec, PetscScalar **) noexcept;
  template <PetscMemType>
  static PetscErrorCode replacearray(Vec, const PetscScalar *) noexcept;
  template <PetscMemType>
  static PetscErrorCode resetarray(Vec) noexcept;
  template <PetscMemType>
  static PetscErrorCode placearray(Vec, const PetscScalar *) noexcept;

  // common ops shared between Seq and MPI
  static PetscErrorCode Create_CUPM(Vec) noexcept;
  static PetscErrorCode Create_CUPMBase(MPI_Comm, PetscInt, PetscInt, PetscInt, Vec *, PetscBool, PetscLayout /*reference*/ = nullptr) noexcept;
  static PetscErrorCode Initialize_CUPMBase(Vec, PetscBool, PetscScalar *, PetscScalar *, PetscDeviceContext) noexcept;
  template <typename SetupFunctionT = no_op>
  static PetscErrorCode Duplicate_CUPMBase(Vec, Vec *, PetscDeviceContext, SetupFunctionT && = SetupFunctionT{}) noexcept;
  static PetscErrorCode BindToCPU_CUPMBase(Vec, PetscBool, PetscDeviceContext) noexcept;
  static PetscErrorCode GetArrays_CUPMBase(Vec, const PetscScalar **, const PetscScalar **, PetscOffloadMask *, PetscDeviceContext) noexcept;
  static PetscErrorCode ResetPreallocationCOO_CUPMBase(Vec, PetscDeviceContext) noexcept;
  template <std::size_t NCount = 0, std::size_t NScal = 0>
  static PetscErrorCode SetPreallocationCOO_CUPMBase(Vec, PetscCount, const PetscInt[], PetscDeviceContext, const std::array<CooPair<PetscCount>, NCount> & = {}, const std::array<CooPair<PetscScalar>, NScal> & = {}) noexcept;
};

// ==========================================================================================
// Vec_CUPMBase::vector_array
//
// RAII versions of the get/restore array routines. Determines constness of the pointer type,
// holds the pointer itself and provides the implicit conversion operator.
//
// On construction this calls the moral equivalent of Vec[CUPM]GetArray[Read|Write]()
// (depending on PetscMemoryAccessMode) and on destruction automatically restores the array
// for you
// ==========================================================================================
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
class Vec_CUPMBase<T, D>::vector_array {
public:
  static const auto memory_type = MT;
  static const auto access_type = MA;

  using value_type        = PetscScalar;
  using pointer_type      = value_type *;
  using cupm_pointer_type = cupmScalar_t *;

  vector_array(PetscDeviceContext, Vec) noexcept;
  ~vector_array() noexcept;

  // move-only semantics would be more natural here but copies of the raw members are what
  // the defaults give us; note the restore happens in the destructor
  constexpr vector_array(vector_array &&) noexcept = default;
  constexpr vector_array &operator=(vector_array &&) noexcept = default;

  pointer_type      data() const noexcept;
  cupm_pointer_type cupmdata() const noexcept;

  operator pointer_type() const noexcept;
  // in case pointer_type == cupmscalar_pointer_type we don't want this overload to exist, so
  // we make a dummy template parameter to allow SFINAE to nix it for us
  template <typename U = pointer_type, typename = util::enable_if_t<!std::is_same<U, cupm_pointer_type>::value>>
  operator cupm_pointer_type() const noexcept;

private:
  pointer_type       ptr_  = nullptr; // the gotten array, restored on destruction
  PetscDeviceContext dctx_ = nullptr; // context used for both get and restore
  Vec                v_    = nullptr; // the vector the array belongs to
};

// ==========================================================================================
// Vec_CUPMBase::vector_array - Static Variables
// ==========================================================================================

// out-of-class definitions required pre-C++17 for odr-used static members
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
const PetscMemType Vec_CUPMBase<T, D>::vector_array<MT, MA>::memory_type;

template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
const PetscMemoryAccessMode Vec_CUPMBase<T, D>::vector_array<MT, MA>::access_type;

// ==========================================================================================
// Vec_CUPMBase::vector_array - Public API
// ==========================================================================================

// Constructor: gets the array (aborting on error, since a constructor cannot return an
// error code)
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::vector_array(PetscDeviceContext dctx, Vec v) noexcept : dctx_(dctx), v_(v)
{
  PetscFunctionBegin;
  PetscCallAbort(PETSC_COMM_SELF, getarray<MT, MA, true>(v, &ptr_, dctx));
  PetscFunctionReturnVoid();
}

// Destructor: restores the array, again aborting on error
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::~vector_array() noexcept
{
  PetscFunctionBegin;
  PetscCallAbort(PETSC_COMM_SELF, restorearray<MT, MA>(v_, &ptr_, dctx_));
  PetscFunctionReturnVoid();
}

template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
inline typename Vec_CUPMBase<T, D>::template vector_array<MT, MA>::pointer_type Vec_CUPMBase<T, D>::vector_array<MT, MA>::data() const noexcept
{
  return ptr_;
}

// same as data() but cast to the native cupm scalar type
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
inline typename Vec_CUPMBase<T, D>::template vector_array<MT, MA>::cupm_pointer_type Vec_CUPMBase<T, D>::vector_array<MT, MA>::cupmdata() const noexcept
{
  return cupmScalarPtrCast(data());
}

template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::operator pointer_type() const noexcept
{
  return data();
}

// in case pointer_type == cupmscalar_pointer_type we don't want this overload to exist, so
// we make a dummy template parameter to allow SFINAE to nix it for us
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
template <typename U, typename>
inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::operator cupm_pointer_type() const noexcept
{
  return cupmdata();
}

// ==========================================================================================
// Vec_CUPMBase - Private API
// ==========================================================================================

// Extract the requested handles from an existing PetscDeviceContext, debug-checking that the
// context's device type is compatible with this CUPM backend. Either handle pointer may be
// null, in which case it is skipped
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetFromHandleDispatch_(PetscDeviceContext dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
{
  PetscFunctionBegin;
  PetscValidDeviceContext(dctx, 1);
  if (handle) PetscValidPointer(handle, 2);
  if (stream) PetscValidPointer(stream, 3);
  if (PetscDefined(USE_DEBUG)) {
    PetscDeviceType dtype;

    PetscCall(PetscDeviceContextGetDeviceType(dctx, &dtype));
    PetscCheckCompatibleDeviceTypes(PETSC_DEVICE_CUPM(), -1, dtype, 1);
  }
  if (handle) PetscCall(PetscDeviceContextGetBLASHandle_Internal(dctx, handle));
  if (stream) PetscCall(PetscDeviceContextGetStreamHandle_Internal(dctx, stream));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Like GetFromHandleDispatch_() but first retrieves the current PetscDeviceContext,
// optionally returning it through dctx
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandleDispatch_(PetscDeviceContext *dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
{
  PetscDeviceContext dctx_loc = nullptr;

  PetscFunctionBegin;
  // silence uninitialized variable warnings
  if (dctx) *dctx = nullptr;
  PetscCall(PetscDeviceContextGetCurrentContext(&dctx_loc));
  PetscCall(GetFromHandleDispatch_(dctx_loc, handle, stream));
  // only hand the context back once the dispatch has succeeded
  if (dctx) *dctx = dctx_loc;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================
// Vec_CUPMBase - Protected API
// ==========================================================================================

template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(PetscDeviceContext *dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
{
  return GetHandleDispatch_(dctx, handle, stream);
}

template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(PetscDeviceContext *dctx, cupmStream_t *stream) noexcept
{
  return GetHandles_(dctx, nullptr, stream);
}

template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(cupmStream_t *stream) noexcept
{
  return GetHandles_(nullptr, stream);
}

template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(cupmBlasHandle_t *handle) noexcept
{
  return GetHandles_(nullptr, handle);
}

template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandlesFrom_(PetscDeviceContext dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
{
  return GetFromHandleDispatch_(dctx, handle, stream);
}

template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandlesFrom_(PetscDeviceContext dctx, cupmStream_t *stream) noexcept
{
  return GetHandlesFrom_(dctx, nullptr, stream);
}

// Free the owned device allocation (if any) -- via nvshmem when the array lives there,
// otherwise asynchronously on the context's stream -- then install new_value as the owned
// pointer
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::ResetAllocatedDevicePtr_(PetscDeviceContext dctx, Vec v, PetscScalar *new_value) noexcept
{
  auto &device_array = VecCUPMCast(v)->array_allocated_d;

  PetscFunctionBegin;
  if (device_array) {
    if (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) {
      PetscCall(PetscNvshmemFree(device_array));
    } else {
      cupmStream_t stream;

      PetscCall(GetHandlesFrom_(dctx, &stream));
      PetscCallCUPM(cupmFreeAsync(device_array, stream));
    }
  }
  device_array = new_value;
  PetscFunctionReturn(PETSC_SUCCESS);
}

namespace
{

// Query the options database (-vec_pinned_memory_min) for the minimum allocation size at
// which pinned host memory should be used for this vector
inline PetscErrorCode VecCUPMCheckMinimumPinnedMemory_Internal(Vec v) noexcept
{
  auto      mem = static_cast<PetscInt>(v->minimum_bytes_pinned_memory);
  PetscBool flg;

  PetscFunctionBegin;
  PetscObjectOptionsBegin(PetscObjectCast(v));
  PetscCall(PetscOptionsRangeInt("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", mem, &mem, &flg, 0, std::numeric_limits<decltype(mem)>::max()));
  if (flg) v->minimum_bytes_pinned_memory = mem;
  PetscOptionsEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

} // anonymous namespace

// Generic "allocate the impl struct if not already there" helper; dest is either v->data or
// v->spptr. cast() is only used to deduce the concrete struct type for PetscNew()
template <device::cupm::DeviceType T, typename D>
template <typename CastFunctionType>
inline PetscErrorCode Vec_CUPMBase<T, D>::VecAllocateCheck_(Vec v, void *&dest, CastFunctionType &&cast) noexcept
{
  PetscFunctionBegin;
  if (PetscLikely(dest)) PetscFunctionReturn(PETSC_SUCCESS);
  // do the check here so we don't have to do it in every function
  PetscCall(checkCupmBlasIntCast(v->map->n));
  {
    auto impl = cast(v);

    PetscCall(PetscNew(&impl));
    dest = impl;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::VecIMPLAllocateCheck_(Vec v) noexcept
{
  PetscFunctionBegin;
  PetscCall(VecAllocateCheck_(v, v->data, VecIMPLCast<D>));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// allocate the Vec_CUPM struct. this is normally done through DeviceAllocateCheck_(), but in
// certain circumstances (such as when the user places the device array) we do not want to do
// the full DeviceAllocateCheck_() as it also allocates the array
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::VecCUPMAllocateCheck_(Vec v) noexcept
{
  PetscFunctionBegin;
  PetscCall(VecAllocateCheck_(v, v->spptr, VecCUPMCast));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Ensure the host array exists, allocating it (possibly pinned, depending on size vs the
// -vec_pinned_memory_min threshold) if it does not
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::HostAllocateCheck_(PetscDeviceContext, Vec v) noexcept
{
  PetscFunctionBegin;
  PetscCall(VecIMPLAllocateCheck_(v));
  if (auto &alloc = VecIMPLCast(v)->array_allocated) PetscFunctionReturn(PETSC_SUCCESS);
  else {
    PetscCall(VecCUPMCheckMinimumPinnedMemory_Internal(v));
    {
      const auto n = v->map->n;
      // swap in the pinned host allocator only while PetscMalloc1() runs, and only if the
      // allocation is large enough to warrant it
      const auto useit = UseCUPMHostAlloc((n * sizeof(*alloc)) > v->minimum_bytes_pinned_memory);

      v->pinned_memory = static_cast<decltype(v->pinned_memory)>(useit.value());
      PetscCall(PetscMalloc1(n, &alloc));
    }
    if (!VecIMPLCast(v)->array) VecIMPLCast(v)->array = alloc;
    if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) v->offloadmask = PETSC_OFFLOAD_CPU;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Ensure the device array exists, allocating it asynchronously on the context's stream if it
// does not, and initializing the offload mask for a freshly unallocated vector
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::DeviceAllocateCheck_(PetscDeviceContext dctx, Vec v) noexcept
{
  PetscFunctionBegin;
  PetscCall(VecCUPMAllocateCheck_(v));
  if (auto &alloc = VecCUPMCast(v)->array_d) PetscFunctionReturn(PETSC_SUCCESS);
  else {
    const auto   n                 = v->map->n;
    auto        &array_allocated_d = VecCUPMCast(v)->array_allocated_d;
    cupmStream_t stream;

    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCall(PetscCUPMMallocAsync(&array_allocated_d, n, stream));
    alloc = array_allocated_d;
    if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
      const auto vimp = VecIMPLCast(v);
      // if a host array already exists the authoritative data lives on the CPU, otherwise
      // the fresh device array is it
      v->offloadmask = (vimp && vimp->array) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Copy host -> device (async on the context's stream) if the CPU holds the only up-to-date
// copy; allocates the device array if necessary
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToDevice_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
{
  PetscFunctionBegin;
  PetscCall(DeviceAllocateCheck_(dctx, v));
  if (v->offloadmask == PETSC_OFFLOAD_CPU) {
    cupmStream_t stream;

    // mark both sides valid before the (possibly asynchronous) copy is enqueued
    v->offloadmask = PETSC_OFFLOAD_BOTH;
    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCall(PetscLogEventBegin(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
    PetscCall(PetscCUPMMemcpyAsync(VecCUPMCast(v)->array_d, VecIMPLCast(v)->array, v->map->n, cupmMemcpyHostToDevice, stream, forceasync));
    PetscCall(PetscLogEventEnd(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Copy device -> host (async on the context's stream) if the GPU holds the only up-to-date
// copy; allocates the host array if necessary
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToHost_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
{
  PetscFunctionBegin;
  PetscCall(HostAllocateCheck_(dctx, v));
  if (v->offloadmask == PETSC_OFFLOAD_GPU) {
    cupmStream_t stream;

    // mark both sides valid before the (possibly asynchronous) copy is enqueued
    v->offloadmask = PETSC_OFFLOAD_BOTH;
    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCall(PetscLogEventBegin(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
    PetscCall(PetscCUPMMemcpyAsync(VecIMPLCast(v)->array, VecCUPMCast(v)->array_d, v->map->n, cupmMemcpyDeviceToHost, stream, forceasync));
    PetscCall(PetscLogEventEnd(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================
// Vec_CUPMBase - Public API
// ==========================================================================================

template <device::cupm::DeviceType T, typename D>
inline typename Vec_CUPMBase<T, D>::Vec_CUPM *Vec_CUPMBase<T, D>::VecCUPMCast(Vec v) noexcept
{
  return static_cast<Vec_CUPM *>(v->spptr);
}

// This is a trick to get around the fact that in CRTP the derived class is not yet fully
// defined because Base<Derived> must necessarily be instantiated before Derived is
// complete. By using a dummy template parameter we make the type "dependent" and so will
// only be determined when the derived class is instantiated (and therefore fully defined)
template <device::cupm::DeviceType T, typename D>
template <typename U>
inline constexpr auto Vec_CUPMBase<T, D>::VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v))
{
  return U::VecIMPLCast_(v);
}

template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::VecDestroy_IMPL(Vec v) noexcept
{
  return D::VecDestroy_IMPL_(v);
}

template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::VecResetArray_IMPL(Vec v) noexcept
{
  return D::VecResetArray_IMPL_(v);
}

template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::VecPlaceArray_IMPL(Vec v, const PetscScalar *a) noexcept
{
  return D::VecPlaceArray_IMPL_(v, a);
}

template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::VecCreate_IMPL_Private(Vec v, PetscBool *alloc_missing, PetscInt nghost, PetscScalar *host_array) noexcept
{
  return D::VecCreate_IMPL_Private_(v, alloc_missing, nghost, host_array);
}

template <device::cupm::DeviceType T, typename D>
inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyToGPU() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyToGPU : VEC_HIPCopyToGPU;
}

template <device::cupm::DeviceType T, typename D>
inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyFromGPU() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyFromGPU : VEC_HIPCopyFromGPU;
}

template <device::cupm::DeviceType T, typename D>
inline constexpr VecType Vec_CUPMBase<T, D>::VECSEQCUPM() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? VECSEQCUDA : VECSEQHIP;
}

template <device::cupm::DeviceType T, typename D>
inline constexpr VecType Vec_CUPMBase<T, D>::VECMPICUPM() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? VECMPICUDA : VECMPIHIP;
}

template <device::cupm::DeviceType T, typename D>
template <typename U>
inline constexpr VecType Vec_CUPMBase<T, D>::VECIMPLCUPM() noexcept
{
  return U::VECIMPLCUPM_();
}

template <device::cupm::DeviceType T, typename D>
inline constexpr PetscRandomType Vec_CUPMBase<T, D>::PETSCDEVICERAND() noexcept
{
  // REVIEW ME: HIP default rng?
  return T == device::cupm::DeviceType::CUDA ? PETSCCURAND : PETSCRANDER48;
}

// utility for using cupmHostAlloc()
template <device::cupm::DeviceType T, typename D>
inline UseCUPMHostAlloc_<T> Vec_CUPMBase<T, D>::UseCUPMHostAlloc(bool b) noexcept
{
  return {b};
}

template <device::cupm::DeviceType T, typename D>
inline UseCUPMHostAlloc_<T> Vec_CUPMBase<T, D>::UseCUPMHostAlloc(PetscBool b) noexcept
{
  return UseCUPMHostAlloc(static_cast<bool>(b));
}

// private version that takes a PetscDeviceContext, called by the public variant
template <device::cupm::DeviceType T, typename D>
template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
inline PetscErrorCode Vec_CUPMBase<T, D>::getarray(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
{
  constexpr auto hostmem     = PetscMemTypeHost(mtype);
  const auto     oldmask     = v->offloadmask;
  auto          &mask        = v->offloadmask;
  auto           should_sync = false;

  PetscFunctionBegin;
  static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
  PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
  if (PetscMemoryAccessRead(access)) {
    // READ or READ_WRITE
    if (((oldmask == PETSC_OFFLOAD_GPU) && hostmem) || ((oldmask == PETSC_OFFLOAD_CPU) && !hostmem)) {
      // if we move the data we should set the flag to synchronize later on
      should_sync = true;
    }
    PetscCall((hostmem ? CopyToHost_ : CopyToDevice_)(dctx, v, force));
  } else {
    // WRITE only
    PetscCall((hostmem ? HostAllocateCheck_ : DeviceAllocateCheck_)(dctx, v));
  }
  *a = hostmem ?
VecIMPLCast(v)->array : VecCUPMCast(v)->array_d; 817 // if unallocated previously we should zero things out if we intend to read 818 if (PetscMemoryAccessRead(access) && (oldmask == PETSC_OFFLOAD_UNALLOCATED)) { 819 const auto n = v->map->n; 820 821 if (hostmem) { 822 PetscCall(PetscArrayzero(*a, n)); 823 } else { 824 cupmStream_t stream; 825 826 PetscCall(GetHandlesFrom_(dctx, &stream)); 827 PetscCall(PetscCUPMMemsetAsync(*a, 0, n, stream, force)); 828 should_sync = true; 829 } 830 } 831 // update the offloadmask if we intend to write, since we assume immediately modified 832 if (PetscMemoryAccessWrite(access)) { 833 PetscCall(VecSetErrorIfLocked(v, 1)); 834 // REVIEW ME: this should probably also call PetscObjectStateIncrease() since we assume it 835 // is immediately modified 836 mask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 837 } 838 // if we are a globally blocking stream and we have MOVED data then we should synchronize, 839 // since even doing async calls on the NULL stream is not synchronous 840 if (!force && should_sync) PetscCall(PetscDeviceContextSynchronize(dctx)); 841 PetscFunctionReturn(PETSC_SUCCESS); 842 } 843 844 // v->ops->getarray[read|write] or VecCUPMGetArray[Read|Write]() 845 template <device::cupm::DeviceType T, typename D> 846 template <PetscMemType mtype, PetscMemoryAccessMode access, bool force> 847 inline PetscErrorCode Vec_CUPMBase<T, D>::getarray(Vec v, PetscScalar **a) noexcept 848 { 849 PetscDeviceContext dctx; 850 851 PetscFunctionBegin; 852 PetscCall(GetHandles_(&dctx)); 853 PetscCall(getarray<mtype, access, force>(v, a, dctx)); 854 PetscFunctionReturn(PETSC_SUCCESS); 855 } 856 857 // private version that takes a PetscDeviceContext, called by the public variant 858 template <device::cupm::DeviceType T, typename D> 859 template <PetscMemType mtype, PetscMemoryAccessMode access> 860 inline PetscErrorCode Vec_CUPMBase<T, D>::restorearray(Vec v, PetscScalar **a, PetscDeviceContext) noexcept 861 { 862 PetscFunctionBegin; 863 
static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), ""); 864 PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM()); 865 if (PetscMemoryAccessWrite(access)) { 866 // WRITE or READ_WRITE 867 PetscCall(PetscObjectStateIncrease(PetscObjectCast(v))); 868 v->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 869 } 870 if (a) { 871 PetscCall(CheckPointerMatchesMemType_(*a, mtype)); 872 *a = nullptr; 873 } 874 PetscFunctionReturn(PETSC_SUCCESS); 875 } 876 877 // v->ops->restorearray[read|write] or VecCUPMRestoreArray[Read|Write]() 878 template <device::cupm::DeviceType T, typename D> 879 template <PetscMemType mtype, PetscMemoryAccessMode access> 880 inline PetscErrorCode Vec_CUPMBase<T, D>::restorearray(Vec v, PetscScalar **a) noexcept 881 { 882 PetscDeviceContext dctx; 883 884 PetscFunctionBegin; 885 PetscCall(GetHandles_(&dctx)); 886 PetscCall(restorearray<mtype, access>(v, a, dctx)); 887 PetscFunctionReturn(PETSC_SUCCESS); 888 } 889 890 template <device::cupm::DeviceType T, typename D> 891 template <PetscMemoryAccessMode access> 892 inline PetscErrorCode Vec_CUPMBase<T, D>::getarrayandmemtype(Vec v, PetscScalar **a, PetscMemType *mtype, PetscDeviceContext dctx) noexcept 893 { 894 PetscFunctionBegin; 895 PetscCall(getarray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx)); 896 if (mtype) *mtype = (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) ? 
PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUPM(); 897 PetscFunctionReturn(PETSC_SUCCESS); 898 } 899 900 // v->ops->getarrayandmemtype 901 template <device::cupm::DeviceType T, typename D> 902 template <PetscMemoryAccessMode access> 903 inline PetscErrorCode Vec_CUPMBase<T, D>::getarrayandmemtype(Vec v, PetscScalar **a, PetscMemType *mtype) noexcept 904 { 905 PetscDeviceContext dctx; 906 907 PetscFunctionBegin; 908 PetscCall(GetHandles_(&dctx)); 909 PetscCall(getarrayandmemtype<access>(v, a, mtype, dctx)); 910 PetscFunctionReturn(PETSC_SUCCESS); 911 } 912 913 template <device::cupm::DeviceType T, typename D> 914 template <PetscMemoryAccessMode access> 915 inline PetscErrorCode Vec_CUPMBase<T, D>::restorearrayandmemtype(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept 916 { 917 PetscFunctionBegin; 918 PetscCall(restorearray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx)); 919 PetscFunctionReturn(PETSC_SUCCESS); 920 } 921 922 // v->ops->restorearrayandmemtype 923 template <device::cupm::DeviceType T, typename D> 924 template <PetscMemoryAccessMode access> 925 inline PetscErrorCode Vec_CUPMBase<T, D>::restorearrayandmemtype(Vec v, PetscScalar **a) noexcept 926 { 927 PetscDeviceContext dctx; 928 929 PetscFunctionBegin; 930 PetscCall(GetHandles_(&dctx)); 931 PetscCall(restorearrayandmemtype<access>(v, a, dctx)); 932 PetscFunctionReturn(PETSC_SUCCESS); 933 } 934 935 // v->ops->placearray or VecCUPMPlaceArray() 936 template <device::cupm::DeviceType T, typename D> 937 template <PetscMemType mtype> 938 inline PetscErrorCode Vec_CUPMBase<T, D>::placearray(Vec v, const PetscScalar *a) noexcept 939 { 940 PetscDeviceContext dctx; 941 942 PetscFunctionBegin; 943 static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), ""); 944 PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM()); 945 PetscCall(CheckPointerMatchesMemType_(a, mtype)); 946 PetscCall(GetHandles_(&dctx)); 947 if (PetscMemTypeHost(mtype)) { 948 PetscCall(CopyToHost_(dctx, v)); 949 
PetscCall(VecPlaceArray_IMPL(v, a)); 950 v->offloadmask = PETSC_OFFLOAD_CPU; 951 } else { 952 PetscCall(VecIMPLAllocateCheck_(v)); 953 { 954 auto &backup_array = VecIMPLCast(v)->unplacedarray; 955 956 PetscCheck(!backup_array, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "VecPlaceArray() was already called on this vector, without a call to VecResetArray()"); 957 PetscCall(CopyToDevice_(dctx, v)); 958 PetscCall(PetscObjectStateIncrease(PetscObjectCast(v))); 959 backup_array = util::exchange(VecCUPMCast(v)->array_d, const_cast<PetscScalar *>(a)); 960 // only update the offload mask if we actually assign a pointer 961 if (a) v->offloadmask = PETSC_OFFLOAD_GPU; 962 } 963 } 964 PetscFunctionReturn(PETSC_SUCCESS); 965 } 966 967 // v->ops->replacearray or VecCUPMReplaceArray() 968 template <device::cupm::DeviceType T, typename D> 969 template <PetscMemType mtype> 970 inline PetscErrorCode Vec_CUPMBase<T, D>::replacearray(Vec v, const PetscScalar *a) noexcept 971 { 972 const auto aptr = const_cast<PetscScalar *>(a); 973 PetscDeviceContext dctx; 974 975 PetscFunctionBegin; 976 static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), ""); 977 PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM()); 978 PetscCall(CheckPointerMatchesMemType_(a, mtype)); 979 PetscCall(GetHandles_(&dctx)); 980 if (PetscMemTypeHost(mtype)) { 981 PetscCall(VecIMPLAllocateCheck_(v)); 982 { 983 const auto vimpl = VecIMPLCast(v); 984 auto &host_array = vimpl->array_allocated; 985 986 // make sure the users array has the latest values. 987 // REVIEW ME: why? 
we're about to free it 988 if (host_array != vimpl->array) PetscCall(CopyToHost_(dctx, v)); 989 if (host_array) { 990 const auto useit = UseCUPMHostAlloc(v->pinned_memory); 991 992 PetscCall(PetscFree(host_array)); 993 } 994 host_array = aptr; 995 vimpl->array = host_array; 996 v->pinned_memory = PETSC_FALSE; // REVIEW ME: we can determine this 997 v->offloadmask = PETSC_OFFLOAD_CPU; 998 } 999 } else { 1000 PetscCall(VecCUPMAllocateCheck_(v)); 1001 { 1002 const auto vcu = VecCUPMCast(v); 1003 1004 PetscCall(ResetAllocatedDevicePtr_(dctx, v, aptr)); 1005 // don't update the offloadmask if placed pointer is NULL 1006 vcu->array_d = vcu->array_allocated_d /* = aptr */; 1007 if (aptr) v->offloadmask = PETSC_OFFLOAD_GPU; 1008 } 1009 } 1010 PetscCall(PetscObjectStateIncrease(PetscObjectCast(v))); 1011 PetscFunctionReturn(PETSC_SUCCESS); 1012 } 1013 1014 // v->ops->resetarray or VecCUPMResetArray() 1015 template <device::cupm::DeviceType T, typename D> 1016 template <PetscMemType mtype> 1017 inline PetscErrorCode Vec_CUPMBase<T, D>::resetarray(Vec v) noexcept 1018 { 1019 PetscDeviceContext dctx; 1020 1021 PetscFunctionBegin; 1022 static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), ""); 1023 PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM()); 1024 PetscCall(GetHandles_(&dctx)); 1025 // REVIEW ME: 1026 // this is wildly inefficient but must be done if we assume that the placed array must have 1027 // correct values 1028 if (PetscMemTypeHost(mtype)) { 1029 PetscCall(CopyToHost_(dctx, v)); 1030 PetscCall(VecResetArray_IMPL(v)); 1031 v->offloadmask = PETSC_OFFLOAD_CPU; 1032 } else { 1033 PetscCall(VecIMPLAllocateCheck_(v)); 1034 PetscCall(VecCUPMAllocateCheck_(v)); 1035 { 1036 const auto vcu = VecCUPMCast(v); 1037 const auto vimpl = VecIMPLCast(v); 1038 auto &host_array = vimpl->unplacedarray; 1039 1040 PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_DEVICE)); 1041 PetscCall(CopyToDevice_(dctx, v)); 1042 
PetscCall(PetscObjectStateIncrease(PetscObjectCast(v))); 1043 // Need to reset the offloadmask. If we had a stashed pointer we are on the GPU, 1044 // otherwise check if the host has a valid pointer. If neither, then we are not 1045 // allocated. 1046 vcu->array_d = host_array; 1047 if (host_array) { 1048 host_array = nullptr; 1049 v->offloadmask = PETSC_OFFLOAD_GPU; 1050 } else if (vimpl->array) { 1051 v->offloadmask = PETSC_OFFLOAD_CPU; 1052 } else { 1053 v->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 1054 } 1055 } 1056 } 1057 PetscFunctionReturn(PETSC_SUCCESS); 1058 } 1059 1060 // v->ops->create 1061 template <device::cupm::DeviceType T, typename D> 1062 inline PetscErrorCode Vec_CUPMBase<T, D>::create(Vec v) noexcept 1063 { 1064 PetscBool alloc_missing; 1065 PetscDeviceContext dctx; 1066 1067 PetscFunctionBegin; 1068 PetscCall(VecCreate_IMPL_Private(v, &alloc_missing)); 1069 PetscCall(GetHandles_(&dctx)); 1070 PetscCall(Initialize_CUPMBase(v, alloc_missing, nullptr, nullptr, dctx)); 1071 PetscFunctionReturn(PETSC_SUCCESS); 1072 } 1073 1074 // v->ops->destroy 1075 template <device::cupm::DeviceType T, typename D> 1076 inline PetscErrorCode Vec_CUPMBase<T, D>::destroy(Vec v) noexcept 1077 { 1078 PetscFunctionBegin; 1079 if (const auto vcu = VecCUPMCast(v)) { 1080 PetscDeviceContext dctx; 1081 1082 PetscCall(GetHandles_(&dctx)); 1083 PetscCall(ResetAllocatedDevicePtr_(dctx, v)); 1084 PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx)); 1085 PetscCall(PetscFree(v->spptr)); 1086 } 1087 PetscCall(PetscObjectSAWsViewOff(PetscObjectCast(v))); 1088 if (const auto vimpl = VecIMPLCast(v)) { 1089 if (auto &array_allocated = vimpl->array_allocated) { 1090 const auto useit = UseCUPMHostAlloc(v->pinned_memory); 1091 1092 // do this ourselves since we may want to use the cupm functions 1093 PetscCall(PetscFree(array_allocated)); 1094 } 1095 } 1096 v->pinned_memory = PETSC_FALSE; 1097 PetscCall(VecDestroy_IMPL(v)); 1098 PetscFunctionReturn(PETSC_SUCCESS); 1099 } 1100 1101 // 
// ========================================================================================== //
//                              Common core between Seq and MPI                               //

// VecCreate_CUPM()
// Dispatch to the sequential or MPI vector type based on communicator size.
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPM(Vec v) noexcept
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm(PetscObjectCast(v)), &size));
  PetscCall(VecSetType(v, size > 1 ? VECMPICUPM() : VECSEQCUPM()));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// VecCreateCUPM()
// Core creation helper: build the Vec, optionally adopt a reference layout, set sizes and
// block size, and (optionally) set the derived type.
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPMBase(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, Vec *v, PetscBool call_set_type, PetscLayout reference) noexcept
{
  PetscFunctionBegin;
  PetscCall(VecCreate(comm, v));
  if (reference) PetscCall(PetscLayoutReference(reference, &(*v)->map));
  PetscCall(VecSetSizes(*v, n, N));
  if (bs) PetscCall(VecSetBlockSize(*v, bs));
  if (call_set_type) PetscCall(VecSetType(*v, VECIMPLCUPM()));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// VecCreateIMPL_CUPM(), called through v->ops->create
// Finish initialization: register user-provided arrays (if any), allocate and zero both
// sides when requested, and set the initial offload mask accordingly.
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::Initialize_CUPMBase(Vec v, PetscBool allocate_missing, PetscScalar *host_array, PetscScalar *device_array, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  // REVIEW ME: perhaps not needed
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
  PetscCall(PetscObjectChangeTypeName(PetscObjectCast(v), VECIMPLCUPM()));
  PetscCall(D::bindtocpu(v, PETSC_FALSE));
  if (device_array) {
    PetscCall(CheckPointerMatchesMemType_(device_array, PETSC_MEMTYPE_CUPM()));
    PetscCall(VecCUPMAllocateCheck_(v));
    VecCUPMCast(v)->array_d = device_array;
  }
  if (host_array) {
    PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_HOST));
    VecIMPLCast(v)->array = host_array;
  }
  if (allocate_missing) {
    PetscCall(DeviceAllocateCheck_(dctx, v));
    PetscCall(HostAllocateCheck_(dctx, v));
    // REVIEW ME: junchao, is this needed with new calloc() branch? VecSet() will call
    // set() for reference
    // calls device-version
    PetscCall(VecSet(v, 0));
    // zero the host while device is underway
    PetscCall(PetscArrayzero(VecIMPLCast(v)->array, v->map->n));
    v->offloadmask = PETSC_OFFLOAD_BOTH;
  } else {
    if (host_array) {
      v->offloadmask = device_array ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
    } else {
      v->offloadmask = device_array ? PETSC_OFFLOAD_GPU : PETSC_OFFLOAD_UNALLOCATED;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// v->ops->duplicate
// Create a vector with the same layout/type, copy the ops table and attached
// objects/functions, and propagate stash settings. DerivedCreateIMPLCUPM_Async lets the
// derived class finish its own data structures.
template <device::cupm::DeviceType T, typename D>
template <typename SetupFunctionT>
inline PetscErrorCode Vec_CUPMBase<T, D>::Duplicate_CUPMBase(Vec v, Vec *y, PetscDeviceContext dctx, SetupFunctionT &&DerivedCreateIMPLCUPM_Async) noexcept
{
  // if the derived setup is the default no_op then we should call VecSetType()
  constexpr auto call_set_type = static_cast<PetscBool>(std::is_same<SetupFunctionT, no_op>::value);
  const auto     vobj          = PetscObjectCast(v);
  const auto     map           = v->map;
  PetscInt       bs;

  PetscFunctionBegin;
  PetscCall(VecGetBlockSize(v, &bs));
  PetscCall(Create_CUPMBase(PetscObjectComm(vobj), bs, map->n, map->N, y, call_set_type, map));
  // Derived class can set up the remainder of the data structures here
  PetscCall(DerivedCreateIMPLCUPM_Async(*y));
  // If the other vector is bound to CPU then the memcpy of the ops struct will give the
  // duplicated vector the host "getarray" function which does not lazily allocate the array
  // (as it is assumed to always exist). So we force allocation here, before we overwrite the
  // ops
  if (v->boundtocpu) PetscCall(HostAllocateCheck_(dctx, *y));
  // in case the user has done some VecSetOps() tomfoolery
  PetscCall(PetscArraycpy((*y)->ops, v->ops, 1));
  {
    const auto yobj = PetscObjectCast(*y);

    PetscCall(PetscObjectListDuplicate(vobj->olist, &yobj->olist));
    PetscCall(PetscFunctionListDuplicate(vobj->qlist, &yobj->qlist));
  }
  (*y)->stash.donotstash   = v->stash.donotstash;
  (*y)->stash.ignorenegidx = v->stash.ignorenegidx;
  (*y)->map->bs            = std::abs(v->map->bs);
  (*y)->bstash.bs          = v->bstash.bs;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Select the host or device implementation of an op depending on the local `usehost` flag
#define VecSetOp_CUPM(op_name, op_host, ...) \
  do { \
    if (usehost) { \
      v->ops->op_name = op_host; \
    } else { \
      v->ops->op_name = __VA_ARGS__; \
    } \
  } while (0)

// v->ops->bindtocpu
// Flip the vector between host-only and device execution: migrate data to the host if
// binding, swap the default random type, and rewire the ops table.
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::BindToCPU_CUPMBase(Vec v, PetscBool usehost, PetscDeviceContext dctx) noexcept
{
  // replace v->defaultrandtype only if it differs from the target type
  const auto change_default_rand_type = [](PetscRandomType target, char **ptr) {
    PetscFunctionBegin;
    PetscValidPointer(ptr, 2);
    PetscValidCharPointer(*ptr, 2);
    if (std::strcmp(target, *ptr)) {
      PetscCall(PetscFree(*ptr));
      PetscCall(PetscStrallocpy(target, ptr));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  };

  PetscFunctionBegin;
  v->boundtocpu = usehost;
  if (usehost) PetscCall(CopyToHost_(dctx, v));
  PetscCall(change_default_rand_type(usehost ? PETSCRANDER48 : PETSCDEVICERAND(), &v->defaultrandtype));

  // set the base functions that are guaranteed to be the same for both
  v->ops->duplicate = D::duplicate;
  v->ops->create    = create;
  v->ops->destroy   = destroy;
  v->ops->bindtocpu = D::bindtocpu;
  // Note that setting these to NULL on host breaks convergence in certain areas. I don't know
  // why, and I don't know how, but it is IMPERATIVE these are set as such!
  v->ops->replacearray = replacearray<PETSC_MEMTYPE_HOST>;
  v->ops->restorearray = restorearray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>;

  // set device-only common functions
  VecSetOp_CUPM(dotnorm2, nullptr, D::dotnorm2);
  VecSetOp_CUPM(getarray, nullptr, getarray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
  VecSetOp_CUPM(getarraywrite, nullptr, getarray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
  VecSetOp_CUPM(restorearraywrite, nullptr, restorearray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);

  VecSetOp_CUPM(getarrayread, nullptr, [](Vec v, const PetscScalar **a) { return getarray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
  VecSetOp_CUPM(restorearrayread, nullptr, [](Vec v, const PetscScalar **a) { return restorearray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });

  VecSetOp_CUPM(getarrayandmemtype, nullptr, getarrayandmemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);
  VecSetOp_CUPM(restorearrayandmemtype, nullptr, restorearrayandmemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);

  VecSetOp_CUPM(getarraywriteandmemtype, nullptr, getarrayandmemtype<PETSC_MEMORY_ACCESS_WRITE>);
  VecSetOp_CUPM(restorearraywriteandmemtype, nullptr, [](Vec v, PetscScalar **a, PetscMemType *) { return restorearrayandmemtype<PETSC_MEMORY_ACCESS_WRITE>(v, a); });

  VecSetOp_CUPM(getarrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a, PetscMemType *m) { return getarrayandmemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a), m); });
  VecSetOp_CUPM(restorearrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a) { return restorearrayandmemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });

  // set the functions that are always sequential
  using VecSeq_T = VecSeq_CUPM<T>;
  VecSetOp_CUPM(scale, VecScale_Seq, VecSeq_T::scale);
  VecSetOp_CUPM(copy, VecCopy_Seq, VecSeq_T::copy);
  VecSetOp_CUPM(set, VecSet_Seq, VecSeq_T::set);
  VecSetOp_CUPM(swap, VecSwap_Seq, VecSeq_T::swap);
  VecSetOp_CUPM(axpy, VecAXPY_Seq, VecSeq_T::axpy);
  VecSetOp_CUPM(axpby, VecAXPBY_Seq, VecSeq_T::axpby);
  VecSetOp_CUPM(maxpy, VecMAXPY_Seq, VecSeq_T::maxpy);
  VecSetOp_CUPM(aypx, VecAYPX_Seq, VecSeq_T::aypx);
  VecSetOp_CUPM(waxpy, VecWAXPY_Seq, VecSeq_T::waxpy);
  VecSetOp_CUPM(axpbypcz, VecAXPBYPCZ_Seq, VecSeq_T::axpbypcz);
  VecSetOp_CUPM(pointwisemult, VecPointwiseMult_Seq, VecSeq_T::pointwisemult);
  VecSetOp_CUPM(pointwisedivide, VecPointwiseDivide_Seq, VecSeq_T::pointwisedivide);
  VecSetOp_CUPM(setrandom, VecSetRandom_Seq, VecSeq_T::setrandom);
  VecSetOp_CUPM(dot_local, VecDot_Seq, VecSeq_T::dot);
  VecSetOp_CUPM(tdot_local, VecTDot_Seq, VecSeq_T::tdot);
  VecSetOp_CUPM(norm_local, VecNorm_Seq, VecSeq_T::norm);
  VecSetOp_CUPM(mdot_local, VecMDot_Seq, VecSeq_T::mdot);
  VecSetOp_CUPM(reciprocal, VecReciprocal_Default, VecSeq_T::reciprocal);
  VecSetOp_CUPM(shift, nullptr, VecSeq_T::shift);
  VecSetOp_CUPM(getlocalvector, nullptr, VecSeq_T::template getlocalvector<PETSC_MEMORY_ACCESS_READ_WRITE>);
  VecSetOp_CUPM(restorelocalvector, nullptr, VecSeq_T::template restorelocalvector<PETSC_MEMORY_ACCESS_READ_WRITE>);
  VecSetOp_CUPM(getlocalvectorread, nullptr, VecSeq_T::template getlocalvector<PETSC_MEMORY_ACCESS_READ>);
  VecSetOp_CUPM(restorelocalvectorread, nullptr, VecSeq_T::template restorelocalvector<PETSC_MEMORY_ACCESS_READ>);
  VecSetOp_CUPM(sum, nullptr, VecSeq_T::sum);
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Called from VecGetSubVector()
// Hand back raw host and/or device pointers (allocating on demand) plus the current
// offload mask, without changing vector state.
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrays_CUPMBase(Vec v, const PetscScalar **host_array, const PetscScalar **device_array, PetscOffloadMask *mask, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
  if (host_array) {
    PetscCall(HostAllocateCheck_(dctx, v));
    *host_array = VecIMPLCast(v)->array;
  }
  if (device_array) {
    PetscCall(DeviceAllocateCheck_(dctx, v));
    *device_array = VecCUPMCast(v)->array_d;
  }
  if (mask) *mask = v->offloadmask;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Free all device-side COO (SetValuesCOO) scratch buffers on the context's stream
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode Vec_CUPMBase<T, D>::ResetPreallocationCOO_CUPMBase(Vec v, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  if (const auto vcu = VecCUPMCast(v)) {
    cupmStream_t stream;
    // clang-format off
    const auto cntptrs = util::make_array(
      std::ref(vcu->jmap1_d),
      std::ref(vcu->perm1_d),
      std::ref(vcu->imap2_d),
      std::ref(vcu->jmap2_d),
      std::ref(vcu->perm2_d),
      std::ref(vcu->Cperm_d)
    );
    // clang-format on

    PetscCall(GetHandlesFrom_(dctx, &stream));
    for (auto &&ptr : cntptrs) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
    for (auto &&ptr : util::make_array(std::ref(vcu->sendbuf_d), std::ref(vcu->recvbuf_d))) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Allocate the device-side COO maps/buffers and upload their host counterparts
template <device::cupm::DeviceType T, typename D>
template <std::size_t NCount, std::size_t NScal>
inline PetscErrorCode Vec_CUPMBase<T, D>::SetPreallocationCOO_CUPMBase(Vec v, PetscCount, const PetscInt[], PetscDeviceContext dctx, const std::array<CooPair<PetscCount>, NCount> &extra_cntptrs, const std::array<CooPair<PetscScalar>, NScal> &bufptrs) noexcept
{
  const auto vimpl = VecIMPLCast(v);

  PetscFunctionBegin;
  PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
  // need to instantiate the private pointer if not already
  PetscCall(VecCUPMAllocateCheck_(v));
  {
    const auto vcu = VecCUPMCast(v);
    // clang-format off
    const auto cntptrs = util::concat_array(util::make_array(make_coo_pair(vcu->jmap1_d, vimpl->jmap1, v->map->n + 1), make_coo_pair(vcu->perm1_d, vimpl->perm1, vimpl->tot1)), extra_cntptrs);
    // clang-format on
    cupmStream_t stream;

    PetscCall(GetHandlesFrom_(dctx, &stream));
    // allocate
    for (auto &elem : cntptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
    for (auto &elem : bufptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
    // copy
    for (const auto &elem : cntptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
    for (const auto &elem : bufptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Boilerplate for derived classes: pulls the base-class members into the derived scope
#define PETSC_VEC_CUPM_BASE_CLASS_HEADER(name, Tp, ...) \
  using name = ::Petsc::vec::cupm::impl::Vec_CUPMBase<Tp, __VA_ARGS__>; \
  friend name; \
  /* introspection */ \
  using name::VecCUPMCast; \
  using name::VecIMPLCast; \
  using name::VECIMPLCUPM; \
  using name::VECSEQCUPM; \
  using name::VECMPICUPM; \
  using name::VecView_Debug; \
  /* utility */ \
  using typename name::Vec_CUPM; \
  using name::UseCUPMHostAlloc; \
  using name::GetHandles_; \
  using name::GetHandlesFrom_; \
  using name::VecCUPMAllocateCheck_; \
  using name::VecIMPLAllocateCheck_; \
  using name::HostAllocateCheck_; \
  using name::DeviceAllocateCheck_; \
  using name::CopyToDevice_; \
  using name::CopyToHost_; \
  using name::create; \
  using name::destroy; \
  using name::getarray; \
  using name::restorearray; \
  using name::getarrayandmemtype; \
  using name::restorearrayandmemtype; \
  using name::placearray; \
  using name::replacearray; \
  using name::resetarray; \
  /* base functions */ \
  using name::Create_CUPMBase; \
  using name::Initialize_CUPMBase; \
  using name::Duplicate_CUPMBase; \
  using name::BindToCPU_CUPMBase; \
  using name::Create_CUPM; \
  using name::DeviceArrayRead; \
  using name::DeviceArrayWrite; \
  using name::DeviceArrayReadWrite; \
  using name::HostArrayRead; \
  using name::HostArrayWrite; \
  using name::HostArrayReadWrite; \
  using name::ResetPreallocationCOO_CUPMBase; \
  using name::SetPreallocationCOO_CUPMBase; \
  /* blas interface */ \
  PETSC_CUPMBLAS_INHERIT_INTERFACE_TYPEDEFS_USING(cupmBlasInterface_t, Tp)

} // namespace impl

} // namespace cupm

} // namespace vec

} // namespace Petsc

#endif // __cplusplus && PetscDefined(HAVE_DEVICE)

#endif // PETSCVECCUPMIMPL_H