1 #pragma once 2 3 #include <petsc/private/matdensecupmimpl.h> /*I <petscmat.h> I*/ 4 #include <../src/mat/impls/dense/seq/dense.h> 5 6 #include <petsc/private/deviceimpl.h> // PetscDeviceContextGetOptionalNullContext_Internal() 7 #include <petsc/private/randomimpl.h> // _p_PetscRandom 8 #include <petsc/private/vecimpl.h> // _p_Vec 9 #include <petsc/private/cupmobject.hpp> 10 #include <petsc/private/cupmsolverinterface.hpp> 11 12 #include <petsc/private/cpp/type_traits.hpp> // PetscObjectCast() 13 #include <petsc/private/cpp/utility.hpp> // util::exchange() 14 15 #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp> // for VecSeq_CUPM 16 17 namespace Petsc 18 { 19 20 namespace mat 21 { 22 23 namespace cupm 24 { 25 26 namespace impl 27 { 28 29 template <device::cupm::DeviceType T> 30 class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> { 31 public: 32 MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>); 33 34 private: 35 struct Mat_SeqDenseCUPM { 36 PetscScalar *d_v; // pointer to the matrix on the GPU 37 PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original 38 bool d_user_alloc; 39 bool d_unplaced_user_alloc; 40 // factorization support 41 cupmBlasInt_t *d_fact_ipiv; // device pivots 42 cupmScalar_t *d_fact_tau; // device QR tau vector 43 cupmBlasInt_t *d_fact_info; // device info 44 cupmScalar_t *d_fact_work; // device workspace 45 cupmBlasInt_t d_fact_lwork; // size of device workspace 46 // workspace 47 Vec workvec; 48 }; 49 50 static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept; 51 52 static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept; 53 static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept; 54 55 static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept; 56 57 template <typename Derived> 58 struct SolveCommon; 59 struct SolveQR; 60 struct SolveCholesky; 61 struct SolveLU; 62 63 template <typename Solver, bool transpose> 64 static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept; 65 template <typename Solver, bool transpose> 66 static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept; 67 template <bool transpose, bool hermitian> 68 static PetscErrorCode MatMultAddColumnRange_Dispatch_(Mat, Vec, Vec, Vec, PetscInt, PetscInt) noexcept; 69 template <bool transpose, bool hermitian> 70 static PetscErrorCode MatMultColumnRange_Dispatch_(Mat, Vec, Vec, PetscInt, PetscInt) noexcept; 71 template <bool transpose, bool hermitian> 72 static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept; 73 74 template <bool to_host> 75 static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept; 76 77 PETSC_NODISCARD static constexpr MatType MATIMPLCUPM_() noexcept; 78 PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept; 79 80 public: 81 PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept; 82 83 // define these by hand since they don't fit the above mold 84 PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept; 85 PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept; 86 87 static PetscErrorCode Create(Mat) noexcept; 88 static PetscErrorCode Destroy(Mat) noexcept; 89 static PetscErrorCode SetUp(Mat) noexcept; 90 static PetscErrorCode Reset(Mat) noexcept; 91 92 static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept; 93 static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept; 94 static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept; 95 96 template <PetscMemType, PetscMemoryAccessMode> 97 static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept; 98 template <PetscMemType, PetscMemoryAccessMode> 99 static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept; 100 template <PetscMemoryAccessMode> 101 static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept; 102 template <PetscMemoryAccessMode> 103 static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept; 104 105 private: 106 template <PetscMemType mtype, PetscMemoryAccessMode mode> 107 static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept 108 { 109 PetscDeviceContext dctx; 110 111 PetscFunctionBegin; 112 PetscCall(GetHandles_(&dctx)); 113 PetscCall(GetArray<mtype, mode>(m, p, dctx)); 114 PetscFunctionReturn(PETSC_SUCCESS); 115 } 116 117 template <PetscMemType mtype, PetscMemoryAccessMode mode> 118 static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept 119 { 120 PetscDeviceContext dctx; 121 122 PetscFunctionBegin; 123 PetscCall(GetHandles_(&dctx)); 124 PetscCall(RestoreArray<mtype, mode>(m, p, dctx)); 125 PetscFunctionReturn(PETSC_SUCCESS); 126 } 127 128 template <PetscMemoryAccessMode mode> 129 static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept 130 { 131 PetscDeviceContext dctx; 132 133 PetscFunctionBegin; 134 PetscCall(GetHandles_(&dctx)); 135 PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx)); 136 PetscFunctionReturn(PETSC_SUCCESS); 137 } 138 139 template <PetscMemoryAccessMode mode> 140 static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept 141 { 142 PetscDeviceContext dctx; 143 144 PetscFunctionBegin; 145 PetscCall(GetHandles_(&dctx)); 146 PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx)); 147 PetscFunctionReturn(PETSC_SUCCESS); 148 } 149 150 public: 151 static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept; 152 static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept; 153 static PetscErrorCode ResetArray(Mat) noexcept; 154 155 template <bool transpose_A, bool transpose_B> 156 static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept; 157 static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept; 158 static PetscErrorCode ZeroEntries(Mat) noexcept; 159 static PetscErrorCode Conjugate(Mat) noexcept; 160 static PetscErrorCode Scale(Mat, PetscScalar) noexcept; 161 static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept; 162 static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept; 163 static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept; 164 165 static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept; 166 template <PetscMemoryAccessMode> 167 static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept; 168 template <PetscMemoryAccessMode> 169 static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept; 170 171 static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept; 172 static PetscErrorCode InvertFactors(Mat) noexcept; 173 174 static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept; 175 static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept; 176 }; 177 178 } // namespace impl 179 180 namespace 181 { 182 183 // Declare this here so that the functions below can make use of it 184 template <device::cupm::DeviceType T> 185 inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept 186 { 187 PetscFunctionBegin; 188 PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate)); 189 PetscFunctionReturn(PETSC_SUCCESS); 190 } 191 192 } // anonymous namespace 193 194 namespace impl 195 { 196 197 // ========================================================================================== 198 // MatDense_Seq_CUPM - Private API - Utility 199 // ========================================================================================== 200 201 template <device::cupm::DeviceType T> 202 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept 203 { 204 const auto mcu = MatCUPMCast(m); 205 const auto nrows = m->rmap->n; 206 const auto ncols = m->cmap->n; 207 auto &lda = MatIMPLCast(m)->lda; 208 cupmStream_t stream; 209 210 PetscFunctionBegin; 211 PetscCheckTypeName(m, MATSEQDENSECUPM()); 212 PetscValidDeviceContext(dctx, 2); 213 PetscCall(checkCupmBlasIntCast(nrows)); 214 PetscCall(checkCupmBlasIntCast(ncols)); 215 PetscCall(GetHandlesFrom_(dctx, &stream)); 216 if (lda <= 0) lda = nrows; 217 if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream)); 218 if (user_device_array) { 219 mcu->d_user_alloc = PETSC_TRUE; 220 mcu->d_v = user_device_array; 221 } else { 222 std::size_t size; 223 224 mcu->d_user_alloc = PETSC_FALSE; 225 size = lda * ncols; 226 PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream)); 227 PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream)); 228 } 229 m->offloadmask = PETSC_OFFLOAD_GPU; 230 PetscFunctionReturn(PETSC_SUCCESS); 231 } 232 233 template <device::cupm::DeviceType T> 234 inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept 235 { 236 const auto nrows = m->rmap->n; 237 const auto ncols = m->cmap->n; 238 const auto copy = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED; 239 240 PetscFunctionBegin; 241 PetscCheckTypeName(m, MATSEQDENSECUPM()); 242 if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS); 243 PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols)); 244 if (copy) { 245 const auto mcu = MatCUPMCast(m); 246 cupmStream_t stream; 247 248 // Allocate GPU memory if not present 249 if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr)); 250 PetscCall(GetHandlesFrom_(dctx, &stream)); 251 PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0)); 252 { 253 const auto mimpl = MatIMPLCast(m); 254 const auto lda = mimpl->lda; 255 const auto src = mimpl->v; 256 const auto dest = mcu->d_v; 257 258 if (lda > nrows) { 259 PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream)); 260 } else { 261 PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream)); 262 } 263 } 264 PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0)); 265 // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH 266 m->offloadmask = PETSC_OFFLOAD_BOTH; 267 } 268 PetscFunctionReturn(PETSC_SUCCESS); 269 } 270 271 template <device::cupm::DeviceType T> 272 inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept 273 { 274 const auto nrows = m->rmap->n; 275 const auto ncols = m->cmap->n; 276 const auto copy = m->offloadmask == PETSC_OFFLOAD_GPU; 277 278 PetscFunctionBegin; 279 PetscCheckTypeName(m, MATSEQDENSECUPM()); 280 PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols)); 281 if (copy) { 282 const auto mimpl = MatIMPLCast(m); 283 cupmStream_t stream; 284 285 // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed 286 if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr)); 287 PetscCall(GetHandlesFrom_(dctx, &stream)); 288 PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0)); 289 { 290 const auto lda = mimpl->lda; 291 const auto dest = mimpl->v; 292 const auto src = MatCUPMCast(m)->d_v; 293 294 if (lda > nrows) { 295 PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream)); 296 } else { 297 PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream)); 298 } 299 } 300 PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0)); 301 // order is important, MatSeqDenseSetPreallocation() might set offloadmask 302 m->offloadmask = PETSC_OFFLOAD_BOTH; 303 } 304 PetscFunctionReturn(PETSC_SUCCESS); 305 } 306 307 template <device::cupm::DeviceType T> 308 inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept 309 { 310 PetscFunctionBegin; 311 if (PetscDefined(USE_DEBUG)) { 312 cupmBlasInt_t info = 0; 313 314 PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream)); 315 if (stream) PetscCallCUPM(cupmStreamSynchronize(stream)); 316 static_assert(std::is_same<decltype(info), int>::value, ""); 317 PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1); 318 PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info); 319 } 320 PetscFunctionReturn(PETSC_SUCCESS); 321 } 322 323 // ========================================================================================== 324 // MatDense_Seq_CUPM - Private API - Solver Dispatch 325 // ========================================================================================== 326 327 // specific solvers called through the dispatch_() family of functions 328 template <device::cupm::DeviceType T> 329 template <typename Derived> 330 struct MatDense_Seq_CUPM<T>::SolveCommon { 331 using derived_type = Derived; 332 333 template <typename F> 334 static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept 335 { 336 cupmBlasInt_t lwork; 337 338 PetscFunctionBegin; 339 PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork)); 340 if (lwork > mcu->d_fact_lwork) { 341 mcu->d_fact_lwork = lwork; 342 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream)); 343 PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream)); 344 } 345 PetscFunctionReturn(PETSC_SUCCESS); 346 } 347 348 static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept 349 { 350 const auto mcu = MatCUPMCast(A); 351 352 PetscFunctionBegin; 353 PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n)); 354 A->factortype = derived_type::MATFACTORTYPE(); 355 A->ops->solve = MatSolve_Factored_Dispatch_<derived_type, false>; 356 A->ops->solvetranspose = MatSolve_Factored_Dispatch_<derived_type, true>; 357 A->ops->matsolve = MatMatSolve_Factored_Dispatch_<derived_type, false>; 358 A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>; 359 360 PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype)); 361 if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream)); 362 PetscFunctionReturn(PETSC_SUCCESS); 363 } 364 }; 365 366 template <device::cupm::DeviceType T> 367 struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> { 368 using base_type = SolveCommon<SolveLU>; 369 370 static constexpr const char *NAME() noexcept { return "LU"; } 371 static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; } 372 373 static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept 374 { 375 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 376 const auto n = static_cast<cupmBlasInt_t>(A->cmap->n); 377 cupmStream_t stream; 378 cupmSolverHandle_t handle; 379 PetscDeviceContext dctx; 380 381 PetscFunctionBegin; 382 if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS); 383 PetscCall(GetHandles_(&dctx, &handle, &stream)); 384 PetscCall(base_type::FactorPrepare(A, stream)); 385 { 386 const auto mcu = MatCUPMCast(A); 387 const auto da = DeviceArrayReadWrite(dctx, A); 388 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 389 390 // clang-format off 391 PetscCall( 392 base_type::ResizeFactLwork( 393 mcu, stream, 394 [&](cupmBlasInt_t *fact_lwork) 395 { 396 return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork); 397 } 398 ) 399 ); 400 // clang-format on 401 if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream)); 402 403 PetscCall(PetscLogGpuTimeBegin()); 404 PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info)); 405 PetscCall(PetscLogGpuTimeEnd()); 406 PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream)); 407 } 408 PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0)); 409 PetscFunctionReturn(PETSC_SUCCESS); 410 } 411 412 template <bool transpose> 413 static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept 414 { 415 const auto mcu = MatCUPMCast(A); 416 const auto fact_info = mcu->d_fact_info; 417 const auto fact_ipiv = mcu->d_fact_ipiv; 418 cupmSolverHandle_t handle; 419 420 PetscFunctionBegin; 421 PetscCall(GetHandlesFrom_(dctx, &handle)); 422 PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k)); 423 PetscCall(PetscLogGpuTimeBegin()); 424 { 425 constexpr auto op = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N; 426 const auto da = DeviceArrayRead(dctx, A); 427 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 428 429 // clang-format off 430 PetscCall( 431 base_type::ResizeFactLwork( 432 mcu, stream, 433 [&](cupmBlasInt_t *lwork) 434 { 435 return cupmSolverXgetrs_bufferSize( 436 handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork 437 ); 438 } 439 ) 440 ); 441 // clang-format on 442 PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info)); 443 PetscCall(CheckCUPMSolverInfo_(fact_info, stream)); 444 } 445 PetscCall(PetscLogGpuTimeEnd()); 446 PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m))); 447 PetscFunctionReturn(PETSC_SUCCESS); 448 } 449 }; 450 451 template <device::cupm::DeviceType T> 452 struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> { 453 using base_type = SolveCommon<SolveCholesky>; 454 455 static constexpr const char *NAME() noexcept { return "Cholesky"; } 456 static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; } 457 458 static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept 459 { 460 const auto n = static_cast<cupmBlasInt_t>(A->rmap->n); 461 PetscDeviceContext dctx; 462 cupmSolverHandle_t handle; 463 cupmStream_t stream; 464 465 PetscFunctionBegin; 466 if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS); 467 PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName()); 468 PetscCall(GetHandles_(&dctx, &handle, &stream)); 469 PetscCall(base_type::FactorPrepare(A, stream)); 470 { 471 const auto mcu = MatCUPMCast(A); 472 const auto da = DeviceArrayReadWrite(dctx, A); 473 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 474 475 // clang-format off 476 PetscCall( 477 base_type::ResizeFactLwork( 478 mcu, stream, 479 [&](cupmBlasInt_t *fact_lwork) 480 { 481 return cupmSolverXpotrf_bufferSize( 482 handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork 483 ); 484 } 485 ) 486 ); 487 // clang-format on 488 PetscCall(PetscLogGpuTimeBegin()); 489 PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info)); 490 PetscCall(PetscLogGpuTimeEnd()); 491 PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream)); 492 } 493 PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0)); 494 495 #if 0 496 // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs 497 // and *hetr* routines. The code below should work, and it can be activated when *sytrs 498 // routines will be available 499 if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream)); 500 if (!mcu->d_fact_lwork) { 501 PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork)); 502 PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream)); 503 } 504 if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream)); 505 PetscCall(PetscLogGpuTimeBegin()); 506 PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info)); 507 PetscCall(PetscLogGpuTimeEnd()); 508 #endif 509 PetscFunctionReturn(PETSC_SUCCESS); 510 } 511 512 template <bool transpose> 513 static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept 514 { 515 const auto mcu = MatCUPMCast(A); 516 const auto fact_info = mcu->d_fact_info; 517 cupmSolverHandle_t handle; 518 519 PetscFunctionBegin; 520 PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName()); 521 PetscCall(GetHandlesFrom_(dctx, &handle)); 522 PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k)); 523 PetscCall(PetscLogGpuTimeBegin()); 524 { 525 const auto da = DeviceArrayRead(dctx, A); 526 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 527 528 // clang-format off 529 PetscCall( 530 base_type::ResizeFactLwork( 531 mcu, stream, 532 [&](cupmBlasInt_t *lwork) 533 { 534 return cupmSolverXpotrs_bufferSize( 535 handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork 536 ); 537 } 538 ) 539 ); 540 // clang-format on 541 PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info)); 542 PetscCall(CheckCUPMSolverInfo_(fact_info, stream)); 543 } 544 PetscCall(PetscLogGpuTimeEnd()); 545 PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m))); 546 PetscFunctionReturn(PETSC_SUCCESS); 547 } 548 }; 549 550 template <device::cupm::DeviceType T> 551 struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> { 552 using base_type = SolveCommon<SolveQR>; 553 554 static constexpr const char *NAME() noexcept { return "QR"; } 555 static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; } 556 557 static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept 558 { 559 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 560 const auto n = static_cast<cupmBlasInt_t>(A->cmap->n); 561 const auto min = std::min(m, n); 562 const auto mimpl = MatIMPLCast(A); 563 cupmStream_t stream; 564 cupmSolverHandle_t handle; 565 PetscDeviceContext dctx; 566 567 PetscFunctionBegin; 568 if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS); 569 PetscCall(GetHandles_(&dctx, &handle, &stream)); 570 PetscCall(base_type::FactorPrepare(A, stream)); 571 mimpl->rank = min; 572 { 573 const auto mcu = MatCUPMCast(A); 574 const auto da = DeviceArrayReadWrite(dctx, A); 575 const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda); 576 577 if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec)); 578 if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream)); 579 // clang-format off 580 PetscCall( 581 base_type::ResizeFactLwork( 582 mcu, stream, 583 [&](cupmBlasInt_t *fact_lwork) 584 { 585 return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork); 586 } 587 ) 588 ); 589 // clang-format on 590 PetscCall(PetscLogGpuTimeBegin()); 591 PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info)); 592 PetscCall(PetscLogGpuTimeEnd()); 593 PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream)); 594 } 595 PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0))); 596 PetscFunctionReturn(PETSC_SUCCESS); 597 } 598 599 template <bool transpose> 600 static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept 601 { 602 const auto mimpl = MatIMPLCast(A); 603 const auto rank = static_cast<cupmBlasInt_t>(mimpl->rank); 604 const auto mcu = MatCUPMCast(A); 605 const auto fact_info = mcu->d_fact_info; 606 const auto fact_tau = mcu->d_fact_tau; 607 const auto fact_work = mcu->d_fact_work; 608 const auto fact_lwork = mcu->d_fact_lwork; 609 cupmSolverHandle_t solver_handle; 610 cupmBlasHandle_t blas_handle; 611 612 PetscFunctionBegin; 613 PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle)); 614 PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k)); 615 PetscCall(PetscLogGpuTimeBegin()); 616 { 617 const auto da = DeviceArrayRead(dctx, A); 618 const auto one = cupmScalarCast(1.0); 619 const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda); 620 621 if (transpose) { 622 PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx)); 623 PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info)); 624 PetscCall(CheckCUPMSolverInfo_(fact_info, stream)); 625 } else { 626 constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T; 627 628 PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info)); 629 PetscCall(CheckCUPMSolverInfo_(fact_info, stream)); 630 PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx)); 631 } 632 } 633 PetscCall(PetscLogGpuTimeEnd()); 634 PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank)))); 635 PetscFunctionReturn(PETSC_SUCCESS); 636 } 637 }; 638 639 template <device::cupm::DeviceType T> 640 template <typename Solver, bool transpose> 641 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept 642 { 643 using namespace vec::cupm; 644 const auto pobj_A = PetscObjectCast(A); 645 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 646 const auto k = static_cast<cupmBlasInt_t>(A->cmap->n); 647 auto &workvec = MatCUPMCast(A)->workvec; 648 PetscScalar *y_array = nullptr; 649 PetscDeviceContext dctx; 650 PetscBool xiscupm, yiscupm, aiscupm; 651 bool use_y_array_directly; 652 cupmStream_t stream; 653 654 PetscFunctionBegin; 655 PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve"); 656 PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm)); 657 PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm)); 658 PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm)); 659 PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????"); 660 PetscCall(GetHandles_(&dctx, &stream)); 661 use_y_array_directly = yiscupm && (k >= m); 662 { 663 const PetscScalar *x_array; 664 const auto xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask); 665 const auto copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice; 666 667 if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec)); 668 // The logic here is to try to minimize the amount of memory copying: 669 // 670 // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded 671 // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the 672 // data in order to copy it into the y array. So the array x will be wherever the data 673 // already is so that only one memcpy is performed 674 if (xisdevice) { 675 PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx)); 676 } else { 677 PetscCall(VecGetArrayRead(x, &x_array)); 678 } 679 PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx)); 680 PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream)); 681 if (xisdevice) { 682 PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx)); 683 } else { 684 PetscCall(VecRestoreArrayRead(x, &x_array)); 685 } 686 } 687 688 if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 689 PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream)); 690 if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A)); 691 692 if (use_y_array_directly) { 693 PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx)); 694 } else { 695 const auto copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost; 696 PetscScalar *yv; 697 698 // The logic here is that the data is not yet in either y's GPU array or its CPU array. 699 // There is nothing in the interface to say where the user would like it to end up. So we 700 // choose the GPU, because it is the faster option 701 if (yiscupm) { 702 PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx)); 703 } else { 704 PetscCall(VecGetArray(y, &yv)); 705 } 706 PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream)); 707 if (yiscupm) { 708 PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx)); 709 } else { 710 PetscCall(VecRestoreArray(y, &yv)); 711 } 712 PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array)); 713 } 714 PetscFunctionReturn(PETSC_SUCCESS); 715 } 716 717 template <device::cupm::DeviceType T> 718 template <typename Solver, bool transpose> 719 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept 720 { 721 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 722 const auto k = static_cast<cupmBlasInt_t>(A->cmap->n); 723 cupmBlasInt_t nrhs, ldb, ldx, ldy; 724 PetscScalar *y; 725 PetscBool biscupm, xiscupm, aiscupm; 726 PetscDeviceContext dctx; 727 cupmStream_t stream; 728 729 PetscFunctionBegin; 730 PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve"); 731 PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm)); 732 PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm)); 733 PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm)); 734 PetscCall(GetHandles_(&dctx, &stream)); 735 { 736 PetscInt n; 737 738 PetscCall(MatGetSize(B, nullptr, &n)); 739 PetscCall(PetscCUPMBlasIntCast(n, &nrhs)); 740 PetscCall(MatDenseGetLDA(B, &n)); 741 PetscCall(PetscCUPMBlasIntCast(n, &ldb)); 742 PetscCall(MatDenseGetLDA(X, &n)); 743 PetscCall(PetscCUPMBlasIntCast(n, &ldx)); 744 } 745 { 746 // The logic here is to try to minimize the amount of memory copying: 747 // 748 // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not 749 // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to 750 // get the data in order to copy it into the y array. So the array b will be wherever the 751 // data already is so that only one memcpy is performed 752 const auto bisdevice = biscupm && PetscOffloadDevice(B->offloadmask); 753 const auto copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice; 754 const PetscScalar *b; 755 756 if (bisdevice) { 757 b = DeviceArrayRead(dctx, B); 758 } else if (biscupm) { 759 b = HostArrayRead(dctx, B); 760 } else { 761 PetscCall(MatDenseGetArrayRead(B, &b)); 762 } 763 764 if (ldx < m || !xiscupm) { 765 // X's array cannot serve as the array (too small or not on device), B's array cannot 766 // serve as the array (const), so allocate a new array 767 ldy = m; 768 PetscCall(PetscCUPMMallocAsync(&y, nrhs * m)); 769 } else { 770 // X's array should serve as the array 771 ldy = ldx; 772 y = DeviceArrayWrite(dctx, X); 773 } 774 PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream)); 775 if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b)); 776 } 777 778 // convert to CUPM twice?????????????????????????????????? 779 // but A should already be CUPM?????????????????????????????????????? 780 if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 781 PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream)); 782 if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 783 784 if (ldx < m || !xiscupm) { 785 const auto copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost; 786 PetscScalar *x; 787 788 // The logic here is that the data is not yet in either X's GPU array or its CPU 789 // array. There is nothing in the interface to say where the user would like it to end up. 790 // So we choose the GPU, because it is the faster option 791 if (xiscupm) { 792 x = DeviceArrayWrite(dctx, X); 793 } else { 794 PetscCall(MatDenseGetArray(X, &x)); 795 } 796 PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream)); 797 if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x)); 798 PetscCallCUPM(cupmFreeAsync(y, stream)); 799 } 800 PetscFunctionReturn(PETSC_SUCCESS); 801 } 802 803 template <device::cupm::DeviceType T> 804 template <bool transpose, bool hermitian> 805 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAddColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end) noexcept 806 { 807 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 808 const auto n = static_cast<cupmBlasInt_t>(c_end - c_start); 809 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 810 PetscBool xiscupm, yiscupm, ziscupm; 811 cupmBlasHandle_t handle; 812 Vec x = xx, y = yy, z = zz; 813 PetscDeviceContext dctx; 814 815 PetscFunctionBegin; 816 PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(xx), &xiscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), "")); 817 if (!xiscupm || xx->boundtocpu) { 818 PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(xx)), &x)); 819 PetscCall(VecSetLayout(x, xx->map)); 820 PetscCall(VecSetType(x, VecSeq_CUPM::VECCUPM())); 821 PetscCall(VecCopy(xx, x)); 822 } 823 824 if (yy) { 825 PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(yy), &yiscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), "")); 826 if (!yiscupm || yy->boundtocpu) { 827 PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(yy)), &y)); 828 PetscCall(VecSetLayout(y, yy->map)); 829 PetscCall(VecSetType(y, VecSeq_CUPM::VECCUPM())); 830 PetscCall(VecCopy(yy, y)); 831 } 832 } 833 834 if (zz != yy) { 835 PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(zz), &ziscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), "")); 836 if (!ziscupm || zz->boundtocpu) { 837 PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(zz)), &z)); 838 PetscCall(VecSetLayout(z, zz->map)); 839 PetscCall(VecSetType(z, VecSeq_CUPM::VECCUPM())); 840 } 841 } else { 842 z = y; 843 } 844 845 if (y && y != z) PetscCall(VecSeq_CUPM::Copy(y, z)); // mult add 846 if (!m || !n) { 847 // mult only 848 if (!y) PetscCall(VecSeq_CUPM::Set(z, 0.0)); 849 PetscFunctionReturn(PETSC_SUCCESS); 850 } 851 PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n)); 852 PetscCall(GetHandles_(&dctx, &handle)); 853 { 854 constexpr auto op = transpose ? (hermitian ? CUPMBLAS_OP_C : CUPMBLAS_OP_T) : CUPMBLAS_OP_N; 855 const auto one = cupmScalarCast(1.0); 856 const auto zero = cupmScalarCast(0.0); 857 const auto da = DeviceArrayRead(dctx, A); 858 const auto dxx = VecSeq_CUPM::DeviceArrayRead(dctx, x); 859 const auto dzz = VecSeq_CUPM::DeviceArrayReadWrite(dctx, z); 860 861 PetscCall(PetscLogGpuTimeBegin()); 862 PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata() + c_start * lda, lda, dxx.cupmdata() + (transpose ? 0 : c_start), 1, y ? &one : &zero, dzz.cupmdata() + (transpose ? c_start : 0), 1)); 863 PetscCall(PetscLogGpuTimeEnd()); 864 } 865 PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m))); 866 if (z != zz) { 867 PetscCall(VecCopy(z, zz)); 868 if (z != y) PetscCall(VecDestroy(&z)); 869 } 870 if (y != yy) PetscCall(VecDestroy(&y)); 871 if (x != xx) PetscCall(VecDestroy(&x)); 872 PetscFunctionReturn(PETSC_SUCCESS); 873 } 874 875 template <device::cupm::DeviceType T> 876 template <bool transpose, bool hermitian> 877 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end) noexcept 878 { 879 PetscFunctionBegin; 880 PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, nullptr, yy, c_start, c_end)); 881 PetscFunctionReturn(PETSC_SUCCESS); 882 } 883 884 template <device::cupm::DeviceType T> 885 template <bool transpose, bool hermitian> 886 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept 887 { 888 PetscFunctionBegin; 889 PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, yy, zz, 0, A->cmap->n)); 890 PetscFunctionReturn(PETSC_SUCCESS); 891 } 892 893 // ========================================================================================== 894 // MatDense_Seq_CUPM - Private API - Conversion Dispatch 895 // ========================================================================================== 896 897 template <device::cupm::DeviceType T> 898 template <bool to_host> 899 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept 900 { 901 PetscFunctionBegin; 902 if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) { 903 // TODO these cases should be optimized 904 PetscCall(MatConvert_Basic(M, type, reuse, newmat)); 905 } else { 906 const auto B = *newmat; 907 const auto pobj = PetscObjectCast(B); 908 909 if (to_host) { 910 PetscCall(BindToCPU(B, PETSC_TRUE)); 911 PetscCall(Reset(B)); 912 } else { 913 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM())); 914 } 915 916 PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype)); 917 PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM())); 918 // cvec might be the wrong VecType, destroy and rebuild it if necessary 919 // REVIEW ME: this is possibly very inefficient 920 PetscCall(VecDestroy(&MatIMPLCast(B)->cvec)); 921 922 MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense); 923 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>); 924 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>); 925 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>); 926 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>); 927 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>); 928 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>); 929 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray); 930 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray); 931 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray); 932 MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense); 933 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation); 934 935 if (to_host) { 936 B->offloadmask = PETSC_OFFLOAD_CPU; 937 } else { 938 Mat_SeqDenseCUPM *mcu; 939 940 PetscCall(PetscNew(&mcu)); 941 B->spptr = mcu; 942 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host?? 943 PetscCall(BindToCPU(B, PETSC_FALSE)); 944 } 945 946 MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU); 947 MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy); 948 } 949 PetscFunctionReturn(PETSC_SUCCESS); 950 } 951 952 // ========================================================================================== 953 // MatDense_Seq_CUPM - Public API 954 // ========================================================================================== 955 956 template <device::cupm::DeviceType T> 957 inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept 958 { 959 return MATSEQDENSECUPM(); 960 } 961 962 template <device::cupm::DeviceType T> 963 inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept 964 { 965 return static_cast<Mat_SeqDenseCUPM *>(m->spptr); 966 } 967 968 template <device::cupm::DeviceType T> 969 inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept 970 { 971 return static_cast<Mat_SeqDense *>(m->data); 972 } 973 974 template <device::cupm::DeviceType T> 975 inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept 976 { 977 return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C"; 978 } 979 980 template <device::cupm::DeviceType T> 981 inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept 982 { 983 return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C"; 984 } 985 986 // ========================================================================================== 987 988 // MatCreate_SeqDenseCUPM() 989 template <device::cupm::DeviceType T> 990 inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept 991 { 992 PetscFunctionBegin; 993 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM())); 994 PetscCall(MatCreate_SeqDense(A)); 995 PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 996 PetscFunctionReturn(PETSC_SUCCESS); 997 } 998 999 template <device::cupm::DeviceType T> 1000 inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept 1001 { 1002 PetscFunctionBegin; 1003 // prevent copying back data if we own the data pointer 1004 if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU; 1005 PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A)); 1006 PetscCall(MatDestroy_SeqDense(A)); 1007 PetscFunctionReturn(PETSC_SUCCESS); 1008 } 1009 1010 // obj->ops->setup() 1011 template <device::cupm::DeviceType T> 1012 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept 1013 { 1014 PetscFunctionBegin; 1015 PetscCall(PetscLayoutSetUp(A->rmap)); 1016 PetscCall(PetscLayoutSetUp(A->cmap)); 1017 if (!A->preallocated) { 1018 PetscDeviceContext dctx; 1019 1020 PetscCall(GetHandles_(&dctx)); 1021 PetscCall(SetPreallocation(A, dctx, nullptr)); 1022 } 1023 PetscFunctionReturn(PETSC_SUCCESS); 1024 } 1025 1026 template <device::cupm::DeviceType T> 1027 inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept 1028 { 1029 PetscFunctionBegin; 1030 if (const auto mcu = MatCUPMCast(A)) { 1031 cupmStream_t stream; 1032 1033 PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME()); 1034 PetscCall(GetHandles_(&stream)); 1035 if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream)); 1036 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream)); 1037 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream)); 1038 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream)); 1039 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream)); 1040 PetscCall(VecDestroy(&mcu->workvec)); 1041 PetscCall(PetscFree(A->spptr /* mcu */)); 1042 } 1043 PetscFunctionReturn(PETSC_SUCCESS); 1044 } 1045 1046 // ========================================================================================== 1047 1048 template <device::cupm::DeviceType T> 1049 inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept 1050 { 1051 const auto mimpl = MatIMPLCast(A); 1052 const auto pobj = PetscObjectCast(A); 1053 1054 PetscFunctionBegin; 1055 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 1056 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 1057 A->boundtocpu = to_host; 1058 PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype)); 1059 if (to_host) { 1060 PetscDeviceContext dctx; 1061 1062 // make sure we have an up-to-date copy on the CPU 1063 PetscCall(GetHandles_(&dctx)); 1064 PetscCall(DeviceToHost_(A, dctx)); 1065 } else { 1066 PetscBool iscupm; 1067 1068 if (auto &cvec = mimpl->cvec) { 1069 PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm)); 1070 if (!iscupm) PetscCall(VecDestroy(&cvec)); 1071 } 1072 if (auto &cmat = mimpl->cmat) { 1073 PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm)); 1074 if (!iscupm) PetscCall(MatDestroy(&cmat)); 1075 } 1076 } 1077 1078 // ============================================================ 1079 // Composed ops 1080 // ============================================================ 1081 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>); 1082 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>); 1083 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>); 1084 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>); 1085 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>); 1086 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>); 1087 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>); 1088 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>); 1089 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>); 1090 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>); 1091 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>); 1092 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>); 1093 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>); 1094 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>); 1095 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>); 1096 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix); 1097 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix); 1098 MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor); 1099 MatComposeOp_CUPM(to_host, pobj, "MatMultColumnRange_C", MatMultColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>); 1100 MatComposeOp_CUPM(to_host, pobj, "MatMultAddColumnRange_C", MatMultAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>); 1101 MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeColumnRange_C", MatMultHermitianTransposeColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>); 1102 MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeAddColumnRange_C", MatMultHermitianTransposeAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>); 1103 // always the same 1104 PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense)); 1105 1106 // ============================================================ 1107 // Function pointer ops 1108 // ============================================================ 1109 MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate); 1110 MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>(A, xx, nullptr, yy); }); 1111 MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>(A, xx, nullptr, yy); }); 1112 MatSetOp_CUPM(to_host, A, multhermitiantranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>(A, xx, nullptr, yy); }); 1113 MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>); 1114 MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>); 1115 MatSetOp_CUPM(to_host, A, multhermitiantransposeadd, MatMultHermitianTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>); 1116 MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>); 1117 MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>); 1118 MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>); 1119 MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY); 1120 MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor); 1121 MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor); 1122 MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector); 1123 MatSetOp_CUPM(to_host, A, conjugate, MatConjugate_SeqDense, Conjugate); 1124 MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale); 1125 MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift); 1126 MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy); 1127 MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries); 1128 MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp); 1129 MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom); 1130 MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal); 1131 // seemingly always the same 1132 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense; 1133 1134 if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host)); 1135 PetscFunctionReturn(PETSC_SUCCESS); 1136 } 1137 1138 template <device::cupm::DeviceType T> 1139 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept 1140 { 1141 PetscFunctionBegin; 1142 PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat)); 1143 PetscFunctionReturn(PETSC_SUCCESS); 1144 } 1145 1146 template <device::cupm::DeviceType T> 1147 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept 1148 { 1149 PetscFunctionBegin; 1150 PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat)); 1151 PetscFunctionReturn(PETSC_SUCCESS); 1152 } 1153 1154 // ========================================================================================== 1155 1156 template <device::cupm::DeviceType T> 1157 template <PetscMemType mtype, PetscMemoryAccessMode access> 1158 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept 1159 { 1160 constexpr auto hostmem = PetscMemTypeHost(mtype); 1161 constexpr auto read_access = PetscMemoryAccessRead(access); 1162 1163 PetscFunctionBegin; 1164 static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), ""); 1165 PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx)); 1166 if (hostmem) { 1167 if (read_access) { 1168 PetscCall(DeviceToHost_(m, dctx)); 1169 } else if (!MatIMPLCast(m)->v) { 1170 // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed 1171 PetscCall(MatSeqDenseSetPreallocation(m, nullptr)); 1172 } 1173 *array = MatIMPLCast(m)->v; 1174 } else { 1175 if (read_access) { 1176 PetscCall(HostToDevice_(m, dctx)); 1177 } else if (!MatCUPMCast(m)->d_v) { 1178 // write-only 1179 PetscCall(SetPreallocation(m, dctx, nullptr)); 1180 } 1181 *array = MatCUPMCast(m)->d_v; 1182 } 1183 if (PetscMemoryAccessWrite(access)) { 1184 m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 1185 PetscCall(PetscObjectStateIncrease(PetscObjectCast(m))); 1186 } 1187 PetscFunctionReturn(PETSC_SUCCESS); 1188 } 1189 1190 template <device::cupm::DeviceType T> 1191 template <PetscMemType mtype, PetscMemoryAccessMode access> 1192 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept 1193 { 1194 PetscFunctionBegin; 1195 static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), ""); 1196 if (PetscMemoryAccessWrite(access)) { 1197 // WRITE or READ_WRITE 1198 m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 1199 PetscCall(PetscObjectStateIncrease(PetscObjectCast(m))); 1200 } 1201 if (array) { 1202 PetscCall(CheckPointerMatchesMemType_(*array, mtype)); 1203 *array = nullptr; 1204 } 1205 PetscFunctionReturn(PETSC_SUCCESS); 1206 } 1207 1208 template <device::cupm::DeviceType T> 1209 template <PetscMemoryAccessMode access> 1210 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept 1211 { 1212 PetscFunctionBegin; 1213 PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx)); 1214 if (mtype) *mtype = PETSC_MEMTYPE_CUPM(); 1215 PetscFunctionReturn(PETSC_SUCCESS); 1216 } 1217 1218 template <device::cupm::DeviceType T> 1219 template <PetscMemoryAccessMode access> 1220 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept 1221 { 1222 PetscFunctionBegin; 1223 PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx)); 1224 PetscFunctionReturn(PETSC_SUCCESS); 1225 } 1226 1227 // ========================================================================================== 1228 1229 template <device::cupm::DeviceType T> 1230 inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept 1231 { 1232 const auto mimpl = MatIMPLCast(A); 1233 const auto mcu = MatCUPMCast(A); 1234 1235 PetscFunctionBegin; 1236 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 1237 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 1238 PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME()); 1239 if (mimpl->v) { 1240 PetscDeviceContext dctx; 1241 1242 PetscCall(GetHandles_(&dctx)); 1243 PetscCall(HostToDevice_(A, dctx)); 1244 } 1245 mcu->unplacedarray = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array)); 1246 mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE); 1247 PetscFunctionReturn(PETSC_SUCCESS); 1248 } 1249 1250 template <device::cupm::DeviceType T> 1251 inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept 1252 { 1253 const auto mimpl = MatIMPLCast(A); 1254 const auto mcu = MatCUPMCast(A); 1255 1256 PetscFunctionBegin; 1257 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 1258 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 1259 PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME()); 1260 if (!mcu->d_user_alloc) { 1261 cupmStream_t stream; 1262 1263 PetscCall(GetHandles_(&stream)); 1264 PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream)); 1265 } 1266 mcu->d_v = const_cast<PetscScalar *>(array); 1267 mcu->d_user_alloc = PETSC_FALSE; 1268 PetscFunctionReturn(PETSC_SUCCESS); 1269 } 1270 1271 template <device::cupm::DeviceType T> 1272 inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept 1273 { 1274 const auto mimpl = MatIMPLCast(A); 1275 const auto mcu = MatCUPMCast(A); 1276 1277 PetscFunctionBegin; 1278 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 1279 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 1280 if (mimpl->v) { 1281 PetscDeviceContext dctx; 1282 1283 PetscCall(GetHandles_(&dctx)); 1284 PetscCall(HostToDevice_(A, dctx)); 1285 } 1286 mcu->d_v = util::exchange(mcu->unplacedarray, nullptr); 1287 mcu->d_user_alloc = mcu->d_unplaced_user_alloc; 1288 PetscFunctionReturn(PETSC_SUCCESS); 1289 } 1290 1291 // ========================================================================================== 1292 1293 template <device::cupm::DeviceType T> 1294 template <bool transpose_A, bool transpose_B> 1295 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept 1296 { 1297 cupmBlasInt_t m, n, k; 1298 PetscBool Aiscupm, Biscupm; 1299 PetscDeviceContext dctx; 1300 cupmBlasHandle_t handle; 1301 1302 PetscFunctionBegin; 1303 PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m)); 1304 PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n)); 1305 PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k)); 1306 if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS); 1307 1308 // we may end up with SEQDENSE as one of the arguments 1309 // REVIEW ME: how? and why is it not B and C???????? 1310 PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm)); 1311 PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm)); 1312 if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 1313 if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B)); 1314 PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n)); 1315 PetscCall(GetHandles_(&dctx, &handle)); 1316 1317 PetscCall(PetscLogGpuTimeBegin()); 1318 { 1319 const auto one = cupmScalarCast(1.0); 1320 const auto zero = cupmScalarCast(0.0); 1321 const auto da = DeviceArrayRead(dctx, A); 1322 const auto db = DeviceArrayRead(dctx, B); 1323 const auto dc = DeviceArrayWrite(dctx, C); 1324 PetscInt alda, blda, clda; 1325 1326 PetscCall(MatDenseGetLDA(A, &alda)); 1327 PetscCall(MatDenseGetLDA(B, &blda)); 1328 PetscCall(MatDenseGetLDA(C, &clda)); 1329 PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda)); 1330 } 1331 PetscCall(PetscLogGpuTimeEnd()); 1332 1333 PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1))); 1334 if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A)); 1335 if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 1336 PetscFunctionReturn(PETSC_SUCCESS); 1337 } 1338 1339 template <device::cupm::DeviceType T> 1340 inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept 1341 { 1342 const auto m = A->rmap->n; 1343 const auto n = A->cmap->n; 1344 1345 PetscFunctionBegin; 1346 PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)"); 1347 // The two matrices must have the same copy implementation to be eligible for fast copy 1348 if (A->ops->copy == B->ops->copy) { 1349 PetscDeviceContext dctx; 1350 cupmStream_t stream; 1351 1352 PetscCall(GetHandles_(&dctx, &stream)); 1353 PetscCall(PetscLogGpuTimeBegin()); 1354 { 1355 const auto va = DeviceArrayRead(dctx, A); 1356 const auto vb = DeviceArrayWrite(dctx, B); 1357 // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets 1358 // lda! 1359 const auto lda_a = MatIMPLCast(A)->lda; 1360 const auto lda_b = MatIMPLCast(B)->lda; 1361 1362 if (lda_a > m || lda_b > m) { 1363 PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME()); 1364 PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME()); 1365 PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream)); 1366 } else { 1367 PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream)); 1368 } 1369 } 1370 PetscCall(PetscLogGpuTimeEnd()); 1371 } else { 1372 PetscCall(MatCopy_Basic(A, B, str)); 1373 } 1374 PetscFunctionReturn(PETSC_SUCCESS); 1375 } 1376 1377 template <device::cupm::DeviceType T> 1378 inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept 1379 { 1380 PetscDeviceContext dctx; 1381 cupmStream_t stream; 1382 1383 PetscFunctionBegin; 1384 PetscCall(GetHandles_(&dctx, &stream)); 1385 PetscCall(PetscLogGpuTimeBegin()); 1386 { 1387 const auto va = DeviceArrayWrite(dctx, m); 1388 const auto lda = MatIMPLCast(m)->lda; 1389 const auto ma = m->rmap->n; 1390 const auto na = m->cmap->n; 1391 1392 if (lda > ma) { 1393 PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream)); 1394 } else { 1395 PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream)); 1396 } 1397 } 1398 PetscCall(PetscLogGpuTimeEnd()); 1399 PetscFunctionReturn(PETSC_SUCCESS); 1400 } 1401 1402 namespace detail 1403 { 1404 1405 // ========================================================================================== 1406 // SubMatIndexFunctor 1407 // 1408 // Iterator which permutes a linear index range into matrix indices for an nrows x ncols 1409 // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for 1410 // the i'th sequential entry in the matrix. 1411 // ========================================================================================== 1412 template <typename T> 1413 struct SubMatIndexFunctor { 1414 PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); } 1415 1416 PetscInt nrows; 1417 PetscInt ncols; 1418 PetscInt lda; 1419 }; 1420 1421 template <typename Iterator> 1422 struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<iter_difference_t<Iterator>>> { 1423 using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<iter_difference_t<Iterator>>>; 1424 1425 using iterator = typename base_type::iterator; 1426 1427 constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept : 1428 base_type{ 1429 std::move(first), std::move(last), {nrows, ncols, lda} 1430 } 1431 { 1432 } 1433 1434 PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); } 1435 }; 1436 1437 namespace 1438 { 1439 1440 template <typename T> 1441 PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept 1442 { 1443 const auto nrows = rend - rstart; 1444 const auto ncols = cend - cstart; 1445 const auto dptr = thrust::device_pointer_cast(ptr); 1446 1447 return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda}; 1448 } 1449 1450 } // namespace 1451 1452 struct conjugate { 1453 PETSC_NODISCARD PETSC_HOSTDEVICE_INLINE_DECL PetscScalar operator()(const PetscScalar &x) const noexcept { return PetscConj(x); } 1454 }; 1455 1456 } // namespace detail 1457 1458 template <device::cupm::DeviceType T> 1459 inline PetscErrorCode MatDense_Seq_CUPM<T>::Conjugate(Mat A) noexcept 1460 { 1461 const auto m = A->rmap->n; 1462 const auto n = A->cmap->n; 1463 const auto N = m * n; 1464 PetscDeviceContext dctx; 1465 cupmStream_t stream; 1466 1467 PetscFunctionBegin; 1468 if (PetscDefined(USE_COMPLEX)) { 1469 PetscCall(GetHandles_(&dctx, &stream)); 1470 PetscCall(PetscLogGpuTimeBegin()); 1471 { 1472 const auto da = DeviceArrayReadWrite(dctx, A); 1473 const auto lda = MatIMPLCast(A)->lda; 1474 cupmStream_t stream; 1475 PetscCall(GetHandlesFrom_(dctx, &stream)); 1476 1477 if (lda > m) { 1478 // clang-format off 1479 PetscCallThrust( 1480 const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data()); 1481 1482 THRUST_CALL( 1483 thrust::transform, 1484 stream, 1485 sub_mat.begin(), sub_mat.end(), sub_mat.begin(), 1486 detail::conjugate{} 1487 ) 1488 ); 1489 // clang-format on 1490 } else { 1491 // clang-format off 1492 PetscCallThrust( 1493 const auto aptr = thrust::device_pointer_cast(da.data()); 1494 1495 THRUST_CALL( 1496 thrust::transform, 1497 stream, 1498 aptr, aptr + N, aptr, 1499 detail::conjugate{} 1500 ) 1501 ); 1502 // clang-format on 1503 } 1504 } 1505 PetscCall(PetscLogGpuTimeEnd()); 1506 } 1507 PetscFunctionReturn(PETSC_SUCCESS); 1508 } 1509 1510 template <device::cupm::DeviceType T> 1511 inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept 1512 { 1513 const auto m = A->rmap->n; 1514 const auto n = A->cmap->n; 1515 const auto N = m * n; 1516 PetscDeviceContext dctx; 1517 1518 PetscFunctionBegin; 1519 PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n)); 1520 PetscCall(GetHandles_(&dctx)); 1521 { 1522 const auto da = DeviceArrayReadWrite(dctx, A); 1523 const auto lda = MatIMPLCast(A)->lda; 1524 1525 if (lda > m) { 1526 cupmStream_t stream; 1527 1528 PetscCall(GetHandlesFrom_(dctx, &stream)); 1529 // clang-format off 1530 PetscCallThrust( 1531 const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data()); 1532 1533 THRUST_CALL( 1534 thrust::transform, 1535 stream, 1536 sub_mat.begin(), sub_mat.end(), sub_mat.begin(), 1537 device::cupm::functors::make_times_equals(alpha) 1538 ) 1539 ); 1540 // clang-format on 1541 } else { 1542 const auto cu_alpha = cupmScalarCast(alpha); 1543 cupmBlasHandle_t handle; 1544 1545 PetscCall(GetHandlesFrom_(dctx, &handle)); 1546 PetscCall(PetscLogGpuTimeBegin()); 1547 PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1)); 1548 PetscCall(PetscLogGpuTimeEnd()); 1549 } 1550 } 1551 PetscCall(PetscLogGpuFlops(N)); 1552 PetscFunctionReturn(PETSC_SUCCESS); 1553 } 1554 1555 template <device::cupm::DeviceType T> 1556 inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept 1557 { 1558 const auto m_x = X->rmap->n, m_y = Y->rmap->n; 1559 const auto n_x = X->cmap->n, n_y = Y->cmap->n; 1560 const auto N = m_x * n_x; 1561 PetscDeviceContext dctx; 1562 1563 PetscFunctionBegin; 1564 if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS); 1565 PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y)); 1566 PetscCall(GetHandles_(&dctx)); 1567 { 1568 const auto dx = DeviceArrayRead(dctx, X); 1569 const auto dy = DeviceArrayReadWrite(dctx, Y); 1570 const auto lda_x = MatIMPLCast(X)->lda; 1571 const auto lda_y = MatIMPLCast(Y)->lda; 1572 1573 if (lda_x > m_x || lda_y > m_x) { 1574 cupmStream_t stream; 1575 1576 PetscCall(GetHandlesFrom_(dctx, &stream)); 1577 // clang-format off 1578 PetscCallThrust( 1579 const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data()); 1580 const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data()); 1581 1582 THRUST_CALL( 1583 thrust::transform, 1584 stream, 1585 sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(), 1586 device::cupm::functors::make_axpy(alpha) 1587 ); 1588 ); 1589 // clang-format on 1590 } else { 1591 const auto cu_alpha = cupmScalarCast(alpha); 1592 cupmBlasHandle_t handle; 1593 1594 PetscCall(GetHandlesFrom_(dctx, &handle)); 1595 PetscCall(PetscLogGpuTimeBegin()); 1596 PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1)); 1597 PetscCall(PetscLogGpuTimeEnd()); 1598 } 1599 } 1600 PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0))); 1601 PetscFunctionReturn(PETSC_SUCCESS); 1602 } 1603 1604 template <device::cupm::DeviceType T> 1605 inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept 1606 { 1607 const auto hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt; 1608 PetscDeviceContext dctx; 1609 1610 PetscFunctionBegin; 1611 PetscCall(GetHandles_(&dctx)); 1612 // do not call SetPreallocation() yet, we call it afterwards?? 1613 PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false)); 1614 PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt)); 1615 if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN)); 1616 // allocate memory if needed 1617 if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr)); 1618 PetscFunctionReturn(PETSC_SUCCESS); 1619 } 1620 1621 template <device::cupm::DeviceType T> 1622 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept 1623 { 1624 PetscBool device_rand_is_rander48; 1625 PetscBool device = PETSC_FALSE; 1626 1627 PetscFunctionBegin; 1628 // CUPMObject<hip>::PETSCDEVICERAD() is PETSCRANDER48 until PetscRandom is implemented for hiprand 1629 PetscCall(PetscStrncmp(PETSCDEVICERAND(), PETSCRANDER48, sizeof(PETSCRANDER48), &device_rand_is_rander48)); 1630 if (!device_rand_is_rander48) PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device)); 1631 if (device) { 1632 const auto m = A->rmap->n; 1633 const auto n = A->cmap->n; 1634 PetscDeviceContext dctx; 1635 1636 PetscCall(GetHandles_(&dctx)); 1637 { 1638 const auto a = DeviceArrayWrite(dctx, A); 1639 PetscInt lda; 1640 1641 PetscCall(MatDenseGetLDA(A, &lda)); 1642 if (lda > m) { 1643 for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda)); 1644 } else { 1645 PetscInt mn; 1646 1647 PetscCall(PetscIntMultError(m, n, &mn)); 1648 PetscCall(PetscRandomGetValues(rng, mn, a)); 1649 } 1650 } 1651 } else { 1652 PetscCall(MatSetRandom_SeqDense(A, rng)); 1653 } 1654 PetscFunctionReturn(PETSC_SUCCESS); 1655 } 1656 1657 // ========================================================================================== 1658 1659 template <device::cupm::DeviceType T> 1660 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept 1661 { 1662 const auto offloadmask = A->offloadmask; 1663 const auto n = A->rmap->n; 1664 const auto col_offset = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; }; 1665 PetscBool viscupm; 1666 PetscDeviceContext dctx; 1667 cupmStream_t stream; 1668 1669 PetscFunctionBegin; 1670 PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), "")); 1671 PetscCall(GetHandles_(&dctx, &stream)); 1672 if (viscupm && !v->boundtocpu) { 1673 const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v); 1674 1675 // update device data 1676 if (PetscOffloadDevice(offloadmask)) { 1677 PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream)); 1678 } else { 1679 PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream)); 1680 } 1681 } else { 1682 PetscScalar *x; 1683 1684 // update host data 1685 PetscCall(VecGetArrayWrite(v, &x)); 1686 if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) { 1687 PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n)); 1688 } else if (PetscOffloadDevice(offloadmask)) { 1689 PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream)); 1690 } 1691 PetscCall(VecRestoreArrayWrite(v, &x)); 1692 } 1693 PetscFunctionReturn(PETSC_SUCCESS); 1694 } 1695 1696 template <device::cupm::DeviceType T> 1697 template <PetscMemoryAccessMode access> 1698 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept 1699 { 1700 using namespace vec::cupm; 1701 const auto mimpl = MatIMPLCast(A); 1702 PetscDeviceContext dctx; 1703 1704 PetscFunctionBegin; 1705 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 1706 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 1707 mimpl->vecinuse = col + 1; 1708 if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec)); 1709 PetscCall(GetHandles_(&dctx)); 1710 PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx)); 1711 PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda))); 1712 if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec)); 1713 *v = mimpl->cvec; 1714 PetscFunctionReturn(PETSC_SUCCESS); 1715 } 1716 1717 template <device::cupm::DeviceType T> 1718 template <PetscMemoryAccessMode access> 1719 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept 1720 { 1721 using namespace vec::cupm; 1722 const auto mimpl = MatIMPLCast(A); 1723 const auto cvec = mimpl->cvec; 1724 PetscDeviceContext dctx; 1725 1726 PetscFunctionBegin; 1727 PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first"); 1728 PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector"); 1729 mimpl->vecinuse = 0; 1730 if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec)); 1731 PetscCall(VecCUPMResetArrayAsync<T>(cvec)); 1732 PetscCall(GetHandles_(&dctx)); 1733 PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx)); 1734 if (v) *v = nullptr; 1735 PetscFunctionReturn(PETSC_SUCCESS); 1736 } 1737 1738 // ========================================================================================== 1739 1740 template <device::cupm::DeviceType T> 1741 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept 1742 { 1743 Mat fact = nullptr; 1744 PetscDeviceContext dctx; 1745 1746 PetscFunctionBegin; 1747 PetscCall(GetHandles_(&dctx)); 1748 PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false)); 1749 fact->factortype = ftype; 1750 switch (ftype) { 1751 case MAT_FACTOR_LU: 1752 case MAT_FACTOR_ILU: // fall-through 1753 fact->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense; 1754 fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense; 1755 break; 1756 case MAT_FACTOR_CHOLESKY: 1757 case MAT_FACTOR_ICC: // fall-through 1758 fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense; 1759 break; 1760 case MAT_FACTOR_QR: { 1761 const auto pobj = PetscObjectCast(fact); 1762 1763 PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense)); 1764 PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense)); 1765 } break; 1766 case MAT_FACTOR_NONE: 1767 case MAT_FACTOR_ILUDT: // fall-through 1768 case MAT_FACTOR_NUM_TYPES: // fall-through 1769 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]); 1770 } 1771 PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype)); 1772 PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU)); 1773 PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU)); 1774 PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY)); 1775 PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC)); 1776 *fact_out = fact; 1777 PetscFunctionReturn(PETSC_SUCCESS); 1778 } 1779 1780 template <device::cupm::DeviceType T> 1781 inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept 1782 { 1783 const auto mimpl = MatIMPLCast(A); 1784 const auto mcu = MatCUPMCast(A); 1785 const auto n = static_cast<cupmBlasInt_t>(A->cmap->n); 1786 cupmSolverHandle_t handle; 1787 PetscDeviceContext dctx; 1788 cupmStream_t stream; 1789 1790 PetscFunctionBegin; 1791 #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC) 1792 // HIP appears to have this by default?? 1793 PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher"); 1794 #endif 1795 if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS); 1796 PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]); 1797 // spd 1798 PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName()); 1799 1800 PetscCall(GetHandles_(&dctx, &handle, &stream)); 1801 { 1802 const auto da = DeviceArrayReadWrite(dctx, A); 1803 const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda); 1804 cupmBlasInt_t il; 1805 1806 PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il)); 1807 if (il > mcu->d_fact_lwork) { 1808 mcu->d_fact_lwork = il; 1809 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream)); 1810 PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream)); 1811 } 1812 PetscCall(PetscLogGpuTimeBegin()); 1813 PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info)); 1814 PetscCall(PetscLogGpuTimeEnd()); 1815 } 1816 PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream)); 1817 // TODO (write cuda kernel) 1818 PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE)); 1819 PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0)); 1820 1821 A->ops->solve = nullptr; 1822 A->ops->solvetranspose = nullptr; 1823 A->ops->matsolve = nullptr; 1824 A->factortype = MAT_FACTOR_NONE; 1825 1826 PetscCall(PetscFree(A->solvertype)); 1827 PetscFunctionReturn(PETSC_SUCCESS); 1828 } 1829 1830 // ========================================================================================== 1831 1832 template <device::cupm::DeviceType T> 1833 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept 1834 { 1835 const auto mimpl = MatIMPLCast(A); 1836 const auto array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; }; 1837 const auto n = rend - rbegin; 1838 const auto m = cend - cbegin; 1839 auto &cmat = mimpl->cmat; 1840 PetscDeviceContext dctx; 1841 1842 PetscFunctionBegin; 1843 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 1844 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 1845 mimpl->matinuse = cbegin + 1; 1846 1847 PetscCall(GetHandles_(&dctx)); 1848 PetscCall(HostToDevice_(A, dctx)); 1849 1850 if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat)); 1851 { 1852 const auto device_array = array_offset(MatCUPMCast(A)->d_v); 1853 1854 if (cmat) { 1855 PetscCall(PlaceArray(cmat, device_array)); 1856 } else { 1857 PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx)); 1858 } 1859 } 1860 PetscCall(MatDenseSetLDA(cmat, mimpl->lda)); 1861 // place CPU array if present but do not copy any data 1862 if (const auto host_array = mimpl->v) { 1863 cmat->offloadmask = PETSC_OFFLOAD_GPU; 1864 PetscCall(MatDensePlaceArray(cmat, array_offset(host_array))); 1865 } 1866 1867 cmat->offloadmask = A->offloadmask; 1868 *mat = cmat; 1869 PetscFunctionReturn(PETSC_SUCCESS); 1870 } 1871 1872 template <device::cupm::DeviceType T> 1873 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept 1874 { 1875 const auto mimpl = MatIMPLCast(A); 1876 const auto cmat = mimpl->cmat; 1877 const auto reset = static_cast<bool>(mimpl->v); 1878 bool copy, was_offload_host; 1879 1880 PetscFunctionBegin; 1881 PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first"); 1882 PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix"); 1883 PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()"); 1884 mimpl->matinuse = 0; 1885 1886 // calls to ResetArray may change it, so save it here 1887 was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU; 1888 if (was_offload_host && !reset) { 1889 copy = true; 1890 PetscCall(MatSeqDenseSetPreallocation(A, nullptr)); 1891 } else { 1892 copy = false; 1893 } 1894 1895 PetscCall(ResetArray(cmat)); 1896 if (reset) PetscCall(MatDenseResetArray(cmat)); 1897 if (copy) { 1898 PetscDeviceContext dctx; 1899 1900 PetscCall(GetHandles_(&dctx)); 1901 PetscCall(DeviceToHost_(A, dctx)); 1902 } else { 1903 A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 1904 } 1905 1906 cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 1907 *m = nullptr; 1908 PetscFunctionReturn(PETSC_SUCCESS); 1909 } 1910 1911 // ========================================================================================== 1912 1913 namespace 1914 { 1915 1916 template <device::cupm::DeviceType T> 1917 inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept 1918 { 1919 PetscFunctionBegin; 1920 if (TA) { 1921 if (TB) { 1922 PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C)); 1923 } else { 1924 PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C)); 1925 } 1926 } else { 1927 if (TB) { 1928 PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C)); 1929 } else { 1930 PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C)); 1931 } 1932 } 1933 PetscFunctionReturn(PETSC_SUCCESS); 1934 } 1935 1936 template <device::cupm::DeviceType T> 1937 inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept 1938 { 1939 PetscFunctionBegin; 1940 for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) { 1941 PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor)); 1942 PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor)); 1943 } 1944 PetscFunctionReturn(PETSC_SUCCESS); 1945 } 1946 1947 } // anonymous namespace 1948 1949 } // namespace impl 1950 1951 } // namespace cupm 1952 1953 } // namespace mat 1954 1955 } // namespace Petsc 1956