1 #pragma once 2 3 #include <petsc/private/matdensecupmimpl.h> /*I <petscmat.h> I*/ 4 #include <../src/mat/impls/dense/seq/dense.h> 5 6 #include <petsc/private/deviceimpl.h> // PetscDeviceContextGetOptionalNullContext_Internal() 7 #include <petsc/private/randomimpl.h> // _p_PetscRandom 8 #include <petsc/private/vecimpl.h> // _p_Vec 9 #include <petsc/private/cupmobject.hpp> 10 #include <petsc/private/cupmsolverinterface.hpp> 11 12 #include <petsc/private/cpp/type_traits.hpp> // PetscObjectCast() 13 #include <petsc/private/cpp/utility.hpp> // util::exchange() 14 15 #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp> // for VecSeq_CUPM 16 17 namespace Petsc 18 { 19 20 namespace mat 21 { 22 23 namespace cupm 24 { 25 26 namespace impl 27 { 28 29 template <device::cupm::DeviceType T> 30 class MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> { 31 public: 32 MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>); 33 34 private: 35 struct Mat_SeqDenseCUPM { 36 PetscScalar *d_v; // pointer to the matrix on the GPU 37 PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original 38 bool d_user_alloc; 39 bool d_unplaced_user_alloc; 40 // factorization support 41 cupmBlasInt_t *d_fact_ipiv; // device pivots 42 cupmScalar_t *d_fact_tau; // device QR tau vector 43 cupmBlasInt_t *d_fact_info; // device info 44 cupmScalar_t *d_fact_work; // device workspace 45 cupmBlasInt_t d_fact_lwork; // size of device workspace 46 // workspace 47 Vec workvec; 48 }; 49 50 static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept; 51 52 static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept; 53 static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept; 54 55 static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept; 56 57 template <typename Derived> 58 struct SolveCommon; 59 struct SolveQR; 60 struct SolveCholesky; 61 struct SolveLU; 62 63 template <typename 
Solver, bool transpose> 64 static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept; 65 template <typename Solver, bool transpose> 66 static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept; 67 template <bool transpose, bool hermitian> 68 static PetscErrorCode MatMultAddColumnRange_Dispatch_(Mat, Vec, Vec, Vec, PetscInt, PetscInt) noexcept; 69 template <bool transpose, bool hermitian> 70 static PetscErrorCode MatMultColumnRange_Dispatch_(Mat, Vec, Vec, PetscInt, PetscInt) noexcept; 71 template <bool transpose, bool hermitian> 72 static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept; 73 74 template <bool to_host> 75 static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept; 76 77 PETSC_NODISCARD static constexpr MatType MATIMPLCUPM_() noexcept; 78 PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept; 79 80 public: 81 PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept; 82 83 // define these by hand since they don't fit the above mold 84 PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept; 85 PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept; 86 87 static PetscErrorCode Create(Mat) noexcept; 88 static PetscErrorCode Destroy(Mat) noexcept; 89 static PetscErrorCode SetUp(Mat) noexcept; 90 static PetscErrorCode Reset(Mat) noexcept; 91 92 static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept; 93 static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept; 94 static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept; 95 96 template <PetscMemType, PetscMemoryAccessMode> 97 static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept; 98 template <PetscMemType, PetscMemoryAccessMode> 99 static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept; 100 
template <PetscMemoryAccessMode> 101 static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept; 102 template <PetscMemoryAccessMode> 103 static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept; 104 105 private: 106 template <PetscMemType mtype, PetscMemoryAccessMode mode> 107 static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept 108 { 109 PetscDeviceContext dctx; 110 111 PetscFunctionBegin; 112 PetscCall(GetHandles_(&dctx)); 113 PetscCall(GetArray<mtype, mode>(m, p, dctx)); 114 PetscFunctionReturn(PETSC_SUCCESS); 115 } 116 117 template <PetscMemType mtype, PetscMemoryAccessMode mode> 118 static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept 119 { 120 PetscDeviceContext dctx; 121 122 PetscFunctionBegin; 123 PetscCall(GetHandles_(&dctx)); 124 PetscCall(RestoreArray<mtype, mode>(m, p, dctx)); 125 PetscFunctionReturn(PETSC_SUCCESS); 126 } 127 128 template <PetscMemoryAccessMode mode> 129 static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept 130 { 131 PetscDeviceContext dctx; 132 133 PetscFunctionBegin; 134 PetscCall(GetHandles_(&dctx)); 135 PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx)); 136 PetscFunctionReturn(PETSC_SUCCESS); 137 } 138 139 template <PetscMemoryAccessMode mode> 140 static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept 141 { 142 PetscDeviceContext dctx; 143 144 PetscFunctionBegin; 145 PetscCall(GetHandles_(&dctx)); 146 PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx)); 147 PetscFunctionReturn(PETSC_SUCCESS); 148 } 149 150 public: 151 static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept; 152 static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept; 153 static PetscErrorCode ResetArray(Mat) noexcept; 154 155 template <bool transpose_A, bool transpose_B> 156 static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept; 157 static 
PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept; 158 static PetscErrorCode ZeroEntries(Mat) noexcept; 159 static PetscErrorCode Scale(Mat, PetscScalar) noexcept; 160 static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept; 161 static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept; 162 static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept; 163 164 static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept; 165 template <PetscMemoryAccessMode> 166 static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept; 167 template <PetscMemoryAccessMode> 168 static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept; 169 170 static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept; 171 static PetscErrorCode InvertFactors(Mat) noexcept; 172 173 static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept; 174 static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept; 175 }; 176 177 } // namespace impl 178 179 namespace 180 { 181 182 // Declare this here so that the functions below can make use of it 183 template <device::cupm::DeviceType T> 184 inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept 185 { 186 PetscFunctionBegin; 187 PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate)); 188 PetscFunctionReturn(PETSC_SUCCESS); 189 } 190 191 } // anonymous namespace 192 193 namespace impl 194 { 195 196 // ========================================================================================== 197 // MatDense_Seq_CUPM - Private API - Utility 198 // ========================================================================================== 199 200 template <device::cupm::DeviceType T> 201 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar 
*user_device_array) noexcept 202 { 203 const auto mcu = MatCUPMCast(m); 204 const auto nrows = m->rmap->n; 205 const auto ncols = m->cmap->n; 206 auto &lda = MatIMPLCast(m)->lda; 207 cupmStream_t stream; 208 209 PetscFunctionBegin; 210 PetscCheckTypeName(m, MATSEQDENSECUPM()); 211 PetscValidDeviceContext(dctx, 2); 212 PetscCall(checkCupmBlasIntCast(nrows)); 213 PetscCall(checkCupmBlasIntCast(ncols)); 214 PetscCall(GetHandlesFrom_(dctx, &stream)); 215 if (lda <= 0) lda = nrows; 216 if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream)); 217 if (user_device_array) { 218 mcu->d_user_alloc = PETSC_TRUE; 219 mcu->d_v = user_device_array; 220 } else { 221 std::size_t size; 222 223 mcu->d_user_alloc = PETSC_FALSE; 224 size = lda * ncols; 225 PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream)); 226 PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream)); 227 } 228 m->offloadmask = PETSC_OFFLOAD_GPU; 229 PetscFunctionReturn(PETSC_SUCCESS); 230 } 231 232 template <device::cupm::DeviceType T> 233 inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept 234 { 235 const auto nrows = m->rmap->n; 236 const auto ncols = m->cmap->n; 237 const auto copy = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED; 238 239 PetscFunctionBegin; 240 PetscCheckTypeName(m, MATSEQDENSECUPM()); 241 if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS); 242 PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? 
"Copy" : "Reusing", nrows, ncols)); 243 if (copy) { 244 const auto mcu = MatCUPMCast(m); 245 cupmStream_t stream; 246 247 // Allocate GPU memory if not present 248 if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr)); 249 PetscCall(GetHandlesFrom_(dctx, &stream)); 250 PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0)); 251 { 252 const auto mimpl = MatIMPLCast(m); 253 const auto lda = mimpl->lda; 254 const auto src = mimpl->v; 255 const auto dest = mcu->d_v; 256 257 if (lda > nrows) { 258 PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream)); 259 } else { 260 PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream)); 261 } 262 } 263 PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0)); 264 // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH 265 m->offloadmask = PETSC_OFFLOAD_BOTH; 266 } 267 PetscFunctionReturn(PETSC_SUCCESS); 268 } 269 270 template <device::cupm::DeviceType T> 271 inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept 272 { 273 const auto nrows = m->rmap->n; 274 const auto ncols = m->cmap->n; 275 const auto copy = m->offloadmask == PETSC_OFFLOAD_GPU; 276 277 PetscFunctionBegin; 278 PetscCheckTypeName(m, MATSEQDENSECUPM()); 279 PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols)); 280 if (copy) { 281 const auto mimpl = MatIMPLCast(m); 282 cupmStream_t stream; 283 284 // MatCreateSeqDenseCUPM may not allocate CPU memory. 
Allocate if needed 285 if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr)); 286 PetscCall(GetHandlesFrom_(dctx, &stream)); 287 PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0)); 288 { 289 const auto lda = mimpl->lda; 290 const auto dest = mimpl->v; 291 const auto src = MatCUPMCast(m)->d_v; 292 293 if (lda > nrows) { 294 PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream)); 295 } else { 296 PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream)); 297 } 298 } 299 PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0)); 300 // order is important, MatSeqDenseSetPreallocation() might set offloadmask 301 m->offloadmask = PETSC_OFFLOAD_BOTH; 302 } 303 PetscFunctionReturn(PETSC_SUCCESS); 304 } 305 306 template <device::cupm::DeviceType T> 307 inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept 308 { 309 PetscFunctionBegin; 310 if (PetscDefined(USE_DEBUG)) { 311 cupmBlasInt_t info = 0; 312 313 PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream)); 314 if (stream) PetscCallCUPM(cupmStreamSynchronize(stream)); 315 static_assert(std::is_same<decltype(info), int>::value, ""); 316 PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1); 317 PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info); 318 } 319 PetscFunctionReturn(PETSC_SUCCESS); 320 } 321 322 // ========================================================================================== 323 // MatDense_Seq_CUPM - Private API - Solver Dispatch 324 // ========================================================================================== 325 326 // specific solvers called through the dispatch_() family of functions 327 template <device::cupm::DeviceType T> 328 template <typename Derived> 329 struct 
MatDense_Seq_CUPM<T>::SolveCommon { 330 using derived_type = Derived; 331 332 template <typename F> 333 static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept 334 { 335 cupmBlasInt_t lwork; 336 337 PetscFunctionBegin; 338 PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork)); 339 if (lwork > mcu->d_fact_lwork) { 340 mcu->d_fact_lwork = lwork; 341 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream)); 342 PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream)); 343 } 344 PetscFunctionReturn(PETSC_SUCCESS); 345 } 346 347 static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept 348 { 349 const auto mcu = MatCUPMCast(A); 350 351 PetscFunctionBegin; 352 PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n)); 353 A->factortype = derived_type::MATFACTORTYPE(); 354 A->ops->solve = MatSolve_Factored_Dispatch_<derived_type, false>; 355 A->ops->solvetranspose = MatSolve_Factored_Dispatch_<derived_type, true>; 356 A->ops->matsolve = MatMatSolve_Factored_Dispatch_<derived_type, false>; 357 A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>; 358 359 PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype)); 360 if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream)); 361 PetscFunctionReturn(PETSC_SUCCESS); 362 } 363 }; 364 365 template <device::cupm::DeviceType T> 366 struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> { 367 using base_type = SolveCommon<SolveLU>; 368 369 static constexpr const char *NAME() noexcept { return "LU"; } 370 static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; } 371 372 static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept 373 { 374 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 375 const auto n = static_cast<cupmBlasInt_t>(A->cmap->n); 376 
cupmStream_t stream; 377 cupmSolverHandle_t handle; 378 PetscDeviceContext dctx; 379 380 PetscFunctionBegin; 381 if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS); 382 PetscCall(GetHandles_(&dctx, &handle, &stream)); 383 PetscCall(base_type::FactorPrepare(A, stream)); 384 { 385 const auto mcu = MatCUPMCast(A); 386 const auto da = DeviceArrayReadWrite(dctx, A); 387 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 388 389 // clang-format off 390 PetscCall( 391 base_type::ResizeFactLwork( 392 mcu, stream, 393 [&](cupmBlasInt_t *fact_lwork) 394 { 395 return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork); 396 } 397 ) 398 ); 399 // clang-format on 400 if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream)); 401 402 PetscCall(PetscLogGpuTimeBegin()); 403 PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info)); 404 PetscCall(PetscLogGpuTimeEnd()); 405 PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream)); 406 } 407 PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0)); 408 PetscFunctionReturn(PETSC_SUCCESS); 409 } 410 411 template <bool transpose> 412 static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept 413 { 414 const auto mcu = MatCUPMCast(A); 415 const auto fact_info = mcu->d_fact_info; 416 const auto fact_ipiv = mcu->d_fact_ipiv; 417 cupmSolverHandle_t handle; 418 419 PetscFunctionBegin; 420 PetscCall(GetHandlesFrom_(dctx, &handle)); 421 PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k)); 422 PetscCall(PetscLogGpuTimeBegin()); 423 { 424 constexpr auto op = transpose ? 
CUPMSOLVER_OP_T : CUPMSOLVER_OP_N; 425 const auto da = DeviceArrayRead(dctx, A); 426 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 427 428 // clang-format off 429 PetscCall( 430 base_type::ResizeFactLwork( 431 mcu, stream, 432 [&](cupmBlasInt_t *lwork) 433 { 434 return cupmSolverXgetrs_bufferSize( 435 handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork 436 ); 437 } 438 ) 439 ); 440 // clang-format on 441 PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info)); 442 PetscCall(CheckCUPMSolverInfo_(fact_info, stream)); 443 } 444 PetscCall(PetscLogGpuTimeEnd()); 445 PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m))); 446 PetscFunctionReturn(PETSC_SUCCESS); 447 } 448 }; 449 450 template <device::cupm::DeviceType T> 451 struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> { 452 using base_type = SolveCommon<SolveCholesky>; 453 454 static constexpr const char *NAME() noexcept { return "Cholesky"; } 455 static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; } 456 457 static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept 458 { 459 const auto n = static_cast<cupmBlasInt_t>(A->rmap->n); 460 PetscDeviceContext dctx; 461 cupmSolverHandle_t handle; 462 cupmStream_t stream; 463 464 PetscFunctionBegin; 465 if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS); 466 PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. 
Use MAT_FACTOR_LU", cupmSolverName()); 467 PetscCall(GetHandles_(&dctx, &handle, &stream)); 468 PetscCall(base_type::FactorPrepare(A, stream)); 469 { 470 const auto mcu = MatCUPMCast(A); 471 const auto da = DeviceArrayReadWrite(dctx, A); 472 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 473 474 // clang-format off 475 PetscCall( 476 base_type::ResizeFactLwork( 477 mcu, stream, 478 [&](cupmBlasInt_t *fact_lwork) 479 { 480 return cupmSolverXpotrf_bufferSize( 481 handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork 482 ); 483 } 484 ) 485 ); 486 // clang-format on 487 PetscCall(PetscLogGpuTimeBegin()); 488 PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info)); 489 PetscCall(PetscLogGpuTimeEnd()); 490 PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream)); 491 } 492 PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0)); 493 494 #if 0 495 // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs 496 // and *hetr* routines. 
The code below should work, and it can be activated when *sytrs 497 // routines will be available 498 if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream)); 499 if (!mcu->d_fact_lwork) { 500 PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork)); 501 PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream)); 502 } 503 if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream)); 504 PetscCall(PetscLogGpuTimeBegin()); 505 PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info)); 506 PetscCall(PetscLogGpuTimeEnd()); 507 #endif 508 PetscFunctionReturn(PETSC_SUCCESS); 509 } 510 511 template <bool transpose> 512 static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept 513 { 514 const auto mcu = MatCUPMCast(A); 515 const auto fact_info = mcu->d_fact_info; 516 cupmSolverHandle_t handle; 517 518 PetscFunctionBegin; 519 PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName()); 520 PetscCall(GetHandlesFrom_(dctx, &handle)); 521 PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k)); 522 PetscCall(PetscLogGpuTimeBegin()); 523 { 524 const auto da = DeviceArrayRead(dctx, A); 525 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 526 527 // clang-format off 528 PetscCall( 529 base_type::ResizeFactLwork( 530 mcu, stream, 531 [&](cupmBlasInt_t *lwork) 532 { 533 return cupmSolverXpotrs_bufferSize( 534 handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork 535 ); 536 } 537 ) 538 ); 539 // clang-format on 540 PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, 
mcu->d_fact_lwork, fact_info)); 541 PetscCall(CheckCUPMSolverInfo_(fact_info, stream)); 542 } 543 PetscCall(PetscLogGpuTimeEnd()); 544 PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m))); 545 PetscFunctionReturn(PETSC_SUCCESS); 546 } 547 }; 548 549 template <device::cupm::DeviceType T> 550 struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> { 551 using base_type = SolveCommon<SolveQR>; 552 553 static constexpr const char *NAME() noexcept { return "QR"; } 554 static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; } 555 556 static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept 557 { 558 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 559 const auto n = static_cast<cupmBlasInt_t>(A->cmap->n); 560 const auto min = std::min(m, n); 561 const auto mimpl = MatIMPLCast(A); 562 cupmStream_t stream; 563 cupmSolverHandle_t handle; 564 PetscDeviceContext dctx; 565 566 PetscFunctionBegin; 567 if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS); 568 PetscCall(GetHandles_(&dctx, &handle, &stream)); 569 PetscCall(base_type::FactorPrepare(A, stream)); 570 mimpl->rank = min; 571 { 572 const auto mcu = MatCUPMCast(A); 573 const auto da = DeviceArrayReadWrite(dctx, A); 574 const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda); 575 576 if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec)); 577 if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream)); 578 // clang-format off 579 PetscCall( 580 base_type::ResizeFactLwork( 581 mcu, stream, 582 [&](cupmBlasInt_t *fact_lwork) 583 { 584 return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork); 585 } 586 ) 587 ); 588 // clang-format on 589 PetscCall(PetscLogGpuTimeBegin()); 590 PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info)); 591 PetscCall(PetscLogGpuTimeEnd()); 592 
PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream)); 593 } 594 PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0))); 595 PetscFunctionReturn(PETSC_SUCCESS); 596 } 597 598 template <bool transpose> 599 static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept 600 { 601 const auto mimpl = MatIMPLCast(A); 602 const auto rank = static_cast<cupmBlasInt_t>(mimpl->rank); 603 const auto mcu = MatCUPMCast(A); 604 const auto fact_info = mcu->d_fact_info; 605 const auto fact_tau = mcu->d_fact_tau; 606 const auto fact_work = mcu->d_fact_work; 607 const auto fact_lwork = mcu->d_fact_lwork; 608 cupmSolverHandle_t solver_handle; 609 cupmBlasHandle_t blas_handle; 610 611 PetscFunctionBegin; 612 PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle)); 613 PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k)); 614 PetscCall(PetscLogGpuTimeBegin()); 615 { 616 const auto da = DeviceArrayRead(dctx, A); 617 const auto one = cupmScalarCast(1.0); 618 const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda); 619 620 if (transpose) { 621 PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx)); 622 PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info)); 623 PetscCall(CheckCUPMSolverInfo_(fact_info, stream)); 624 } else { 625 constexpr auto op = PetscDefined(USE_COMPLEX) ? 
CUPMSOLVER_OP_C : CUPMSOLVER_OP_T; 626 627 PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info)); 628 PetscCall(CheckCUPMSolverInfo_(fact_info, stream)); 629 PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx)); 630 } 631 } 632 PetscCall(PetscLogGpuTimeEnd()); 633 PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank)))); 634 PetscFunctionReturn(PETSC_SUCCESS); 635 } 636 }; 637 638 template <device::cupm::DeviceType T> 639 template <typename Solver, bool transpose> 640 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept 641 { 642 using namespace vec::cupm; 643 const auto pobj_A = PetscObjectCast(A); 644 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 645 const auto k = static_cast<cupmBlasInt_t>(A->cmap->n); 646 auto &workvec = MatCUPMCast(A)->workvec; 647 PetscScalar *y_array = nullptr; 648 PetscDeviceContext dctx; 649 PetscBool xiscupm, yiscupm, aiscupm; 650 bool use_y_array_directly; 651 cupmStream_t stream; 652 653 PetscFunctionBegin; 654 PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve"); 655 PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm)); 656 PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm)); 657 PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm)); 658 PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????"); 659 PetscCall(GetHandles_(&dctx, &stream)); 660 use_y_array_directly = yiscupm && (k >= m); 661 { 662 const PetscScalar *x_array; 663 const auto xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask); 664 const auto copy_mode 
= xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice; 665 666 if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec)); 667 // The logic here is to try to minimize the amount of memory copying: 668 // 669 // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded 670 // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the 671 // data in order to copy it into the y array. So the array x will be wherever the data 672 // already is so that only one memcpy is performed 673 if (xisdevice) { 674 PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx)); 675 } else { 676 PetscCall(VecGetArrayRead(x, &x_array)); 677 } 678 PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx)); 679 PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream)); 680 if (xisdevice) { 681 PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx)); 682 } else { 683 PetscCall(VecRestoreArrayRead(x, &x_array)); 684 } 685 } 686 687 if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 688 PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream)); 689 if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A)); 690 691 if (use_y_array_directly) { 692 PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx)); 693 } else { 694 const auto copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost; 695 PetscScalar *yv; 696 697 // The logic here is that the data is not yet in either y's GPU array or its CPU array. 698 // There is nothing in the interface to say where the user would like it to end up. 
// So we choose the GPU, because it is the faster option
      if (yiscupm) {
        PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
      } else {
        PetscCall(VecGetArray(y, &yv));
      }
      PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
      if (yiscupm) {
        PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
      } else {
        PetscCall(VecRestoreArray(y, &yv));
      }
      PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
    }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Solve A X = B (or A^T X = B when transpose) for a factored dense matrix A using the
// device solver selected by the Solver policy type (LU/Cholesky/QR, see SolveLU et al.).
//
// The right-hand side B is staged into a device work array y (leading dimension ldy); the
// solve happens in place in y, and the result is then copied into X. Memory traffic is
// minimized by reading B from wherever its data currently lives and by using X's device
// array directly as y whenever it is large enough (ldx >= m) and X is a CUPM matrix.
template <device::cupm::DeviceType T>
template <typename Solver, bool transpose>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
{
  const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n); // rows of A (and B)
  const auto         k = static_cast<cupmBlasInt_t>(A->cmap->n); // cols of A (rows of X)
  cupmBlasInt_t      nrhs, ldb, ldx, ldy;
  PetscScalar       *y;
  PetscBool          biscupm, xiscupm, aiscupm;
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
  PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
  PetscCall(GetHandles_(&dctx, &stream));
  {
    // collect sizes: number of right-hand sides and leading dimensions of B and X
    PetscInt n;

    PetscCall(MatGetSize(B, nullptr, &n));
    PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
    PetscCall(MatDenseGetLDA(B, &n));
    PetscCall(PetscCUPMBlasIntCast(n, &ldb));
    PetscCall(MatDenseGetLDA(X, &n));
    PetscCall(PetscCUPMBlasIntCast(n, &ldx));
  }
  {
    // The logic here is to try to minimize the amount of memory copying:
    //
    // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
    // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
    // get the data in order to copy it into the y array. So the array b will be wherever the
    // data already is so that only one memcpy is performed
    const auto         bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
    const auto         copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
    const PetscScalar *b;

    if (bisdevice) {
      b = DeviceArrayRead(dctx, B);
    } else if (biscupm) {
      b = HostArrayRead(dctx, B);
    } else {
      PetscCall(MatDenseGetArrayRead(B, &b));
    }

    if (ldx < m || !xiscupm) {
      // X's array cannot serve as the array (too small or not on device), B's array cannot
      // serve as the array (const), so allocate a new array
      ldy = m;
      PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
    } else {
      // X's array should serve as the array
      ldy = ldx;
      y   = DeviceArrayWrite(dctx, X);
    }
    PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
    // restore only if we took the generic (non-CUPM) read path above
    if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
  }

  // convert to CUPM twice??????????????????????????????????
  // but A should already be CUPM??????????????????????????????????????
  // NOTE(review): both branches convert to MATSEQDENSECUPM; the second call presumably was
  // meant to convert back to MATSEQDENSE (cf. MatMatMult_Numeric_Dispatch below) -- confirm
  if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
  PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
  if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));

  if (ldx < m || !xiscupm) {
    // y was a scratch allocation; copy the k x nrhs solution into X and free it
    const auto   copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
    PetscScalar *x;

    // The logic here is that the data is not yet in either X's GPU array or its CPU
    // array. There is nothing in the interface to say where the user would like it to end up.
    // So we choose the GPU, because it is the faster option
    if (xiscupm) {
      x = DeviceArrayWrite(dctx, X);
    } else {
      PetscCall(MatDenseGetArray(X, &x));
    }
    PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
    if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
    PetscCallCUPM(cupmFreeAsync(y, stream));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Compute zz = A[:, c_start:c_end] * xx + yy (or the (hermitian-)transposed product when
// the template flags are set) via a single cupmBlas GEMV. When yy is null this is a plain
// mult (beta = 0); when yy == zz the add happens in place, otherwise yy is first copied
// into zz.
template <device::cupm::DeviceType T>
template <bool transpose, bool hermitian>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAddColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end) noexcept
{
  const auto         m   = static_cast<cupmBlasInt_t>(A->rmap->n);
  const auto         n   = static_cast<cupmBlasInt_t>(c_end - c_start);
  const auto         lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
  cupmBlasHandle_t   handle;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add
  if (!m || !n) {
    // mult only
    if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
  PetscCall(GetHandles_(&dctx, &handle));
  {
    constexpr auto op   = transpose ? (hermitian ? CUPMBLAS_OP_C : CUPMBLAS_OP_T) : CUPMBLAS_OP_N;
    const auto     one  = cupmScalarCast(1.0);
    const auto     zero = cupmScalarCast(0.0);
    const auto     da   = DeviceArrayRead(dctx, A);
    const auto     dxx  = VecSeq_CUPM::DeviceArrayRead(dctx, xx);
    const auto     dzz  = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz);

    PetscCall(PetscLogGpuTimeBegin());
    // column range selected by offsetting the matrix pointer (c_start * lda); for the
    // transposed cases the offset instead applies to the input/output vector
    PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata() + c_start * lda, lda, dxx.cupmdata() + (transpose ? 0 : c_start), 1, (yy ? &one : &zero), dzz.cupmdata() + (transpose ? c_start : 0), 1));
    PetscCall(PetscLogGpuTimeEnd());
  }
  PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Mult-only column-range product: forwards to the add variant with a null yy.
template <device::cupm::DeviceType T>
template <bool transpose, bool hermitian>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, nullptr, yy, c_start, c_end));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Full-width mult-add: forwards to the column-range variant over all columns of A.
template <device::cupm::DeviceType T>
template <bool transpose, bool hermitian>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, yy, zz, 0, A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================
// MatDense_Seq_CUPM - Private API - Conversion Dispatch
// ==========================================================================================

// In-place conversion between MATSEQDENSE and MATSEQDENSECUPM (direction chosen by
// to_host). For non-in-place reuse modes it falls back to MatConvert_Basic; otherwise it
// swaps the type name, default vec type, composed functions and function-pointer ops.
template <device::cupm::DeviceType T>
template <bool to_host>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    // TODO these cases should be optimized
    PetscCall(MatConvert_Basic(M, type, reuse, newmat));
  } else {
    // MAT_INPLACE_MATRIX: *newmat is M itself
    const auto B    = *newmat;
    const auto pobj = PetscObjectCast(B);

    if (to_host) {
      // sync to CPU and tear down the device-side state before becoming a host matrix
      PetscCall(BindToCPU(B, PETSC_TRUE));
      PetscCall(Reset(B));
    } else {
      PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
    }

    PetscCall(PetscStrFreeAllocpy(to_host ?
VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
    PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
    // cvec might be the wrong VecType, destroy and rebuild it if necessary
    // REVIEW ME: this is possibly very inefficient
    PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));

    // compose (device direction) or remove (host direction) the CUPM-specific methods
    MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
    MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);

    if (to_host) {
      B->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      // allocate the zero-initialized device-side bookkeeping struct (spptr)
      Mat_SeqDenseCUPM *mcu;

      PetscCall(PetscNew(&mcu));
      B->spptr       = mcu;
      B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
      PetscCall(BindToCPU(B, PETSC_FALSE));
    }

    MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
    MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================
// MatDense_Seq_CUPM - Public API
// ==========================================================================================

// MatType of this implementation (used by the MatDense_CUPM base via CRTP).
template <device::cupm::DeviceType T>
inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
{
  return MATSEQDENSECUPM();
}

// Cast the device-side spptr to the implementation struct; may be null if the device side
// has not been set up (callers check, e.g. Reset()).
template <device::cupm::DeviceType T>
inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
{
  return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
}

// Cast the host-side data pointer to the SeqDense implementation struct.
template <device::cupm::DeviceType T>
inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
{
  return static_cast<Mat_SeqDense *>(m->data);
}

// Name of the composed conversion function, specialized per backend (CUDA vs HIP).
template <device::cupm::DeviceType T>
inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
}

// Name of the composed AIJ-times-dense product setup function, per backend.
template <device::cupm::DeviceType T>
inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
{
  return T == device::cupm::DeviceType::CUDA ?
"MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C"; 946 } 947 948 // ========================================================================================== 949 950 // MatCreate_SeqDenseCUPM() 951 template <device::cupm::DeviceType T> 952 inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept 953 { 954 PetscFunctionBegin; 955 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM())); 956 PetscCall(MatCreate_SeqDense(A)); 957 PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 958 PetscFunctionReturn(PETSC_SUCCESS); 959 } 960 961 template <device::cupm::DeviceType T> 962 inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept 963 { 964 PetscFunctionBegin; 965 // prevent copying back data if we own the data pointer 966 if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU; 967 PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A)); 968 PetscCall(MatDestroy_SeqDense(A)); 969 PetscFunctionReturn(PETSC_SUCCESS); 970 } 971 972 // obj->ops->setup() 973 template <device::cupm::DeviceType T> 974 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept 975 { 976 PetscFunctionBegin; 977 PetscCall(PetscLayoutSetUp(A->rmap)); 978 PetscCall(PetscLayoutSetUp(A->cmap)); 979 if (!A->preallocated) { 980 PetscDeviceContext dctx; 981 982 PetscCall(GetHandles_(&dctx)); 983 PetscCall(SetPreallocation(A, dctx, nullptr)); 984 } 985 PetscFunctionReturn(PETSC_SUCCESS); 986 } 987 988 template <device::cupm::DeviceType T> 989 inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept 990 { 991 PetscFunctionBegin; 992 if (const auto mcu = MatCUPMCast(A)) { 993 cupmStream_t stream; 994 995 PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME()); 996 PetscCall(GetHandles_(&stream)); 997 if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream)); 998 
PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream)); 999 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream)); 1000 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream)); 1001 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream)); 1002 PetscCall(VecDestroy(&mcu->workvec)); 1003 PetscCall(PetscFree(A->spptr /* mcu */)); 1004 } 1005 PetscFunctionReturn(PETSC_SUCCESS); 1006 } 1007 1008 // ========================================================================================== 1009 1010 template <device::cupm::DeviceType T> 1011 inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept 1012 { 1013 const auto mimpl = MatIMPLCast(A); 1014 const auto pobj = PetscObjectCast(A); 1015 1016 PetscFunctionBegin; 1017 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 1018 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 1019 A->boundtocpu = to_host; 1020 PetscCall(PetscStrFreeAllocpy(to_host ? 
PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
  if (to_host) {
    PetscDeviceContext dctx;

    // make sure we have an up-to-date copy on the CPU
    PetscCall(GetHandles_(&dctx));
    PetscCall(DeviceToHost_(A, dctx));
  } else {
    PetscBool iscupm;

    // cached column vector / submatrix may be of the host type; drop them so they get
    // rebuilt with the device type on demand
    if (auto &cvec = mimpl->cvec) {
      PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
      if (!iscupm) PetscCall(VecDestroy(&cvec));
    }
    if (auto &cmat = mimpl->cmat) {
      PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
      if (!iscupm) PetscCall(MatDestroy(&cmat));
    }
  }

  // ============================================================
  // Composed ops
  // ============================================================
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
  MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
  MatComposeOp_CUPM(to_host, pobj, "MatMultAddColumnRange_C", MatMultAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>);
  MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeColumnRange_C", MatMultHermitianTransposeColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
  MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeAddColumnRange_C", MatMultHermitianTransposeAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
  // always the same
  PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));

  // ============================================================
  // Function pointer ops
  // ============================================================
  MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
  MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>(A, xx, nullptr, yy); });
  MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>(A, xx, nullptr, yy); });
  MatSetOp_CUPM(to_host, A, multhermitiantranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>(A, xx, nullptr, yy); });
  MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>);
  MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>);
  MatSetOp_CUPM(to_host, A, multhermitiantransposeadd, MatMultHermitianTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>);
  MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
  MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
  MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
  MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
  MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
  MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
  MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
  MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
  MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
  MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
  MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
  MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
  MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
  MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal);
  // seemingly always the same
  A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;

  // propagate the binding to the cached submatrix, if any
  if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Device-to-host conversion entry point composed as MatConvert_seqdensecupm_seqdense_C.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Host-to-device conversion entry point (used by Create()).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// Return the host or device array (chosen by mtype) with the requested access mode,
// syncing or lazily allocating as required, and updating the offload mask on write.
template <device::cupm::DeviceType T>
template <PetscMemType mtype, PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext
dctx) noexcept
{
  constexpr auto hostmem     = PetscMemTypeHost(mtype);
  constexpr auto read_access = PetscMemoryAccessRead(access);

  PetscFunctionBegin;
  static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
  PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
  if (hostmem) {
    if (read_access) {
      // caller will read: make sure the host copy is current
      PetscCall(DeviceToHost_(m, dctx));
    } else if (!MatIMPLCast(m)->v) {
      // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
      PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
    }
    *array = MatIMPLCast(m)->v;
  } else {
    if (read_access) {
      PetscCall(HostToDevice_(m, dctx));
    } else if (!MatCUPMCast(m)->d_v) {
      // write-only
      PetscCall(SetPreallocation(m, dctx, nullptr));
    }
    *array = MatCUPMCast(m)->d_v;
  }
  if (PetscMemoryAccessWrite(access)) {
    // the returned side becomes the authoritative copy
    m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
    PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Counterpart of GetArray(): updates the offload mask / object state for write access and
// nulls out the caller's pointer (after checking it matches the expected memory type).
template <device::cupm::DeviceType T>
template <PetscMemType mtype, PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
{
  PetscFunctionBegin;
  static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
  if (PetscMemoryAccessWrite(access)) {
    // WRITE or READ_WRITE
    m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
    PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
  }
  if (array) {
    PetscCall(CheckPointerMatchesMemType_(*array, mtype));
    *array = nullptr;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Device-array accessor that also reports the memory type (CUDA or HIP).
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
  if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Counterpart of GetArrayAndMemType().
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// Temporarily substitute a user-provided device array for the matrix storage, stashing the
// current one in unplacedarray until ResetArray() is called.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
  if (mimpl->v) {
    // flush any pending host-side updates into the array we are about to stash
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    PetscCall(HostToDevice_(A, dctx));
  }
  mcu->unplacedarray         = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
  mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Permanently replace the device array; the previous array is freed if we owned it. Note
// d_user_alloc is set to FALSE, i.e. the matrix takes ownership of the new array.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
  if (!mcu->d_user_alloc) {
    cupmStream_t stream;

    PetscCall(GetHandles_(&stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
  }
  mcu->d_v          = const_cast<PetscScalar *>(array);
  mcu->d_user_alloc = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Undo PlaceArray(): restore the stashed original device array and its ownership flag.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  if (mimpl->v) {
    // flush host-side updates made while the placed array was active
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    PetscCall(HostToDevice_(A, dctx));
  }
  mcu->d_v          = util::exchange(mcu->unplacedarray, nullptr);
  mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// Numeric phase of C = op(A) * op(B) for dense matrices via a single cupmBlas GEMM.
// Non-CUPM dense operands are converted to the device type for the duration of the call
// and converted back afterwards.
template <device::cupm::DeviceType T>
template <bool transpose_A, bool transpose_B>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
{
  cupmBlasInt_t      m, n, k;
  PetscBool          Aiscupm, Biscupm;
  PetscDeviceContext dctx;
  cupmBlasHandle_t   handle;

  PetscFunctionBegin;
  PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
  PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
  PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
  if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);

  // we may end up with SEQDENSE as one of the arguments
  // REVIEW ME: how? and why is it not B and C????????
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
  if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
  if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
  PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
  PetscCall(GetHandles_(&dctx, &handle));

  PetscCall(PetscLogGpuTimeBegin());
  {
    const auto one  = cupmScalarCast(1.0);
    const auto zero = cupmScalarCast(0.0);
    const auto da   = DeviceArrayRead(dctx, A);
    const auto db   = DeviceArrayRead(dctx, B);
    const auto dc   = DeviceArrayWrite(dctx, C);
    PetscInt   alda, blda, clda;

    PetscCall(MatDenseGetLDA(A, &alda));
    PetscCall(MatDenseGetLDA(B, &blda));
    PetscCall(MatDenseGetLDA(C, &clda));
    PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
  }
  PetscCall(PetscLogGpuTimeEnd());

  // m*n*k multiplies plus m*n*(k-1) adds
  PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
  if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
  if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Copy A into B entirely on the device, using a strided 2D memcpy when either matrix has
// padding (lda > m) and a single flat memcpy otherwise. Falls back to MatCopy_Basic when
// the two matrices do not share the same copy implementation.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
{
  const auto m = A->rmap->n;
  const auto n = A->cmap->n;

  PetscFunctionBegin;
  PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
  // The two matrices must have the same copy implementation to be eligible for fast copy
  if (A->ops->copy == B->ops->copy) {
    PetscDeviceContext dctx;
    cupmStream_t       stream;

    PetscCall(GetHandles_(&dctx, &stream));
    PetscCall(PetscLogGpuTimeBegin());
    {
      const auto va = DeviceArrayRead(dctx, A);
      const auto vb = DeviceArrayWrite(dctx, B);
      // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
      // lda!
      const auto lda_a = MatIMPLCast(A)->lda;
      const auto lda_b = MatIMPLCast(B)->lda;

      if (lda_a > m || lda_b > m) {
        PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
        PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
        PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
      } else {
        PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
      }
    }
    PetscCall(PetscLogGpuTimeEnd());
  } else {
    PetscCall(MatCopy_Basic(A, B, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Zero the device array, honoring padding (lda > m) with a strided 2D memset.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
{
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
  PetscCall(GetHandles_(&dctx, &stream));
  PetscCall(PetscLogGpuTimeBegin());
  {
    const auto va  = DeviceArrayWrite(dctx, m);
    const auto lda = MatIMPLCast(m)->lda;
    const auto ma  = m->rmap->n;
    const auto na  = m->cmap->n;

    if (lda > ma) {
      PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
    } else {
      PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

namespace detail
{

// ==========================================================================================
// SubMatIndexFunctor
//
// Iterator which permutes a linear index range into matrix indices for am nrows x ncols
// submat
with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for 1370 // the i'th sequential entry in the matrix. 1371 // ========================================================================================== 1372 template <typename T> 1373 struct SubMatIndexFunctor { 1374 PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); } 1375 1376 PetscInt nrows; 1377 PetscInt ncols; 1378 PetscInt lda; 1379 }; 1380 1381 template <typename Iterator> 1382 struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>> { 1383 using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>>; 1384 1385 using iterator = typename base_type::iterator; 1386 1387 constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept : 1388 base_type{ 1389 std::move(first), std::move(last), {nrows, ncols, lda} 1390 } 1391 { 1392 } 1393 1394 PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); } 1395 }; 1396 1397 namespace 1398 { 1399 1400 template <typename T> 1401 PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept 1402 { 1403 const auto nrows = rend - rstart; 1404 const auto ncols = cend - cstart; 1405 const auto dptr = thrust::device_pointer_cast(ptr); 1406 1407 return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda}; 1408 } 1409 1410 } // namespace 1411 1412 } // namespace detail 1413 1414 template <device::cupm::DeviceType T> 1415 inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept 1416 { 1417 const auto m = A->rmap->n; 1418 const auto n = A->cmap->n; 1419 const 
auto N = m * n; // (continues the preceding "const") total entry count, also the flops logged below
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
  PetscCall(GetHandles_(&dctx));
  {
    const auto da  = DeviceArrayReadWrite(dctx, A);
    const auto lda = MatIMPLCast(A)->lda;

    if (lda > m) {
      // strided storage: a flat BLAS scal would touch the column padding, so instead run
      // a thrust transform over a submatrix iterator covering only the live entries
      cupmStream_t stream;

      PetscCall(GetHandlesFrom_(dctx, &stream));
      // clang-format off
      PetscCallThrust(
        const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());

        THRUST_CALL(
          thrust::transform,
          stream,
          sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
          device::cupm::functors::make_times_equals(alpha)
        )
      );
      // clang-format on
    } else {
      // contiguous storage: treat the matrix as one long vector and use BLAS scal
      const auto       cu_alpha = cupmScalarCast(alpha);
      cupmBlasHandle_t handle;

      PetscCall(GetHandlesFrom_(dctx, &handle));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscLogGpuFlops(N));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Y += alpha * X on the device. The MatStructure argument is unused (dense storage).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
{
  const auto m_x = X->rmap->n, m_y = Y->rmap->n;
  const auto n_x = X->cmap->n, n_y = Y->cmap->n;
  const auto N   = m_x * n_x;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  // quick return: empty matrix or zero scaling factor leaves Y untouched
  if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
  PetscCall(GetHandles_(&dctx));
  {
    const auto dx    = DeviceArrayRead(dctx, X);
    const auto dy    = DeviceArrayReadWrite(dctx, Y);
    const auto lda_x = MatIMPLCast(X)->lda;
    const auto lda_y = MatIMPLCast(Y)->lda;

    // NOTE(review): lda_y is compared against m_x rather than m_y; X and Y must be
    // identically sized for an AXPY so the two should be interchangeable here -- confirm
    if (lda_x > m_x || lda_y > m_x) {
      // at least one operand is strided: walk only the live entries of each matrix with
      // thrust, applying y = alpha*x + y elementwise
      cupmStream_t stream;

      PetscCall(GetHandlesFrom_(dctx, &stream));
      // clang-format off
      PetscCallThrust(
        const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
        const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());

        THRUST_CALL(
          thrust::transform,
          stream,
          sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
          device::cupm::functors::make_axpy(alpha)
        );
      );
      // clang-format on
    } else {
      // both operands contiguous: flatten to vectors and use BLAS axpy
      const auto       cu_alpha = cupmScalarCast(alpha);
      cupmBlasHandle_t handle;

      PetscCall(GetHandlesFrom_(dctx, &handle));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Duplicate A into a new matrix *B, optionally copying values. When A's values are current
// on the device, the copy is performed device-to-device via Copy() rather than on the host.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
{
  // if values must be copied but A is not current on the CPU, suppress the host-side copy
  // (hopt) and perform a device-side Copy() below instead
  const auto hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(GetHandles_(&dctx));
  // device preallocation is deferred: either the device-side Copy() below provides the
  // data, or SetPreallocation() is called explicitly at the end
  PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
  PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
  // values were requested but the host-side duplicate skipped them: copy on the device
  if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
  // allocate device memory if no copy path did so already
  if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Fill A with random values. A PETSCDEVICERAND() generator fills the device buffer
// directly (column-by-column when the storage is strided); any other generator is routed
// through the host implementation.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
{
  PetscBool device;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
  if (device) {
    const auto m = A->rmap->n;
    const auto n = A->cmap->n;
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    {
      const auto a = DeviceArrayWrite(dctx, A);
      PetscInt   lda;

      PetscCall(MatDenseGetLDA(A, &lda));
      if (lda > m) {
        // strided: fill one column (m entries) at a time, skipping the padding
        for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
      } else {
        PetscInt mn;

        // contiguous: fill all entries in one call; PetscIntMultError() guards m*n overflow
        PetscCall(PetscIntMultError(m, n, &mn));
        PetscCall(PetscRandomGetValues(rng, mn, a));
      }
    }
  } else {
    PetscCall(MatSetRandom_SeqDense(A, rng));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// Copy column `col` of A into the vector v, choosing the memory space that holds the
// freshest copy of A's data so unnecessary host<->device transfers are avoided.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
{
  const auto offloadmask = A->offloadmask;
  const auto n           = A->rmap->n; // column length = number of rows
  const auto col_offset  = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
  PetscBool          viscupm;
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
  PetscCall(GetHandles_(&dctx, &stream));
  if (viscupm && !v->boundtocpu) {
    // v is a device vector not bound to the CPU: write straight into its device buffer
    const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);

    // update device data, pulling the column from wherever A is current
    if (PetscOffloadDevice(offloadmask)) {
      PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
    } else {
      PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
    }
  } else {
    PetscScalar *x;

    // update host data, pulling the column from wherever A is current
    PetscCall(VecGetArrayWrite(v, &x));
    if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
      PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
    } else if (PetscOffloadDevice(offloadmask)) {
      PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
    }
    PetscCall(VecRestoreArrayWrite(v, &x));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Expose column `col` of A as the internal vector mimpl->cvec without copying data (the
// vector's device array is placed directly over the column). Must be paired with
// RestoreColumnVec(); the access template parameter selects read/write/read-write.
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
{
  using namespace vec::cupm;
  const auto         mimpl = MatIMPLCast(A);
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  mimpl->vecinuse = col + 1; // stored 1-based so that 0 means "not in use"
  if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
  PetscCall(GetHandles_(&dctx));
  PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse),
                                                   dctx));
  // point cvec's device array at the first entry of column `col`
  PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
  if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
  *v = mimpl->cvec;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Release the column vector obtained from GetColumnVec(): pop the read-lock (for read
// access), undo the array placement, and hand the device array back to the matrix.
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
{
  using namespace vec::cupm;
  const auto mimpl = MatIMPLCast(A);
  const auto cvec  = mimpl->cvec;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
  PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
  mimpl->vecinuse = 0;
  if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
  PetscCall(VecCUPMResetArrayAsync<T>(cvec));
  PetscCall(GetHandles_(&dctx));
  PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
  if (v) *v = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// Create an (unallocated) factor matrix of the requested type for A, wiring up the
// symbolic factorization entry points this backend supports (LU/ILU, Cholesky/ICC, QR).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
{
  Mat                fact = nullptr;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(GetHandles_(&dctx));
  PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
  fact->factortype = ftype;
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU: // fall-through
    fact->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
    fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC: // fall-through
    fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
    break;
  case MAT_FACTOR_QR: {
    // QR has no dedicated MatOps slot; it is dispatched via composed functions instead
    const auto pobj = PetscObjectCast(fact);

    PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
    PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
  } break;
  case MAT_FACTOR_NONE:
  case MAT_FACTOR_ILUDT:     // fall-through
  case MAT_FACTOR_NUM_TYPES: // fall-through
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
  }
  PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
  // advertise MATORDERINGEXTERNAL as the preferred ordering for every supported factor type
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
  *fact_out = fact;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Overwrite A (holding a Cholesky factorization) with the explicit inverse, computed in
// place on the device via cupmSolverXpotri(), then clear the factorization state so A
// behaves as a plain matrix again.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);
  const auto n     = static_cast<cupmBlasInt_t>(A->cmap->n);
  cupmSolverHandle_t handle;
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
#if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
  // potri requires CUDA >= 10.1 (checked below); HIP appears to have this by default??
  PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
#endif
  if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS); // empty matrix: nothing to invert
  PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
  // spd only: stored pivots would indicate a symmetric-indefinite (sytrf-style)
  // factorization, for which no device sytri is wired up
  PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());

  PetscCall(GetHandles_(&dctx, &handle, &stream));
  {
    const auto    da  = DeviceArrayReadWrite(dctx, A);
    const auto    lda = static_cast<cupmBlasInt_t>(mimpl->lda);
    cupmBlasInt_t il;

    // grow the cached device workspace if potri needs more than was previously allocated
    PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
    if (il > mcu->d_fact_lwork) {
      mcu->d_fact_lwork = il;
      PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
      PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
    }
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
    PetscCall(PetscLogGpuTimeEnd());
  }
  PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
  // potri was run on the lower triangle only; mirror it into the upper triangle
  // TODO (write cuda kernel)
  PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
  PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

  // A now holds the explicit inverse: drop the factored-matrix solve interface
  A->ops->solve          = nullptr;
  A->ops->solvetranspose = nullptr;
  A->ops->matsolve       = nullptr;
  A->factortype          = MAT_FACTOR_NONE;

  PetscCall(PetscFree(A->solvertype));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// Expose the dense block [rbegin, rend) x [cbegin, cend) of A as the internal matrix
// mimpl->cmat without copying data. Must be paired with RestoreSubMatrix().
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend,
                                                          Mat *mat) noexcept
{
  const auto mimpl        = MatIMPLCast(A);
  const auto array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
  const auto n            = rend - rbegin; // rows of the submatrix
  const auto m            = cend - cbegin; // columns of the submatrix
  auto      &cmat         = mimpl->cmat;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  mimpl->matinuse = cbegin + 1; // stored 1-based so that 0 means "not in use"

  PetscCall(GetHandles_(&dctx));
  PetscCall(HostToDevice_(A, dctx));

  // the cached submatrix is only reusable if the requested dimensions match
  if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
  {
    const auto device_array = array_offset(MatCUPMCast(A)->d_v);

    if (cmat) {
      PetscCall(PlaceArray(cmat, device_array));
    } else {
      PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
    }
  }
  PetscCall(MatDenseSetLDA(cmat, mimpl->lda)); // the submatrix shares A's leading dimension
  // place CPU array if present but do not copy any data; the mask is set to GPU first,
  // presumably so MatDensePlaceArray() skips any copy -- it is overwritten just below
  if (const auto host_array = mimpl->v) {
    cmat->offloadmask = PETSC_OFFLOAD_GPU;
    PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
  }

  cmat->offloadmask = A->offloadmask;
  *mat              = cmat;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Return the submatrix obtained from GetSubMatrix(), reconciling A's offload state with
// any writes made through the submatrix (which may require allocating A's host buffer).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto cmat  = mimpl->cmat;
  const auto reset = static_cast<bool>(mimpl->v); // was a host array placed in GetSubMatrix()?
  bool       copy, was_offload_host;

  PetscFunctionBegin;
  PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
  PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
  PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
  mimpl->matinuse = 0;

  // calls to ResetArray may change it, so save it here
  was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
  if (was_offload_host && !reset) {
    // submatrix data is current on the host but A has no host array: allocate one so the
    // DeviceToHost_() sync below has somewhere to land
    copy = true;
    PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
  } else {
    copy = false;
  }

  PetscCall(ResetArray(cmat));
  if (reset) PetscCall(MatDenseResetArray(cmat));
  if (copy) {
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    PetscCall(DeviceToHost_(A, dctx));
  } else {
    // no sync needed: just record where the freshest data now lives
    A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
  }

  cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  *m = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

namespace
{

// Dispatch helper mapping the runtime transpose flags (TA, TB) onto the compile-time
// template parameters of MatMatMult_Numeric_Dispatch()
template <device::cupm::DeviceType T>
inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
{
  PetscFunctionBegin;
  if (TA) {
    if (TB) {
      PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
    } else {
      PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
    }
  } else {
    if (TB) {
      PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
    } else {
      PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Register this backend's factorizations (LU, Cholesky, QR) with the MatSolverType
// registry, for both host (MATSEQDENSE) and device (MATSEQDENSECUPM) matrix types
template <device::cupm::DeviceType T>
inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
{
  PetscFunctionBegin;
  for (auto ftype : util::make_array(MAT_FACTOR_LU,
                                     MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
    // same GetFactor entry point serves both the host and device dense matrix types
    PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
    PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

} // anonymous namespace

} // namespace impl

} // namespace cupm

} // namespace mat

} // namespace Petsc