#pragma once

#include <petsc/private/matdensecupmimpl.h> /*I <petscmat.h> I*/
#include <../src/mat/impls/dense/seq/dense.h>

#include <petsc/private/deviceimpl.h> // PetscDeviceContextGetOptionalNullContext_Internal()
#include <petsc/private/randomimpl.h> // _p_PetscRandom
#include <petsc/private/vecimpl.h>    // _p_Vec
#include <petsc/private/cupmobject.hpp>
#include <petsc/private/cupmsolverinterface.hpp>

#include <petsc/private/cpp/type_traits.hpp> // PetscObjectCast()
#include <petsc/private/cpp/utility.hpp>     // util::exchange()

#include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp> // for VecSeq_CUPM

namespace Petsc
{

namespace mat
{

namespace cupm
{

namespace impl
{

// Sequential dense matrix whose entries live on a CUPM (CUDA or HIP) device. CRTP: the shared
// MatDense_CUPM base is instantiated with this class as the derived type, and the
// MATDENSECUPM_HEADER macro pulls the common machinery (handles, array helpers, type names)
// into scope.
template <device::cupm::DeviceType T>
class MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
public:
  MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);

private:
  // Device-side state attached to each MATSEQDENSECUPM matrix
  struct Mat_SeqDenseCUPM {
    PetscScalar *d_v;           // pointer to the matrix on the GPU
    PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
    bool         d_user_alloc;  // d_v was supplied by the user and must never be freed here
    bool         d_unplaced_user_alloc;
    // factorization support
    cupmBlasInt_t *d_fact_ipiv;  // device pivots
    cupmScalar_t  *d_fact_tau;   // device QR tau vector
    cupmBlasInt_t *d_fact_info;  // device info
    cupmScalar_t  *d_fact_work;  // device workspace
    cupmBlasInt_t  d_fact_lwork; // size of device workspace
    // workspace
    Vec workvec;
  };

  // (Re)allocate -- or adopt a user-provided -- device buffer for the entries
  static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;

  // offload management: synchronize the host/device copies of the entries
  static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
  static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;

  // debug-only: copy the device-side cupmSolver "info" result back and error on failure
  static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;

  // per-factorization-type kernels, selected via the dispatch functions below
  template <typename Derived>
  struct SolveCommon;
  struct SolveQR;
  struct SolveCholesky;
  struct SolveLU;

  template <typename Solver, bool transpose>
  static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
  template <typename Solver, bool transpose>
  static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
  template <bool transpose>
  static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;

  template <bool to_host>
  static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;

  PETSC_NODISCARD static constexpr MatType MATIMPLCUPM_() noexcept;
  PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;

public:
  PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;

  // define these by hand since they don't fit the above mold
  PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
  PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;

  static PetscErrorCode Create(Mat) noexcept;
  static PetscErrorCode Destroy(Mat) noexcept;
  static PetscErrorCode SetUp(Mat) noexcept;
  static PetscErrorCode Reset(Mat) noexcept;

  static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
  static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
  static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;

  template <PetscMemType, PetscMemoryAccessMode>
  static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
  template <PetscMemType, PetscMemoryAccessMode>
  static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;

private:
  // "C"-style wrappers over the accessors above; they acquire a device context themselves so
  // they can be installed directly into Mat function tables
  template <PetscMemType mtype, PetscMemoryAccessMode mode>
  static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(GetArray<mtype, mode>(m, p, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  template <PetscMemType mtype, PetscMemoryAccessMode mode>
  static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  template <PetscMemoryAccessMode mode>
  static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  template <PetscMemoryAccessMode mode>
  static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

public:
  static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
  static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
  static PetscErrorCode ResetArray(Mat) noexcept;

  template <bool transpose_A, bool transpose_B>
  static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
  static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
  static PetscErrorCode ZeroEntries(Mat) noexcept;
  static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
  static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
  static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
  static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;

  static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;

  static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
  static PetscErrorCode InvertFactors(Mat) noexcept;

  static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
  static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
};

} // namespace impl

namespace
{

// Declare this here so that the functions below can make use of it.
// Creates an m x n MATSEQDENSECUPM matrix (lda = m, see the m, n, m, n arguments below),
// optionally wrapping a caller-supplied device array `data`.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
{
  PetscFunctionBegin;
  PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
  PetscFunctionReturn(PETSC_SUCCESS);
}

} // anonymous namespace

namespace impl
{

// ==========================================================================================
// MatDense_Seq_CUPM - Private API - Utility
// ==========================================================================================

// Attach a device buffer for the matrix entries. A non-null user_device_array is adopted
// as-is (and flagged so it is never freed here); otherwise a zero-initialized lda x ncols
// buffer is allocated asynchronously on the context's stream. On return the device holds the
// authoritative copy (offloadmask = PETSC_OFFLOAD_GPU).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
{
  const auto   mcu   = MatCUPMCast(m);
  const auto   nrows = m->rmap->n;
  const auto   ncols = m->cmap->n;
  auto        &lda   = MatIMPLCast(m)->lda;
  cupmStream_t stream;

  PetscFunctionBegin;
  PetscCheckTypeName(m, MATSEQDENSECUPM());
  PetscValidDeviceContext(dctx,
2);
  PetscCall(checkCupmBlasIntCast(nrows));
  PetscCall(checkCupmBlasIntCast(ncols));
  PetscCall(GetHandlesFrom_(dctx, &stream));
  // a freshly created matrix has lda unset; default to a tightly packed column-major layout
  if (lda <= 0) lda = nrows;
  // free only buffers we allocated ourselves; user-provided buffers are never freed here
  if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
  if (user_device_array) {
    mcu->d_user_alloc = PETSC_TRUE;
    mcu->d_v          = user_device_array;
  } else {
    PetscInt size;

    mcu->d_user_alloc = PETSC_FALSE;
    // overflow-checked lda * ncols
    PetscCall(PetscIntMultError(lda, ncols, &size));
    PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
    PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
  }
  // device now holds the authoritative copy of the entries
  m->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Ensure the device copy of the entries is current, copying host -> device when the host
// holds the only valid copy. No-op for matrices bound to the CPU.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
{
  const auto nrows = m->rmap->n;
  const auto ncols = m->cmap->n;
  const auto copy  = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;

  PetscFunctionBegin;
  PetscCheckTypeName(m, MATSEQDENSECUPM());
  if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
  if (copy) {
    const auto   mcu = MatCUPMCast(m);
    cupmStream_t stream;

    // Allocate GPU memory if not present
    if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr));
    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
    {
      const auto mimpl = MatIMPLCast(m);
      const auto lda   = mimpl->lda;
      // NOTE(review): assumes mimpl->v is valid in the PETSC_OFFLOAD_UNALLOCATED case as
      // well -- confirm against the callers that set that mask
      const auto src   = mimpl->v;
      const auto dest  = mcu->d_v;

      // padded layout (lda > nrows) needs a strided 2-D copy; packed layout can go in one shot
      if (lda > nrows) {
        PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
      } else {
        PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
      }
    }
    PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
    // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
    m->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Ensure the host copy of the entries is current, copying device -> host when the device
// holds the only valid copy.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
{
  const auto nrows = m->rmap->n;
  const auto ncols = m->cmap->n;
  const auto copy  = m->offloadmask == PETSC_OFFLOAD_GPU;

  PetscFunctionBegin;
  PetscCheckTypeName(m, MATSEQDENSECUPM());
  PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
  if (copy) {
    const auto   mimpl = MatIMPLCast(m);
    cupmStream_t stream;

    // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
    if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
    {
      const auto lda  = mimpl->lda;
      const auto dest = mimpl->v;
      const auto src  = MatCUPMCast(m)->d_v;

      // padded layout needs a strided 2-D copy; packed layout can go in one shot
      if (lda > nrows) {
        PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
      } else {
        PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
      }
    }
    PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
    // order is important, MatSeqDenseSetPreallocation() might set offloadmask
    m->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Debug-build check of the "info" output of a cupmSolver routine: info > 0 indicates a zero
// pivot, info < 0 a bad argument. Synchronizes the stream to read the device-side value; a
// no-op in optimized builds.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
{
  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    cupmBlasInt_t info = 0;

    PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
    if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
    static_assert(std::is_same<decltype(info), int>::value, "");
    PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
    PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================
// MatDense_Seq_CUPM - Private API - Solver Dispatch
// ==========================================================================================

// specific solvers called through the dispatch_() family of functions
template <device::cupm::DeviceType T>
template <typename Derived>
struct
MatDense_Seq_CUPM<T>::SolveCommon {
  using derived_type = Derived;

  // Grow the shared device workspace to at least the size reported by the provided
  // buffer-size query (a callable wrapping the appropriate cupmSolver*_bufferSize routine).
  // Shrinking never happens; the workspace is reused across factor/solve calls.
  template <typename F>
  static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
  {
    cupmBlasInt_t lwork;

    PetscFunctionBegin;
    PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
    if (lwork > mcu->d_fact_lwork) {
      mcu->d_fact_lwork = lwork;
      PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
      PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Common pre-factorization setup: install the derived solver's dispatch routines into the
  // Mat function table, record the factor/solver type, and ensure the device "info" slot
  // exists.
  static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
  {
    const auto mcu = MatCUPMCast(A);

    PetscFunctionBegin;
    PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
    A->factortype             = derived_type::MATFACTORTYPE();
    A->ops->solve             = MatSolve_Factored_Dispatch_<derived_type, false>;
    A->ops->solvetranspose    = MatSolve_Factored_Dispatch_<derived_type, true>;
    A->ops->matsolve          = MatMatSolve_Factored_Dispatch_<derived_type, false>;
    A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;

    PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
    if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};

// LU factorization (Xgetrf) and triangular solves (Xgetrs) with partial pivoting
template <device::cupm::DeviceType T>
struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
  using base_type = SolveCommon<SolveLU>;

  static constexpr const char *NAME() noexcept { return "LU"; }
  static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }

  // In-place LU factorization of A on the device (IS/MatFactorInfo arguments are unused)
  static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
  {
    const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
    const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
    cupmStream_t       stream;
    cupmSolverHandle_t handle;
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
    PetscCall(GetHandles_(&dctx, &handle, &stream));
    PetscCall(base_type::FactorPrepare(A, stream));
    {
      const auto mcu = MatCUPMCast(A);
      const auto da  = DeviceArrayReadWrite(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *fact_lwork)
          {
            return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
          }
        )
      );
      // clang-format on
      if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));

      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
    }
    PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Solve (op(A)) X = B in place on x (ldx x nrhs) using the stored LU factors
  template <bool transpose>
  static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
  {
    const auto         mcu       = MatCUPMCast(A);
    const auto         fact_info = mcu->d_fact_info;
    const auto         fact_ipiv = mcu->d_fact_ipiv;
    cupmSolverHandle_t handle;

    PetscFunctionBegin;
    PetscCall(GetHandlesFrom_(dctx, &handle));
    PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
    PetscCall(PetscLogGpuTimeBegin());
    {
      constexpr auto op  = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
      const auto     da  = DeviceArrayRead(dctx, A);
      const auto     lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *lwork)
          {
            return cupmSolverXgetrs_bufferSize(
              handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
            );
          }
        )
      );
      // clang-format on
      PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
      PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};

// Cholesky factorization (Xpotrf) and solves (Xpotrs); restricted to SPD matrices because
// the cupmSolver libraries lack *sytrs/*hetr* (see the disabled block below)
template <device::cupm::DeviceType T>
struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
  using base_type = SolveCommon<SolveCholesky>;

  static constexpr const char *NAME() noexcept { return "Cholesky"; }
  static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }

  // In-place Cholesky factorization of the lower triangle of A on the device
  static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
  {
    const auto         n = static_cast<cupmBlasInt_t>(A->rmap->n);
    PetscDeviceContext dctx;
    cupmSolverHandle_t handle;
    cupmStream_t       stream;

    PetscFunctionBegin;
    if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
    PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable.
Use MAT_FACTOR_LU", cupmSolverName());
    PetscCall(GetHandles_(&dctx, &handle, &stream));
    PetscCall(base_type::FactorPrepare(A, stream));
    {
      const auto mcu = MatCUPMCast(A);
      const auto da  = DeviceArrayReadWrite(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *fact_lwork)
          {
            return cupmSolverXpotrf_bufferSize(
              handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
            );
          }
        )
      );
      // clang-format on
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
    }
    PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

#if 0
    // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
    // and *hetr* routines. The code below should work, and it can be activated when *sytrs
    // routines will be available
    if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
    if (!mcu->d_fact_lwork) {
      PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
      PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
    }
    // NOTE(review): this guard looks inverted relative to FactorPrepare()'s
    // "if (!mcu->d_fact_info)" -- confirm before ever enabling this disabled block
    if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Solve A X = B in place on x using the stored Cholesky factor. The transpose template
  // parameter is unused here: the factorization is only built when A->spd is asserted true
  // (see Factor above), in which case the transpose solve presumably coincides -- confirm for
  // the complex/Hermitian case.
  template <bool transpose>
  static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
  {
    const auto         mcu       = MatCUPMCast(A);
    const auto         fact_info = mcu->d_fact_info;
    cupmSolverHandle_t handle;

    PetscFunctionBegin;
    // d_fact_ipiv is only populated by the (disabled) sytrf path above
    PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
    PetscCall(GetHandlesFrom_(dctx, &handle));
    PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
    PetscCall(PetscLogGpuTimeBegin());
    {
      const auto da  = DeviceArrayRead(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *lwork)
          {
            return cupmSolverXpotrs_bufferSize(
              handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
            );
          }
        )
      );
      // clang-format on
      PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
      PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};

// QR factorization (Xgeqrf) with least-squares style solves via Xormqr + triangular solve
template <device::cupm::DeviceType T>
struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
  using base_type = SolveCommon<SolveQR>;

  static constexpr const char *NAME() noexcept { return "QR"; }
  static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }

  // In-place QR factorization of A on the device; assumes full rank (rank := min(m, n))
  static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
  {
    const auto         m     = static_cast<cupmBlasInt_t>(A->rmap->n);
    const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
    const auto         min   = std::min(m, n);
    const auto         mimpl = MatIMPLCast(A);
    cupmStream_t       stream;
    cupmSolverHandle_t handle;
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
    PetscCall(GetHandles_(&dctx, &handle, &stream));
    PetscCall(base_type::FactorPrepare(A, stream));
    mimpl->rank = min;
    {
      const auto mcu = MatCUPMCast(A);
      const auto da  = DeviceArrayReadWrite(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);

      if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
      if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *fact_lwork)
          {
            return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
          }
        )
      );
      // clang-format on
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
    }
    PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Solve with the stored QR factors, in place on x:
  //   transpose == false:  x <- R^{-1} (Q^H x)   (least-squares style solve)
  //   transpose == true:   x <- Q (R^{-T} x)
  template <bool transpose>
  static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
  {
    const auto         mimpl      = MatIMPLCast(A);
    const auto         rank       = static_cast<cupmBlasInt_t>(mimpl->rank);
    const auto         mcu        = MatCUPMCast(A);
    const auto         fact_info  = mcu->d_fact_info;
    const auto         fact_tau   = mcu->d_fact_tau;
    const auto         fact_work  = mcu->d_fact_work;
    const auto         fact_lwork = mcu->d_fact_lwork;
    cupmSolverHandle_t solver_handle;
    cupmBlasHandle_t   blas_handle;

    PetscFunctionBegin;
    PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
    PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
    PetscCall(PetscLogGpuTimeBegin());
    {
      const auto da  = DeviceArrayRead(dctx, A);
      const auto one = cupmScalarCast(1.0);
      const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);

      if (transpose) {
        PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
        PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
        PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
      } else {
        // apply Q^H (conjugate transpose for complex scalars)
        constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;

        PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
        PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
        PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
      }
    }
    PetscCall(PetscLogGpuTimeEnd());
    // NOTE(review): the other solvers log via PetscLogGpuFlops() -- confirm whether
    // PetscLogFlops() here is intentional
    PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};

// MatSolve() entry point for factored MATSEQDENSECUPM matrices: copies the right-hand side
// into a device-resident work array (or directly into y when it can hold the intermediate),
// runs the Solver's device kernel, then places the result where y expects it.
template <device::cupm::DeviceType T>
template <typename Solver, bool transpose>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
{
  using namespace vec::cupm;
  const auto         pobj_A  = PetscObjectCast(A);
  const auto         m       = static_cast<cupmBlasInt_t>(A->rmap->n);
  const auto         k       = static_cast<cupmBlasInt_t>(A->cmap->n);
  auto              &workvec = MatCUPMCast(A)->workvec;
  PetscScalar       *y_array = nullptr;
  PetscDeviceContext dctx;
  PetscBool          xiscupm, yiscupm, aiscupm;
  bool               use_y_array_directly;
  cupmStream_t       stream;

  PetscFunctionBegin;
  PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
  PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
  PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
  PetscCall(GetHandles_(&dctx, &stream));
  // y can hold the intermediate only if it is device-resident and long enough (k >= m)
  use_y_array_directly = yiscupm && (k >= m);
  {
    const PetscScalar *x_array;
    const auto         xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
    const auto         copy_mode
= xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;

    if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
    // The logic here is to try to minimize the amount of memory copying:
    //
    // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
    // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
    // data in order to copy it into the y array. So the array x will be wherever the data
    // already is so that only one memcpy is performed
    if (xisdevice) {
      PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
    } else {
      PetscCall(VecGetArrayRead(x, &x_array));
    }
    PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
    PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
    if (xisdevice) {
      PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
    } else {
      PetscCall(VecRestoreArrayRead(x, &x_array));
    }
  }

  // aiscupm is asserted true above, so these converts are normally no-ops
  if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
  // in-place device solve on y_array (m rows, single right-hand side)
  PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
  if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));

  if (use_y_array_directly) {
    PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
  } else {
    const auto   copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
    PetscScalar *yv;

    // The logic here is that the data is not yet in either y's GPU array or its CPU array.
    // There is nothing in the interface to say where the user would like it to end up. So we
    // choose the GPU, because it is the faster option
    if (yiscupm) {
      PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
    } else {
      PetscCall(VecGetArray(y, &yv));
    }
    PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
    if (yiscupm) {
      PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
    } else {
      PetscCall(VecRestoreArray(y, &yv));
    }
    // NOTE(review): unlike the calls above, no dctx is passed here -- presumably the device
    // context parameter is defaulted; confirm against VecCUPMRestoreArrayWriteAsync
    PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// MatMatSolve() entry point for factored MATSEQDENSECUPM matrices: stages the dense
// right-hand-side block B into a device array (X's own array when it is large enough and
// device-resident), runs the Solver kernel for all columns at once, then copies the result
// into X.
template <device::cupm::DeviceType T>
template <typename Solver, bool transpose>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
{
  const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
  const auto         k = static_cast<cupmBlasInt_t>(A->cmap->n);
  cupmBlasInt_t      nrhs, ldb, ldx, ldy;
  PetscScalar       *y;
  PetscBool          biscupm, xiscupm, aiscupm;
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
  PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
  PetscCall(GetHandles_(&dctx, &stream));
  {
    PetscInt n;

    // number of right-hand sides and the leading dimensions of B and X
    PetscCall(MatGetSize(B, nullptr, &n));
    PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
    PetscCall(MatDenseGetLDA(B, &n));
    PetscCall(PetscCUPMBlasIntCast(n, &ldb));
    PetscCall(MatDenseGetLDA(X, &n));
    PetscCall(PetscCUPMBlasIntCast(n, &ldx));
  }
  {
    // The logic here is to try to minimize the amount of memory copying:
    //
    // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
    // offloaded to the GPU yet, then the data is copied to
the GPU. But we are only trying to 745 // get the data in order to copy it into the y array. So the array b will be wherever the 746 // data already is so that only one memcpy is performed 747 const auto bisdevice = biscupm && PetscOffloadDevice(B->offloadmask); 748 const auto copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice; 749 const PetscScalar *b; 750 751 if (bisdevice) { 752 b = DeviceArrayRead(dctx, B); 753 } else if (biscupm) { 754 b = HostArrayRead(dctx, B); 755 } else { 756 PetscCall(MatDenseGetArrayRead(B, &b)); 757 } 758 759 if (ldx < m || !xiscupm) { 760 // X's array cannot serve as the array (too small or not on device), B's array cannot 761 // serve as the array (const), so allocate a new array 762 ldy = m; 763 PetscCall(PetscCUPMMallocAsync(&y, nrhs * m)); 764 } else { 765 // X's array should serve as the array 766 ldy = ldx; 767 y = DeviceArrayWrite(dctx, X); 768 } 769 PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream)); 770 if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b)); 771 } 772 773 // convert to CUPM twice?????????????????????????????????? 774 // but A should already be CUPM?????????????????????????????????????? 775 if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 776 PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream)); 777 if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 778 779 if (ldx < m || !xiscupm) { 780 const auto copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost; 781 PetscScalar *x; 782 783 // The logic here is that the data is not yet in either X's GPU array or its CPU 784 // array. There is nothing in the interface to say where the user would like it to end up. 
785 // So we choose the GPU, because it is the faster option 786 if (xiscupm) { 787 x = DeviceArrayWrite(dctx, X); 788 } else { 789 PetscCall(MatDenseGetArray(X, &x)); 790 } 791 PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream)); 792 if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x)); 793 PetscCallCUPM(cupmFreeAsync(y, stream)); 794 } 795 PetscFunctionReturn(PETSC_SUCCESS); 796 } 797 798 template <device::cupm::DeviceType T> 799 template <bool transpose> 800 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept 801 { 802 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 803 const auto n = static_cast<cupmBlasInt_t>(A->cmap->n); 804 cupmBlasHandle_t handle; 805 PetscDeviceContext dctx; 806 807 PetscFunctionBegin; 808 if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add 809 if (!m || !n) { 810 // mult only 811 if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0)); 812 PetscFunctionReturn(PETSC_SUCCESS); 813 } 814 PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n)); 815 PetscCall(GetHandles_(&dctx, &handle)); 816 { 817 constexpr auto op = transpose ? CUPMBLAS_OP_T : CUPMBLAS_OP_N; 818 const auto one = cupmScalarCast(1.0); 819 const auto zero = cupmScalarCast(0.0); 820 const auto da = DeviceArrayRead(dctx, A); 821 const auto dxx = VecSeq_CUPM::DeviceArrayRead(dctx, xx); 822 const auto dzz = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz); 823 824 PetscCall(PetscLogGpuTimeBegin()); 825 PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata(), static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda), dxx.cupmdata(), 1, (yy ? &one : &zero), dzz.cupmdata(), 1)); 826 PetscCall(PetscLogGpuTimeEnd()); 827 } 828 PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 
0 : m)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================
// MatDense_Seq_CUPM - Private API - Conversion Dispatch
// ==========================================================================================

// Shared body of MatConvert() between MATSEQDENSE and MATSEQDENSECUPM; direction is the
// to_host template parameter. Only the MAT_INPLACE_MATRIX path is handled specially: it
// swaps the type name, default vec type, composed functions, and function-pointer ops of B
// in place.
template <device::cupm::DeviceType T>
template <bool to_host>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    // TODO these cases should be optimized
    PetscCall(MatConvert_Basic(M, type, reuse, newmat));
  } else {
    // MAT_INPLACE_MATRIX: retype B = *newmat (== M) in place
    const auto B    = *newmat;
    const auto pobj = PetscObjectCast(B);

    if (to_host) {
      // sync data back to the CPU, then tear down the device-side state
      PetscCall(BindToCPU(B, PETSC_TRUE));
      PetscCall(Reset(B));
    } else {
      PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
    }

    PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
    PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
    // cvec might be the wrong VecType, destroy and rebuild it if necessary
    // REVIEW ME: this is possibly very inefficient
    PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));

    // MatComposeOp_CUPM installs the device implementation when converting to device and
    // removes it (nullptr) when converting to host
    MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
    MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);

    if (to_host) {
      B->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      // attach the device-side bookkeeping struct; actual device memory is allocated lazily
      Mat_SeqDenseCUPM *mcu;

      PetscCall(PetscNew(&mcu));
      B->spptr       = mcu;
      B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
      PetscCall(BindToCPU(B, PETSC_FALSE));
    }

    MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
    MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================
// MatDense_Seq_CUPM - Public API
// ==========================================================================================

// The MatType string this implementation class backs
template <device::cupm::DeviceType T>
inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
{
  return MATSEQDENSECUPM();
}

// Device-side bookkeeping struct stored in Mat::spptr
template <device::cupm::DeviceType T>
inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
{
  return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
}

// Host-side SeqDense struct stored in Mat::data
template <device::cupm::DeviceType T>
inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
{
  return static_cast<Mat_SeqDense *>(m->data);
}

// Backend-specific composed-function name for the device->host MatConvert
template <device::cupm::DeviceType T>
inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
}

// Backend-specific composed-function name for the AIJ x Dense product setup
template <device::cupm::DeviceType T>
inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
{
  return T == device::cupm::DeviceType::CUDA ?
"MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
}

// ==========================================================================================

// MatCreate_SeqDenseCUPM()
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
  // build as a plain SeqDense first, then retype in place to the device variant
  PetscCall(MatCreate_SeqDense(A));
  PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// obj->ops->destroy(): demote to plain SeqDense (releasing device resources), then destroy
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
{
  PetscFunctionBegin;
  // prevent copying back data if we own the data pointer
  if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
  PetscCall(MatDestroy_SeqDense(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// obj->ops->setup()
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
{
  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  if (!A->preallocated) {
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    PetscCall(SetPreallocation(A, dctx, nullptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Free all device-side resources (matrix storage, factorization workspaces, work vector)
// and the bookkeeping struct itself; no-op if the struct was never attached
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
{
  PetscFunctionBegin;
  if (const auto mcu = MatCUPMCast(A)) {
    cupmStream_t stream;

    PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
    PetscCall(GetHandles_(&stream));
    // only free the matrix storage if we allocated it; frees are stream-ordered
    if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
    PetscCall(VecDestroy(&mcu->workvec));
    PetscCall(PetscFree(A->spptr /* mcu */));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// Switch the matrix between the host (to_host == PETSC_TRUE) and device implementations by
// swapping every composed function and function-pointer op between the SeqDense and CUPM
// variants; also syncs data to the CPU when binding to host
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto pobj  = PetscObjectCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  A->boundtocpu = to_host;
  PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
  if (to_host) {
    PetscDeviceContext dctx;

    // make sure we have an up-to-date copy on the CPU
    PetscCall(GetHandles_(&dctx));
    PetscCall(DeviceToHost_(A, dctx));
  } else {
    PetscBool iscupm;

    // cached column vec/submat may be of the host type; drop them so they get rebuilt
    if (auto &cvec = mimpl->cvec) {
      PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
      if (!iscupm) PetscCall(VecDestroy(&cvec));
    }
    if (auto &cmat = mimpl->cmat) {
      PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
      if (!iscupm) PetscCall(MatDestroy(&cmat));
    }
  }

  // ============================================================
  // Composed ops
  // ============================================================
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
  MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
  // always the same
  PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));

  // ============================================================
  // Function pointer ops
  // ============================================================
  MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
  MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false>(A, xx, nullptr, yy); });
  MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true>(A, xx, nullptr, yy); });
  MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false>);
  MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true>);
  MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
  MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
  MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
  MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
  MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
  MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
  MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
  MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
  MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
  MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
  MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
  MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
  MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
  MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal);
  // seemingly always the same
  A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;

  // propagate the binding to the cached submatrix, if any
  if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
  PetscFunctionReturn(PETSC_SUCCESS);
}

template <device::cupm::DeviceType T>
inline PetscErrorCode
MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  // device -> host direction of the shared conversion dispatch
  PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  // host -> device direction of the shared conversion dispatch
  PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// Return the host or device array of m (chosen by mtype), syncing or lazily allocating as
// dictated by the access mode; WRITE access moves the offload mask to the returned side
template <device::cupm::DeviceType T>
template <PetscMemType mtype, PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
{
  constexpr auto hostmem     = PetscMemTypeHost(mtype);
  constexpr auto read_access = PetscMemoryAccessRead(access);

  PetscFunctionBegin;
  static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
  PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
  if (hostmem) {
    if (read_access) {
      // caller will read: make sure the CPU copy is current
      PetscCall(DeviceToHost_(m, dctx));
    } else if (!MatIMPLCast(m)->v) {
      // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
      PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
    }
    *array = MatIMPLCast(m)->v;
  } else {
    if (read_access) {
      PetscCall(HostToDevice_(m, dctx));
    } else if (!MatCUPMCast(m)->d_v) {
      // write-only
      PetscCall(SetPreallocation(m, dctx, nullptr));
    }
    *array = MatCUPMCast(m)->d_v;
  }
  if (PetscMemoryAccessWrite(access)) {
    // the returned side is about to be mutated and becomes the authoritative copy
    m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
    PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Counterpart of GetArray(); nulls the caller's pointer and, for write access, re-stamps the
// offload mask and object state
template <device::cupm::DeviceType T>
template <PetscMemType mtype, PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
{
  PetscFunctionBegin;
  static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
  if (PetscMemoryAccessWrite(access)) {
    // WRITE or READ_WRITE
    m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
    PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
  }
  if (array) {
    PetscCall(CheckPointerMatchesMemType_(*array, mtype));
    *array = nullptr;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Device-array access that also reports the memory type (always the device's)
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
  if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
  PetscFunctionReturn(PETSC_SUCCESS);
}

template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// MatDenseCUPMPlaceArray(): temporarily substitute a caller-owned device array for the
// matrix storage, stashing the current array (and its ownership flag) for ResetArray()
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
  if (mimpl->v) {
    PetscDeviceContext dctx;

    // flush any pending host-side data into the array we are about to stash
    PetscCall(GetHandles_(&dctx));
    PetscCall(HostToDevice_(A, dctx));
  }
  mcu->unplacedarray         = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
  mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
  PetscFunctionReturn(PETSC_SUCCESS);
}

// MatDenseCUPMReplaceArray(): permanently replace the device array; the matrix takes
// ownership of the new array (d_user_alloc is cleared so Reset() will free it)
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
  if (!mcu->d_user_alloc) {
    cupmStream_t stream;

    // we own the old array; free it (stream-ordered)
    PetscCall(GetHandles_(&stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
  }
  mcu->d_v          = const_cast<PetscScalar *>(array);
  mcu->d_user_alloc = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// MatDenseCUPMResetArray(): undo PlaceArray(), restoring the stashed array and its
// ownership flag
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  if (mimpl->v) {
    PetscDeviceContext dctx;

    // flush host-side changes into the placed array before swapping it out
    PetscCall(GetHandles_(&dctx));
    PetscCall(HostToDevice_(A, dctx));
  }
  mcu->d_v          = util::exchange(mcu->unplacedarray, nullptr);
  mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// C = op(A) * op(B) numeric phase via gemm on the device; non-CUPM operands are temporarily
// converted in place and restored afterwards
template <device::cupm::DeviceType T>
template <bool transpose_A, bool transpose_B>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
{
  cupmBlasInt_t      m, n, k;
  PetscBool          Aiscupm, Biscupm;
  PetscDeviceContext dctx;
  cupmBlasHandle_t   handle;

  PetscFunctionBegin;
  PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
  PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
  PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
  if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);

  // we may end up with SEQDENSE as one of the arguments
  // REVIEW ME: how? and why is it not B and C????????
1242 PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm)); 1243 PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm)); 1244 if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 1245 if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B)); 1246 PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n)); 1247 PetscCall(GetHandles_(&dctx, &handle)); 1248 1249 PetscCall(PetscLogGpuTimeBegin()); 1250 { 1251 const auto one = cupmScalarCast(1.0); 1252 const auto zero = cupmScalarCast(0.0); 1253 const auto da = DeviceArrayRead(dctx, A); 1254 const auto db = DeviceArrayRead(dctx, B); 1255 const auto dc = DeviceArrayWrite(dctx, C); 1256 PetscInt alda, blda, clda; 1257 1258 PetscCall(MatDenseGetLDA(A, &alda)); 1259 PetscCall(MatDenseGetLDA(B, &blda)); 1260 PetscCall(MatDenseGetLDA(C, &clda)); 1261 PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? 
CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda)); 1262 } 1263 PetscCall(PetscLogGpuTimeEnd()); 1264 1265 PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1))); 1266 if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A)); 1267 if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 1268 PetscFunctionReturn(PETSC_SUCCESS); 1269 } 1270 1271 template <device::cupm::DeviceType T> 1272 inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept 1273 { 1274 const auto m = A->rmap->n; 1275 const auto n = A->cmap->n; 1276 1277 PetscFunctionBegin; 1278 PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)"); 1279 // The two matrices must have the same copy implementation to be eligible for fast copy 1280 if (A->ops->copy == B->ops->copy) { 1281 PetscDeviceContext dctx; 1282 cupmStream_t stream; 1283 1284 PetscCall(GetHandles_(&dctx, &stream)); 1285 PetscCall(PetscLogGpuTimeBegin()); 1286 { 1287 const auto va = DeviceArrayRead(dctx, A); 1288 const auto vb = DeviceArrayWrite(dctx, B); 1289 // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets 1290 // lda! 
1291 const auto lda_a = MatIMPLCast(A)->lda; 1292 const auto lda_b = MatIMPLCast(B)->lda; 1293 1294 if (lda_a > m || lda_b > m) { 1295 PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME()); 1296 PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME()); 1297 PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream)); 1298 } else { 1299 PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream)); 1300 } 1301 } 1302 PetscCall(PetscLogGpuTimeEnd()); 1303 } else { 1304 PetscCall(MatCopy_Basic(A, B, str)); 1305 } 1306 PetscFunctionReturn(PETSC_SUCCESS); 1307 } 1308 1309 template <device::cupm::DeviceType T> 1310 inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept 1311 { 1312 PetscDeviceContext dctx; 1313 cupmStream_t stream; 1314 1315 PetscFunctionBegin; 1316 PetscCall(GetHandles_(&dctx, &stream)); 1317 PetscCall(PetscLogGpuTimeBegin()); 1318 { 1319 const auto va = DeviceArrayWrite(dctx, m); 1320 const auto lda = MatIMPLCast(m)->lda; 1321 const auto ma = m->rmap->n; 1322 const auto na = m->cmap->n; 1323 1324 if (lda > ma) { 1325 PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream)); 1326 } else { 1327 PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream)); 1328 } 1329 } 1330 PetscCall(PetscLogGpuTimeEnd()); 1331 PetscFunctionReturn(PETSC_SUCCESS); 1332 } 1333 1334 namespace detail 1335 { 1336 1337 // ========================================================================================== 1338 // SubMatIndexFunctor 1339 // 1340 // Iterator which permutes a linear index range into matrix indices for am nrows x ncols 1341 // submat 
with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for 1342 // the i'th sequential entry in the matrix. 1343 // ========================================================================================== 1344 template <typename T> 1345 struct SubMatIndexFunctor { 1346 PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); } 1347 1348 PetscInt nrows; 1349 PetscInt ncols; 1350 PetscInt lda; 1351 }; 1352 1353 template <typename Iterator> 1354 struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>> { 1355 using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>>; 1356 1357 using iterator = typename base_type::iterator; 1358 1359 constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept : 1360 base_type{ 1361 std::move(first), std::move(last), {nrows, ncols, lda} 1362 } 1363 { 1364 } 1365 1366 PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); } 1367 }; 1368 1369 namespace 1370 { 1371 1372 template <typename T> 1373 PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept 1374 { 1375 const auto nrows = rend - rstart; 1376 const auto ncols = cend - cstart; 1377 const auto dptr = thrust::device_pointer_cast(ptr); 1378 1379 return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda}; 1380 } 1381 1382 } // namespace 1383 1384 } // namespace detail 1385 1386 template <device::cupm::DeviceType T> 1387 inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept 1388 { 1389 const auto m = A->rmap->n; 1390 const auto n = A->cmap->n; 1391 const 
auto N = m * n;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
  PetscCall(GetHandles_(&dctx));
  {
    const auto da  = DeviceArrayReadWrite(dctx, A);
    const auto lda = MatIMPLCast(A)->lda;

    if (lda > m) {
      // columns are strided (lda > number of rows), so a flat cupmBlasXscal() over the
      // buffer would also scale the padding; scale only the in-matrix entries via thrust
      cupmStream_t stream;

      PetscCall(GetHandlesFrom_(dctx, &stream));
      // clang-format off
      PetscCallThrust(
        const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());

        THRUST_CALL(
          thrust::transform,
          stream,
          sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
          device::cupm::functors::make_times_equals(alpha)
        )
      );
      // clang-format on
    } else {
      // contiguous storage, one BLAS-1 scal over all m * n entries
      const auto       cu_alpha = cupmScalarCast(alpha);
      cupmBlasHandle_t handle;

      PetscCall(GetHandlesFrom_(dctx, &handle));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscLogGpuFlops(N));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Y += alpha * X, performed entirely on the device
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
{
  const auto m_x = X->rmap->n, m_y = Y->rmap->n;
  const auto n_x = X->cmap->n, n_y = Y->cmap->n;
  const auto N   = m_x * n_x;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  // quick return for empty operands or a zero scaling factor
  if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
  PetscCall(GetHandles_(&dctx));
  {
    const auto dx    = DeviceArrayRead(dctx, X);
    const auto dy    = DeviceArrayReadWrite(dctx, Y);
    const auto lda_x = MatIMPLCast(X)->lda;
    const auto lda_y = MatIMPLCast(Y)->lda;

    // NOTE(review): lda_y is compared against m_x rather than m_y; presumably m_x == m_y
    // for a dense AXPY so the two are equivalent — confirm
    if (lda_x > m_x || lda_y > m_x) {
      // at least one operand has strided columns; a flat BLAS axpy would touch padding,
      // so transform only the in-matrix entries via thrust
      cupmStream_t stream;

      PetscCall(GetHandlesFrom_(dctx, &stream));
      // clang-format off
      PetscCallThrust(
        const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
        const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());

        THRUST_CALL(
          thrust::transform,
          stream,
          sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
          device::cupm::functors::make_axpy(alpha)
        );
      );
      // clang-format on
    } else {
      // both operands contiguous, one BLAS-1 axpy over all entries
      const auto       cu_alpha = cupmScalarCast(alpha);
      cupmBlasHandle_t handle;

      PetscCall(GetHandlesFrom_(dctx, &handle));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Duplicate A into a new matrix *B, optionally copying values
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
{
  // if values are requested and the up-to-date data is not on the host, suppress the
  // host-side value copy here; the values are copied device-side below instead
  const auto hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(GetHandles_(&dctx));
  // do not call SetPreallocation() yet, it is done explicitly afterwards only if needed
PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
  PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
  // hopt was downgraded above, so copy values device-to-device here
  if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
  // allocate memory if needed
  if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Fill A with random values. If rng is a device random generator the values are written
// straight into the device array, otherwise the host fallback is used.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
{
  PetscBool device;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
  if (device) {
    const auto m = A->rmap->n;
    const auto n = A->cmap->n;
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    {
      const auto a = DeviceArrayWrite(dctx, A);
      PetscInt   lda;

      PetscCall(MatDenseGetLDA(A, &lda));
      if (lda > m) {
        // strided columns: fill one column (m entries) at a time
        for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
      } else {
        // contiguous storage: fill all m * n entries at once (with overflow check)
        PetscInt mn;

        PetscCall(PetscIntMultError(m, n, &mn));
        PetscCall(PetscRandomGetValues(rng, mn, a));
      }
    }
  } else {
    PetscCall(MatSetRandom_SeqDense(A, rng));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// Copy column col of A into v, staying on whichever memory space holds up-to-date data
// whenever possible
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
{
  const auto offloadmask = A->offloadmask;
  const auto n           = A->rmap->n;
  // start of the requested column (columns are lda apart, column-major)
  const auto col_offset  = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
  PetscBool          viscupm;
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
  PetscCall(GetHandles_(&dctx, &stream));
  if (viscupm && !v->boundtocpu) {
    const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);

    // update device data
    if (PetscOffloadDevice(offloadmask)) {
      PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
    } else {
      PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
    }
  } else {
    PetscScalar *x;

    // update host data
    PetscCall(VecGetArrayWrite(v, &x));
    if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
      PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
    } else if (PetscOffloadDevice(offloadmask)) {
      PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
    }
    PetscCall(VecRestoreArrayWrite(v, &x));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Expose column col of A as the internal column vector *v, borrowing the device array
// with the requested access mode; must be paired with RestoreColumnVec()
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
{
  using namespace vec::cupm;
  const auto mimpl = MatIMPLCast(A);
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  // record the checked-out column (offset by 1 so column 0 still reads as "in use")
  mimpl->vecinuse = col + 1;
  if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
  PetscCall(GetHandles_(&dctx));
  PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse),
dctx));
  // point the column vector at column col of the device array (columns are lda apart)
  PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
  if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
  *v = mimpl->cvec;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Return the column vector checked out by GetColumnVec() and release the device array
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
{
  using namespace vec::cupm;
  const auto mimpl = MatIMPLCast(A);
  const auto cvec  = mimpl->cvec;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
  PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
  mimpl->vecinuse = 0;
  if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
  PetscCall(VecCUPMResetArrayAsync<T>(cvec));
  PetscCall(GetHandles_(&dctx));
  PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
  if (v) *v = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// Create an (unallocated) factor matrix of the same size as A for the requested factor
// type, wiring up the symbolic-factorization entry points
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
{
  Mat                fact = nullptr;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(GetHandles_(&dctx));
  PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
  fact->factortype = ftype;
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU: // fall-through
    fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
    fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC: // fall-through
    fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
    break;
  case MAT_FACTOR_QR: {
    // QR has no ops entry; it is dispatched through composed functions instead
    const auto pobj = PetscObjectCast(fact);

    PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
    PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
  } break;
  case MAT_FACTOR_NONE:
  case MAT_FACTOR_ILUDT:     // fall-through
  case MAT_FACTOR_NUM_TYPES: // fall-through
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
  }
  PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
  // dense factorizations do not reorder; advertise the external ordering for all types
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
  PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
  *fact_out = fact;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Compute the explicit inverse from an existing (Cholesky) factorization of A in-place,
// using the device solver's potri routine
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);
  const auto n     = static_cast<cupmBlasInt_t>(A->cmap->n);
  cupmSolverHandle_t handle;
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
#if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
  // HIP appears to have this by default??
  PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
#endif
  // nothing to invert for an empty matrix
  if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
  // spd (pivots present would indicate a symmetric-indefinite factorization, i.e. sytri)
  PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());

  PetscCall(GetHandles_(&dctx, &handle, &stream));
  {
    const auto    da  = DeviceArrayReadWrite(dctx, A);
    const auto    lda = static_cast<cupmBlasInt_t>(mimpl->lda);
    cupmBlasInt_t il;

    // grow the cached device workspace only if potri needs more than we already have
    PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
    if (il > mcu->d_fact_lwork) {
      mcu->d_fact_lwork = il;
      PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
      PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
    }
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
    PetscCall(PetscLogGpuTimeEnd());
  }
  PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
  // TODO (write cuda kernel)
  // potri only fills the lower triangle; mirror it to the upper triangle
  PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
  PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

  // A now holds the explicit inverse, not a factorization
  A->ops->solve          = nullptr;
  A->ops->solvetranspose = nullptr;
  A->ops->matsolve       = nullptr;
  A->factortype          = MAT_FACTOR_NONE;

  PetscCall(PetscFree(A->solvertype));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

// Expose rows [rbegin, rend) x columns [cbegin, cend) of A as the internal submatrix
// *mat, sharing A's storage (no copy); must be paired with RestoreSubMatrix()
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
{
  const auto mimpl        = MatIMPLCast(A);
  // origin of the submatrix within A (column-major: columns are lda apart)
  const auto array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
  const auto n            = rend - rbegin;
  const auto m            = cend - cbegin;
  auto      &cmat         = mimpl->cmat;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  mimpl->matinuse = cbegin + 1;

  PetscCall(GetHandles_(&dctx));
  PetscCall(HostToDevice_(A, dctx));

  // the cached submatrix is reused only if the requested size matches
  if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
  {
    const auto device_array = array_offset(MatCUPMCast(A)->d_v);

    if (cmat) {
      PetscCall(PlaceArray(cmat, device_array));
    } else {
      PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
    }
  }
  PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
  // place CPU array if present but do not copy any data
  if (const auto host_array = mimpl->v) {
    // NOTE(review): presumably marked GPU so MatDensePlaceArray() does not copy — confirm
    cmat->offloadmask = PETSC_OFFLOAD_GPU;
    PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
  }

  // the submatrix inherits A's up-to-date location
  cmat->offloadmask = A->offloadmask;
  *mat              = cmat;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Return the submatrix checked out by GetSubMatrix(), propagating any host-side writes
// made through it back into A
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto cmat  = mimpl->cmat;
  // true if a host array was placed on cmat in GetSubMatrix()
  const auto reset = static_cast<bool>(mimpl->v);
  bool       copy, was_offload_host;

  PetscFunctionBegin;
  PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
  PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
  PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
  mimpl->matinuse = 0;

  // calls to ResetArray may change it, so save it here
  was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
  if (was_offload_host && !reset) {
    // caller wrote to the submatrix on the host but A has no host array yet; allocate one
    // so the device data can be brought down after the arrays are reset
    copy = true;
    PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
  } else {
    copy = false;
  }

  PetscCall(ResetArray(cmat));
  if (reset) PetscCall(MatDenseResetArray(cmat));
  if (copy) {
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    PetscCall(DeviceToHost_(A, dctx));
  } else {
    A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
  }

  cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  *m = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// ==========================================================================================

namespace
{

// Dispatch a dense-dense matrix-matrix multiply to the compile-time (transpose-A,
// transpose-B) specialization selected by the runtime flags TA, TB
template <device::cupm::DeviceType T>
inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
{
  PetscFunctionBegin;
  if (TA) {
    if (TB) {
      PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
    } else {
      PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
    }
  } else {
    if (TB) {
      PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
    } else {
      PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Register the CUPM dense solver for each supported factor type, for both the host and
// device dense matrix types
template <device::cupm::DeviceType T>
inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
{
  PetscFunctionBegin;
  for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
    PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
    PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

} // anonymous namespace

} // namespace impl

} // namespace cupm

} // namespace mat

} // namespace Petsc