1 #ifndef PETSCMATSEQDENSECUPM_HPP 2 #define PETSCMATSEQDENSECUPM_HPP 3 4 #include <petsc/private/matdensecupmimpl.h> /*I <petscmat.h> I*/ 5 #include <../src/mat/impls/dense/seq/dense.h> 6 7 #include <petsc/private/deviceimpl.h> // PetscDeviceContextGetOptionalNullContext_Internal() 8 #include <petsc/private/randomimpl.h> // _p_PetscRandom 9 #include <petsc/private/vecimpl.h> // _p_Vec 10 #include <petsc/private/cupmobject.hpp> 11 #include <petsc/private/cupmsolverinterface.hpp> 12 13 #include <petsc/private/cpp/type_traits.hpp> // PetscObjectCast() 14 #include <petsc/private/cpp/utility.hpp> // util::exchange() 15 16 #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp> // for VecSeq_CUPM 17 18 namespace Petsc 19 { 20 21 namespace mat 22 { 23 24 namespace cupm 25 { 26 27 namespace impl 28 { 29 30 template <device::cupm::DeviceType T> 31 class MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> { 32 public: 33 MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>); 34 35 private: 36 struct Mat_SeqDenseCUPM { 37 PetscScalar *d_v; // pointer to the matrix on the GPU 38 PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original 39 bool d_user_alloc; 40 bool d_unplaced_user_alloc; 41 // factorization support 42 cupmBlasInt_t *d_fact_ipiv; // device pivots 43 cupmScalar_t *d_fact_tau; // device QR tau vector 44 cupmBlasInt_t *d_fact_info; // device info 45 cupmScalar_t *d_fact_work; // device workspace 46 cupmBlasInt_t d_fact_lwork; // size of device workspace 47 // workspace 48 Vec workvec; 49 }; 50 51 static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept; 52 53 static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept; 54 static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept; 55 56 static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept; 57 58 template <typename Derived> 59 struct SolveCommon; 60 struct SolveQR; 61 struct SolveCholesky; 62 struct SolveLU; 63 64 template <typename Solver, bool transpose> 65 static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept; 66 template <typename Solver, bool transpose> 67 static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept; 68 template <bool transpose> 69 static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept; 70 71 template <bool to_host> 72 static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept; 73 74 PETSC_NODISCARD static constexpr MatType MATIMPLCUPM_() noexcept; 75 PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept; 76 77 public: 78 PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept; 79 80 // define these by hand since they don't fit the above mold 81 PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept; 82 PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept; 83 84 static PetscErrorCode Create(Mat) noexcept; 85 static PetscErrorCode Destroy(Mat) noexcept; 86 static PetscErrorCode SetUp(Mat) noexcept; 87 static PetscErrorCode Reset(Mat) noexcept; 88 89 static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept; 90 static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept; 91 static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept; 92 93 template <PetscMemType, PetscMemoryAccessMode> 94 static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept; 95 template <PetscMemType, PetscMemoryAccessMode> 96 static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept; 97 template <PetscMemoryAccessMode> 98 static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept; 99 template <PetscMemoryAccessMode> 100 static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept; 101 102 private: 103 template <PetscMemType mtype, PetscMemoryAccessMode mode> 104 static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept 105 { 106 PetscDeviceContext dctx; 107 108 PetscFunctionBegin; 109 PetscCall(GetHandles_(&dctx)); 110 PetscCall(GetArray<mtype, mode>(m, p, dctx)); 111 PetscFunctionReturn(PETSC_SUCCESS); 112 } 113 114 template <PetscMemType mtype, PetscMemoryAccessMode mode> 115 static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept 116 { 117 PetscDeviceContext dctx; 118 119 PetscFunctionBegin; 120 PetscCall(GetHandles_(&dctx)); 121 PetscCall(RestoreArray<mtype, mode>(m, p, dctx)); 122 PetscFunctionReturn(PETSC_SUCCESS); 123 } 124 125 template <PetscMemoryAccessMode mode> 126 static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept 127 { 128 PetscDeviceContext dctx; 129 130 PetscFunctionBegin; 131 PetscCall(GetHandles_(&dctx)); 132 PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx)); 133 PetscFunctionReturn(PETSC_SUCCESS); 134 } 135 136 template <PetscMemoryAccessMode mode> 137 static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept 138 { 139 PetscDeviceContext dctx; 140 141 PetscFunctionBegin; 142 PetscCall(GetHandles_(&dctx)); 143 PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx)); 144 PetscFunctionReturn(PETSC_SUCCESS); 145 } 146 147 public: 148 static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept; 149 static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept; 150 static PetscErrorCode ResetArray(Mat) noexcept; 151 152 template <bool transpose_A, bool transpose_B> 153 static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept; 154 static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept; 155 static PetscErrorCode ZeroEntries(Mat) noexcept; 156 static PetscErrorCode Scale(Mat, PetscScalar) noexcept; 157 static PetscErrorCode Shift(Mat, PetscScalar) noexcept; 158 static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept; 159 static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept; 160 static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept; 161 162 static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept; 163 template <PetscMemoryAccessMode> 164 static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept; 165 template <PetscMemoryAccessMode> 166 static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept; 167 168 static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept; 169 static PetscErrorCode InvertFactors(Mat) noexcept; 170 171 static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept; 172 static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept; 173 }; 174 175 } // namespace impl 176 177 namespace 178 { 179 180 // Declare this here so that the functions below can make use of it 181 template <device::cupm::DeviceType T> 182 inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept 183 { 184 PetscFunctionBegin; 185 PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate)); 186 PetscFunctionReturn(PETSC_SUCCESS); 187 } 188 189 } // anonymous namespace 190 191 namespace impl 192 { 193 194 // ========================================================================================== 195 // MatDense_Seq_CUPM - Private API - Utility 196 // ========================================================================================== 197 198 template <device::cupm::DeviceType T> 199 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept 200 { 201 const auto mcu = MatCUPMCast(m); 202 const auto nrows = m->rmap->n; 203 const auto ncols = m->cmap->n; 204 auto &lda = MatIMPLCast(m)->lda; 205 cupmStream_t stream; 206 207 PetscFunctionBegin; 208 PetscCheckTypeName(m, MATSEQDENSECUPM()); 209 PetscValidDeviceContext(dctx, 2); 210 PetscCall(checkCupmBlasIntCast(nrows)); 211 PetscCall(checkCupmBlasIntCast(ncols)); 212 PetscCall(GetHandlesFrom_(dctx, &stream)); 213 if (lda <= 0) lda = nrows; 214 if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream)); 215 if (user_device_array) { 216 mcu->d_user_alloc = PETSC_TRUE; 217 mcu->d_v = user_device_array; 218 } else { 219 PetscInt size; 220 221 mcu->d_user_alloc = PETSC_FALSE; 222 PetscCall(PetscIntMultError(lda, ncols, &size)); 223 PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream)); 224 PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream)); 225 } 226 m->offloadmask = PETSC_OFFLOAD_GPU; 227 PetscFunctionReturn(PETSC_SUCCESS); 228 } 229 230 template <device::cupm::DeviceType T> 231 inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept 232 { 233 const auto nrows = m->rmap->n; 234 const auto ncols = m->cmap->n; 235 const auto copy = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED; 236 237 PetscFunctionBegin; 238 PetscCheckTypeName(m, MATSEQDENSECUPM()); 239 if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS); 240 PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols)); 241 if (copy) { 242 const auto mcu = MatCUPMCast(m); 243 cupmStream_t stream; 244 245 // Allocate GPU memory if not present 246 if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr)); 247 PetscCall(GetHandlesFrom_(dctx, &stream)); 248 PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0)); 249 { 250 const auto mimpl = MatIMPLCast(m); 251 const auto lda = mimpl->lda; 252 const auto src = mimpl->v; 253 const auto dest = mcu->d_v; 254 255 if (lda > nrows) { 256 PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream)); 257 } else { 258 PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream)); 259 } 260 } 261 PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0)); 262 // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH 263 m->offloadmask = PETSC_OFFLOAD_BOTH; 264 } 265 PetscFunctionReturn(PETSC_SUCCESS); 266 } 267 268 template <device::cupm::DeviceType T> 269 inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept 270 { 271 const auto nrows = m->rmap->n; 272 const auto ncols = m->cmap->n; 273 const auto copy = m->offloadmask == PETSC_OFFLOAD_GPU; 274 275 PetscFunctionBegin; 276 PetscCheckTypeName(m, MATSEQDENSECUPM()); 277 PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols)); 278 if (copy) { 279 const auto mimpl = MatIMPLCast(m); 280 cupmStream_t stream; 281 282 // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed 283 if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr)); 284 PetscCall(GetHandlesFrom_(dctx, &stream)); 285 PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0)); 286 { 287 const auto lda = mimpl->lda; 288 const auto dest = mimpl->v; 289 const auto src = MatCUPMCast(m)->d_v; 290 291 if (lda > nrows) { 292 PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream)); 293 } else { 294 PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream)); 295 } 296 } 297 PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0)); 298 // order is important, MatSeqDenseSetPreallocation() might set offloadmask 299 m->offloadmask = PETSC_OFFLOAD_BOTH; 300 } 301 PetscFunctionReturn(PETSC_SUCCESS); 302 } 303 304 template <device::cupm::DeviceType T> 305 inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept 306 { 307 PetscFunctionBegin; 308 if (PetscDefined(USE_DEBUG)) { 309 cupmBlasInt_t info = 0; 310 311 PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream)); 312 if (stream) PetscCallCUPM(cupmStreamSynchronize(stream)); 313 static_assert(std::is_same<decltype(info), int>::value, ""); 314 PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1); 315 PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info); 316 } 317 PetscFunctionReturn(PETSC_SUCCESS); 318 } 319 320 // ========================================================================================== 321 // MatDense_Seq_CUPM - Private API - Solver Dispatch 322 // ========================================================================================== 323 324 // specific solvers called through the dispatch_() family of functions 325 template <device::cupm::DeviceType T> 326 template <typename Derived> 327 struct MatDense_Seq_CUPM<T>::SolveCommon { 328 using derived_type = Derived; 329 330 template <typename F> 331 static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept 332 { 333 cupmBlasInt_t lwork; 334 335 PetscFunctionBegin; 336 PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork)); 337 if (lwork > mcu->d_fact_lwork) { 338 mcu->d_fact_lwork = lwork; 339 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream)); 340 PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream)); 341 } 342 PetscFunctionReturn(PETSC_SUCCESS); 343 } 344 345 static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept 346 { 347 const auto mcu = MatCUPMCast(A); 348 349 PetscFunctionBegin; 350 PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n)); 351 A->factortype = derived_type::MATFACTORTYPE(); 352 A->ops->solve = MatSolve_Factored_Dispatch_<derived_type, false>; 353 A->ops->solvetranspose = MatSolve_Factored_Dispatch_<derived_type, true>; 354 A->ops->matsolve = MatMatSolve_Factored_Dispatch_<derived_type, false>; 355 A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>; 356 357 PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype)); 358 if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream)); 359 PetscFunctionReturn(PETSC_SUCCESS); 360 } 361 }; 362 363 template <device::cupm::DeviceType T> 364 struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> { 365 using base_type = SolveCommon<SolveLU>; 366 367 static constexpr const char *NAME() noexcept { return "LU"; } 368 static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; } 369 370 static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept 371 { 372 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 373 const auto n = static_cast<cupmBlasInt_t>(A->cmap->n); 374 cupmStream_t stream; 375 cupmSolverHandle_t handle; 376 PetscDeviceContext dctx; 377 378 PetscFunctionBegin; 379 if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS); 380 PetscCall(GetHandles_(&dctx, &handle, &stream)); 381 PetscCall(base_type::FactorPrepare(A, stream)); 382 { 383 const auto mcu = MatCUPMCast(A); 384 const auto da = DeviceArrayReadWrite(dctx, A); 385 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 386 387 // clang-format off 388 PetscCall( 389 base_type::ResizeFactLwork( 390 mcu, stream, 391 [&](cupmBlasInt_t *fact_lwork) 392 { 393 return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork); 394 } 395 ) 396 ); 397 // clang-format on 398 if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream)); 399 400 PetscCall(PetscLogGpuTimeBegin()); 401 PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info)); 402 PetscCall(PetscLogGpuTimeEnd()); 403 PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream)); 404 } 405 PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0)); 406 PetscFunctionReturn(PETSC_SUCCESS); 407 } 408 409 template <bool transpose> 410 static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept 411 { 412 const auto mcu = MatCUPMCast(A); 413 const auto fact_info = mcu->d_fact_info; 414 const auto fact_ipiv = mcu->d_fact_ipiv; 415 cupmSolverHandle_t handle; 416 417 PetscFunctionBegin; 418 PetscCall(GetHandlesFrom_(dctx, &handle)); 419 PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k)); 420 PetscCall(PetscLogGpuTimeBegin()); 421 { 422 constexpr auto op = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N; 423 const auto da = DeviceArrayRead(dctx, A); 424 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 425 426 // clang-format off 427 PetscCall( 428 base_type::ResizeFactLwork( 429 mcu, stream, 430 [&](cupmBlasInt_t *lwork) 431 { 432 return cupmSolverXgetrs_bufferSize( 433 handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork 434 ); 435 } 436 ) 437 ); 438 // clang-format on 439 PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info)); 440 PetscCall(CheckCUPMSolverInfo_(fact_info, stream)); 441 } 442 PetscCall(PetscLogGpuTimeEnd()); 443 PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m))); 444 PetscFunctionReturn(PETSC_SUCCESS); 445 } 446 }; 447 448 template <device::cupm::DeviceType T> 449 struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> { 450 using base_type = SolveCommon<SolveCholesky>; 451 452 static constexpr const char *NAME() noexcept { return "Cholesky"; } 453 static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; } 454 455 static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept 456 { 457 const auto n = static_cast<cupmBlasInt_t>(A->rmap->n); 458 PetscDeviceContext dctx; 459 cupmSolverHandle_t handle; 460 cupmStream_t stream; 461 462 PetscFunctionBegin; 463 if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS); 464 PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName()); 465 PetscCall(GetHandles_(&dctx, &handle, &stream)); 466 PetscCall(base_type::FactorPrepare(A, stream)); 467 { 468 const auto mcu = MatCUPMCast(A); 469 const auto da = DeviceArrayReadWrite(dctx, A); 470 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 471 472 // clang-format off 473 PetscCall( 474 base_type::ResizeFactLwork( 475 mcu, stream, 476 [&](cupmBlasInt_t *fact_lwork) 477 { 478 return cupmSolverXpotrf_bufferSize( 479 handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork 480 ); 481 } 482 ) 483 ); 484 // clang-format on 485 PetscCall(PetscLogGpuTimeBegin()); 486 PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info)); 487 PetscCall(PetscLogGpuTimeEnd()); 488 PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream)); 489 } 490 PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0)); 491 492 #if 0 493 // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs 494 // and *hetr* routines. The code below should work, and it can be activated when *sytrs 495 // routines will be available 496 if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream)); 497 if (!mcu->d_fact_lwork) { 498 PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork)); 499 PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream)); 500 } 501 if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream)); 502 PetscCall(PetscLogGpuTimeBegin()); 503 PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info)); 504 PetscCall(PetscLogGpuTimeEnd()); 505 #endif 506 PetscFunctionReturn(PETSC_SUCCESS); 507 } 508 509 template <bool transpose> 510 static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept 511 { 512 const auto mcu = MatCUPMCast(A); 513 const auto fact_info = mcu->d_fact_info; 514 cupmSolverHandle_t handle; 515 516 PetscFunctionBegin; 517 PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName()); 518 PetscCall(GetHandlesFrom_(dctx, &handle)); 519 PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k)); 520 PetscCall(PetscLogGpuTimeBegin()); 521 { 522 const auto da = DeviceArrayRead(dctx, A); 523 const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda); 524 525 // clang-format off 526 PetscCall( 527 base_type::ResizeFactLwork( 528 mcu, stream, 529 [&](cupmBlasInt_t *lwork) 530 { 531 return cupmSolverXpotrs_bufferSize( 532 handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork 533 ); 534 } 535 ) 536 ); 537 // clang-format on 538 PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info)); 539 PetscCall(CheckCUPMSolverInfo_(fact_info, stream)); 540 } 541 PetscCall(PetscLogGpuTimeEnd()); 542 PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m))); 543 PetscFunctionReturn(PETSC_SUCCESS); 544 } 545 }; 546 547 template <device::cupm::DeviceType T> 548 struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> { 549 using base_type = SolveCommon<SolveQR>; 550 551 static constexpr const char *NAME() noexcept { return "QR"; } 552 static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; } 553 554 static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept 555 { 556 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 557 const auto n = static_cast<cupmBlasInt_t>(A->cmap->n); 558 const auto min = std::min(m, n); 559 const auto mimpl = MatIMPLCast(A); 560 cupmStream_t stream; 561 cupmSolverHandle_t handle; 562 PetscDeviceContext dctx; 563 564 PetscFunctionBegin; 565 if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS); 566 PetscCall(GetHandles_(&dctx, &handle, &stream)); 567 PetscCall(base_type::FactorPrepare(A, stream)); 568 mimpl->rank = min; 569 { 570 const auto mcu = MatCUPMCast(A); 571 const auto da = DeviceArrayReadWrite(dctx, A); 572 const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda); 573 574 if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec)); 575 if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream)); 576 // clang-format off 577 PetscCall( 578 base_type::ResizeFactLwork( 579 mcu, stream, 580 [&](cupmBlasInt_t *fact_lwork) 581 { 582 return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork); 583 } 584 ) 585 ); 586 // clang-format on 587 PetscCall(PetscLogGpuTimeBegin()); 588 PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info)); 589 PetscCall(PetscLogGpuTimeEnd()); 590 PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream)); 591 } 592 PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0))); 593 PetscFunctionReturn(PETSC_SUCCESS); 594 } 595 596 template <bool transpose> 597 static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept 598 { 599 const auto mimpl = MatIMPLCast(A); 600 const auto rank = static_cast<cupmBlasInt_t>(mimpl->rank); 601 const auto mcu = MatCUPMCast(A); 602 const auto fact_info = mcu->d_fact_info; 603 const auto fact_tau = mcu->d_fact_tau; 604 const auto fact_work = mcu->d_fact_work; 605 const auto fact_lwork = mcu->d_fact_lwork; 606 cupmSolverHandle_t solver_handle; 607 cupmBlasHandle_t blas_handle; 608 609 PetscFunctionBegin; 610 PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle)); 611 PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k)); 612 PetscCall(PetscLogGpuTimeBegin()); 613 { 614 const auto da = DeviceArrayRead(dctx, A); 615 const auto one = cupmScalarCast(1.0); 616 const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda); 617 618 if (transpose) { 619 PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx)); 620 PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info)); 621 PetscCall(CheckCUPMSolverInfo_(fact_info, stream)); 622 } else { 623 constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T; 624 625 PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info)); 626 PetscCall(CheckCUPMSolverInfo_(fact_info, stream)); 627 PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx)); 628 } 629 } 630 PetscCall(PetscLogGpuTimeEnd()); 631 PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank)))); 632 PetscFunctionReturn(PETSC_SUCCESS); 633 } 634 }; 635 636 template <device::cupm::DeviceType T> 637 template <typename Solver, bool transpose> 638 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept 639 { 640 using namespace vec::cupm; 641 const auto pobj_A = PetscObjectCast(A); 642 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 643 const auto k = static_cast<cupmBlasInt_t>(A->cmap->n); 644 auto &workvec = MatCUPMCast(A)->workvec; 645 PetscScalar *y_array = nullptr; 646 PetscDeviceContext dctx; 647 PetscBool xiscupm, yiscupm, aiscupm; 648 bool use_y_array_directly; 649 cupmStream_t stream; 650 651 PetscFunctionBegin; 652 PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve"); 653 PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm)); 654 PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm)); 655 PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm)); 656 PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????"); 657 PetscCall(GetHandles_(&dctx, &stream)); 658 use_y_array_directly = yiscupm && (k >= m); 659 { 660 const PetscScalar *x_array; 661 const auto xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask); 662 const auto copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice; 663 664 if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec)); 665 // The logic here is to try to minimize the amount of memory copying: 666 // 667 // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded 668 // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the 669 // data in order to copy it into the y array. So the array x will be wherever the data 670 // already is so that only one memcpy is performed 671 if (xisdevice) { 672 PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx)); 673 } else { 674 PetscCall(VecGetArrayRead(x, &x_array)); 675 } 676 PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx)); 677 PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream)); 678 if (xisdevice) { 679 PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx)); 680 } else { 681 PetscCall(VecRestoreArrayRead(x, &x_array)); 682 } 683 } 684 685 if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 686 PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream)); 687 if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A)); 688 689 if (use_y_array_directly) { 690 PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx)); 691 } else { 692 const auto copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost; 693 PetscScalar *yv; 694 695 // The logic here is that the data is not yet in either y's GPU array or its CPU array. 696 // There is nothing in the interface to say where the user would like it to end up. So we 697 // choose the GPU, because it is the faster option 698 if (yiscupm) { 699 PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx)); 700 } else { 701 PetscCall(VecGetArray(y, &yv)); 702 } 703 PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream)); 704 if (yiscupm) { 705 PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx)); 706 } else { 707 PetscCall(VecRestoreArray(y, &yv)); 708 } 709 PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array)); 710 } 711 PetscFunctionReturn(PETSC_SUCCESS); 712 } 713 714 template <device::cupm::DeviceType T> 715 template <typename Solver, bool transpose> 716 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept 717 { 718 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 719 const auto k = static_cast<cupmBlasInt_t>(A->cmap->n); 720 cupmBlasInt_t nrhs, ldb, ldx, ldy; 721 PetscScalar *y; 722 PetscBool biscupm, xiscupm, aiscupm; 723 PetscDeviceContext dctx; 724 cupmStream_t stream; 725 726 PetscFunctionBegin; 727 PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve"); 728 PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm)); 729 PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm)); 730 PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm)); 731 PetscCall(GetHandles_(&dctx, &stream)); 732 { 733 PetscInt n; 734 735 PetscCall(MatGetSize(B, nullptr, &n)); 736 PetscCall(PetscCUPMBlasIntCast(n, &nrhs)); 737 PetscCall(MatDenseGetLDA(B, &n)); 738 PetscCall(PetscCUPMBlasIntCast(n, &ldb)); 739 PetscCall(MatDenseGetLDA(X, &n)); 740 PetscCall(PetscCUPMBlasIntCast(n, &ldx)); 741 } 742 { 743 // The logic here is to try to minimize the amount of memory copying: 744 // 745 // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not 746 // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to 747 // get the data in order to copy it into the y array. So the array b will be wherever the 748 // data already is so that only one memcpy is performed 749 const auto bisdevice = biscupm && PetscOffloadDevice(B->offloadmask); 750 const auto copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice; 751 const PetscScalar *b; 752 753 if (bisdevice) { 754 b = DeviceArrayRead(dctx, B); 755 } else if (biscupm) { 756 b = HostArrayRead(dctx, B); 757 } else { 758 PetscCall(MatDenseGetArrayRead(B, &b)); 759 } 760 761 if (ldx < m || !xiscupm) { 762 // X's array cannot serve as the array (too small or not on device), B's array cannot 763 // serve as the array (const), so allocate a new array 764 ldy = m; 765 PetscCall(PetscCUPMMallocAsync(&y, nrhs * m)); 766 } else { 767 // X's array should serve as the array 768 ldy = ldx; 769 y = DeviceArrayWrite(dctx, X); 770 } 771 PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream)); 772 if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b)); 773 } 774 775 // convert to CUPM twice?????????????????????????????????? 776 // but A should already be CUPM?????????????????????????????????????? 777 if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 778 PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream)); 779 if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 780 781 if (ldx < m || !xiscupm) { 782 const auto copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost; 783 PetscScalar *x; 784 785 // The logic here is that the data is not yet in either X's GPU array or its CPU 786 // array. There is nothing in the interface to say where the user would like it to end up. 787 // So we choose the GPU, because it is the faster option 788 if (xiscupm) { 789 x = DeviceArrayWrite(dctx, X); 790 } else { 791 PetscCall(MatDenseGetArray(X, &x)); 792 } 793 PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream)); 794 if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x)); 795 PetscCallCUPM(cupmFreeAsync(y, stream)); 796 } 797 PetscFunctionReturn(PETSC_SUCCESS); 798 } 799 800 template <device::cupm::DeviceType T> 801 template <bool transpose> 802 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept 803 { 804 const auto m = static_cast<cupmBlasInt_t>(A->rmap->n); 805 const auto n = static_cast<cupmBlasInt_t>(A->cmap->n); 806 cupmBlasHandle_t handle; 807 PetscDeviceContext dctx; 808 809 PetscFunctionBegin; 810 if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add 811 if (!m || !n) { 812 // mult only 813 if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0)); 814 PetscFunctionReturn(PETSC_SUCCESS); 815 } 816 PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n)); 817 PetscCall(GetHandles_(&dctx, &handle)); 818 { 819 constexpr auto op = transpose ? CUPMBLAS_OP_T : CUPMBLAS_OP_N; 820 const auto one = cupmScalarCast(1.0); 821 const auto zero = cupmScalarCast(0.0); 822 const auto da = DeviceArrayRead(dctx, A); 823 const auto dxx = VecSeq_CUPM::DeviceArrayRead(dctx, xx); 824 const auto dzz = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz); 825 826 PetscCall(PetscLogGpuTimeBegin()); 827 PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata(), static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda), dxx.cupmdata(), 1, (yy ? &one : &zero), dzz.cupmdata(), 1)); 828 PetscCall(PetscLogGpuTimeEnd()); 829 } 830 PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m))); 831 PetscFunctionReturn(PETSC_SUCCESS); 832 } 833 834 // ========================================================================================== 835 // MatDense_Seq_CUPM - Private API - Conversion Dispatch 836 // ========================================================================================== 837 838 template <device::cupm::DeviceType T> 839 template <bool to_host> 840 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept 841 { 842 PetscFunctionBegin; 843 if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) { 844 // TODO these cases should be optimized 845 PetscCall(MatConvert_Basic(M, type, reuse, newmat)); 846 } else { 847 const auto B = *newmat; 848 const auto pobj = PetscObjectCast(B); 849 850 if (to_host) { 851 PetscCall(BindToCPU(B, PETSC_TRUE)); 852 PetscCall(Reset(B)); 853 } else { 854 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM())); 855 } 856 857 PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype)); 858 PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM())); 859 // cvec might be the wrong VecType, destroy and rebuild it if necessary 860 // REVIEW ME: this is possibly very inefficient 861 PetscCall(VecDestroy(&MatIMPLCast(B)->cvec)); 862 863 MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense); 864 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>); 865 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>); 866 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>); 867 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>); 868 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>); 869 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>); 870 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray); 871 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray); 872 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray); 873 MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense); 874 MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation); 875 876 if (to_host) { 877 B->offloadmask = PETSC_OFFLOAD_CPU; 878 } else { 879 Mat_SeqDenseCUPM *mcu; 880 881 PetscCall(PetscNew(&mcu)); 882 B->spptr = mcu; 883 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host?? 884 PetscCall(BindToCPU(B, PETSC_FALSE)); 885 } 886 887 MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU); 888 MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy); 889 } 890 PetscFunctionReturn(PETSC_SUCCESS); 891 } 892 893 // ========================================================================================== 894 // MatDense_Seq_CUPM - Public API 895 // ========================================================================================== 896 897 template <device::cupm::DeviceType T> 898 inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept 899 { 900 return MATSEQDENSECUPM(); 901 } 902 903 template <device::cupm::DeviceType T> 904 inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept 905 { 906 return static_cast<Mat_SeqDenseCUPM *>(m->spptr); 907 } 908 909 template <device::cupm::DeviceType T> 910 inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept 911 { 912 return static_cast<Mat_SeqDense *>(m->data); 913 } 914 915 template <device::cupm::DeviceType T> 916 inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept 917 { 918 return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C"; 919 } 920 921 template <device::cupm::DeviceType T> 922 inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept 923 { 924 return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C"; 925 } 926 927 // ========================================================================================== 928 929 // MatCreate_SeqDenseCUPM() 930 template <device::cupm::DeviceType T> 931 inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept 932 { 933 PetscFunctionBegin; 934 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM())); 935 PetscCall(MatCreate_SeqDense(A)); 936 PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 937 PetscFunctionReturn(PETSC_SUCCESS); 938 } 939 940 template <device::cupm::DeviceType T> 941 inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept 942 { 943 PetscFunctionBegin; 944 // prevent copying back data if we own the data pointer 945 if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU; 946 PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A)); 947 PetscCall(MatDestroy_SeqDense(A)); 948 PetscFunctionReturn(PETSC_SUCCESS); 949 } 950 951 // obj->ops->setup() 952 template <device::cupm::DeviceType T> 953 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept 954 { 955 PetscFunctionBegin; 956 PetscCall(PetscLayoutSetUp(A->rmap)); 957 PetscCall(PetscLayoutSetUp(A->cmap)); 958 if (!A->preallocated) { 959 PetscDeviceContext dctx; 960 961 PetscCall(GetHandles_(&dctx)); 962 PetscCall(SetPreallocation(A, dctx, nullptr)); 963 } 964 PetscFunctionReturn(PETSC_SUCCESS); 965 } 966 967 template <device::cupm::DeviceType T> 968 inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept 969 { 970 PetscFunctionBegin; 971 if (const auto mcu = MatCUPMCast(A)) { 972 cupmStream_t stream; 973 974 PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME()); 975 PetscCall(GetHandles_(&stream)); 976 if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream)); 977 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream)); 978 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream)); 979 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream)); 980 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream)); 981 PetscCall(VecDestroy(&mcu->workvec)); 982 PetscCall(PetscFree(A->spptr /* mcu */)); 983 } 984 PetscFunctionReturn(PETSC_SUCCESS); 985 } 986 987 // ========================================================================================== 988 989 template <device::cupm::DeviceType T> 990 inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept 991 { 992 const auto mimpl = MatIMPLCast(A); 993 const auto pobj = PetscObjectCast(A); 994 995 PetscFunctionBegin; 996 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 997 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 998 A->boundtocpu = to_host; 999 PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype)); 1000 if (to_host) { 1001 PetscDeviceContext dctx; 1002 1003 // make sure we have an up-to-date copy on the CPU 1004 PetscCall(GetHandles_(&dctx)); 1005 PetscCall(DeviceToHost_(A, dctx)); 1006 } else { 1007 PetscBool iscupm; 1008 1009 if (auto &cvec = mimpl->cvec) { 1010 PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm)); 1011 if (!iscupm) PetscCall(VecDestroy(&cvec)); 1012 } 1013 if (auto &cmat = mimpl->cmat) { 1014 PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm)); 1015 if (!iscupm) PetscCall(MatDestroy(&cmat)); 1016 } 1017 } 1018 1019 // ============================================================ 1020 // Composed ops 1021 // ============================================================ 1022 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>); 1023 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>); 1024 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>); 1025 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>); 1026 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>); 1027 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>); 1028 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>); 1029 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>); 1030 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>); 1031 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>); 1032 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>); 1033 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>); 1034 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>); 1035 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>); 1036 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>); 1037 MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix); 1038 MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix); 1039 MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor); 1040 // always the same 1041 PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense)); 1042 1043 // ============================================================ 1044 // Function pointer ops 1045 // ============================================================ 1046 MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate); 1047 MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false>(A, xx, nullptr, yy); }); 1048 MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true>(A, xx, nullptr, yy); }); 1049 MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false>); 1050 MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true>); 1051 MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>); 1052 MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>); 1053 MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>); 1054 MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY); 1055 MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor); 1056 MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor); 1057 MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector); 1058 MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale); 1059 MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift); 1060 MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy); 1061 MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries); 1062 MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp); 1063 MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom); 1064 // seemingly always the same 1065 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense; 1066 1067 if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host)); 1068 PetscFunctionReturn(PETSC_SUCCESS); 1069 } 1070 1071 template <device::cupm::DeviceType T> 1072 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept 1073 { 1074 PetscFunctionBegin; 1075 PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat)); 1076 PetscFunctionReturn(PETSC_SUCCESS); 1077 } 1078 1079 template <device::cupm::DeviceType T> 1080 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept 1081 { 1082 PetscFunctionBegin; 1083 PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat)); 1084 PetscFunctionReturn(PETSC_SUCCESS); 1085 } 1086 1087 // ========================================================================================== 1088 1089 template <device::cupm::DeviceType T> 1090 template <PetscMemType mtype, PetscMemoryAccessMode access> 1091 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept 1092 { 1093 constexpr auto hostmem = PetscMemTypeHost(mtype); 1094 constexpr auto read_access = PetscMemoryAccessRead(access); 1095 1096 PetscFunctionBegin; 1097 static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), ""); 1098 PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx)); 1099 if (hostmem) { 1100 if (read_access) { 1101 PetscCall(DeviceToHost_(m, dctx)); 1102 } else if (!MatIMPLCast(m)->v) { 1103 // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed 1104 PetscCall(MatSeqDenseSetPreallocation(m, nullptr)); 1105 } 1106 *array = MatIMPLCast(m)->v; 1107 } else { 1108 if (read_access) { 1109 PetscCall(HostToDevice_(m, dctx)); 1110 } else if (!MatCUPMCast(m)->d_v) { 1111 // write-only 1112 PetscCall(SetPreallocation(m, dctx, nullptr)); 1113 } 1114 *array = MatCUPMCast(m)->d_v; 1115 } 1116 if (PetscMemoryAccessWrite(access)) { 1117 m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 1118 PetscCall(PetscObjectStateIncrease(PetscObjectCast(m))); 1119 } 1120 PetscFunctionReturn(PETSC_SUCCESS); 1121 } 1122 1123 template <device::cupm::DeviceType T> 1124 template <PetscMemType mtype, PetscMemoryAccessMode access> 1125 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept 1126 { 1127 PetscFunctionBegin; 1128 static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), ""); 1129 if (PetscMemoryAccessWrite(access)) { 1130 // WRITE or READ_WRITE 1131 m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 1132 PetscCall(PetscObjectStateIncrease(PetscObjectCast(m))); 1133 } 1134 if (array) { 1135 PetscCall(CheckPointerMatchesMemType_(*array, mtype)); 1136 *array = nullptr; 1137 } 1138 PetscFunctionReturn(PETSC_SUCCESS); 1139 } 1140 1141 template <device::cupm::DeviceType T> 1142 template <PetscMemoryAccessMode access> 1143 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept 1144 { 1145 PetscFunctionBegin; 1146 PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx)); 1147 if (mtype) *mtype = PETSC_MEMTYPE_CUPM(); 1148 PetscFunctionReturn(PETSC_SUCCESS); 1149 } 1150 1151 template <device::cupm::DeviceType T> 1152 template <PetscMemoryAccessMode access> 1153 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept 1154 { 1155 PetscFunctionBegin; 1156 PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx)); 1157 PetscFunctionReturn(PETSC_SUCCESS); 1158 } 1159 1160 // ========================================================================================== 1161 1162 template <device::cupm::DeviceType T> 1163 inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept 1164 { 1165 const auto mimpl = MatIMPLCast(A); 1166 const auto mcu = MatCUPMCast(A); 1167 1168 PetscFunctionBegin; 1169 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 1170 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 1171 PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME()); 1172 if (mimpl->v) { 1173 PetscDeviceContext dctx; 1174 1175 PetscCall(GetHandles_(&dctx)); 1176 PetscCall(HostToDevice_(A, dctx)); 1177 } 1178 mcu->unplacedarray = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array)); 1179 mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE); 1180 PetscFunctionReturn(PETSC_SUCCESS); 1181 } 1182 1183 template <device::cupm::DeviceType T> 1184 inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept 1185 { 1186 const auto mimpl = MatIMPLCast(A); 1187 const auto mcu = MatCUPMCast(A); 1188 1189 PetscFunctionBegin; 1190 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 1191 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 1192 PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME()); 1193 if (!mcu->d_user_alloc) { 1194 cupmStream_t stream; 1195 1196 PetscCall(GetHandles_(&stream)); 1197 PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream)); 1198 } 1199 mcu->d_v = const_cast<PetscScalar *>(array); 1200 mcu->d_user_alloc = PETSC_FALSE; 1201 PetscFunctionReturn(PETSC_SUCCESS); 1202 } 1203 1204 template <device::cupm::DeviceType T> 1205 inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept 1206 { 1207 const auto mimpl = MatIMPLCast(A); 1208 const auto mcu = MatCUPMCast(A); 1209 1210 PetscFunctionBegin; 1211 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 1212 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 1213 if (mimpl->v) { 1214 PetscDeviceContext dctx; 1215 1216 PetscCall(GetHandles_(&dctx)); 1217 PetscCall(HostToDevice_(A, dctx)); 1218 } 1219 mcu->d_v = util::exchange(mcu->unplacedarray, nullptr); 1220 mcu->d_user_alloc = mcu->d_unplaced_user_alloc; 1221 PetscFunctionReturn(PETSC_SUCCESS); 1222 } 1223 1224 // ========================================================================================== 1225 1226 template <device::cupm::DeviceType T> 1227 template <bool transpose_A, bool transpose_B> 1228 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept 1229 { 1230 cupmBlasInt_t m, n, k; 1231 PetscBool Aiscupm, Biscupm; 1232 PetscDeviceContext dctx; 1233 cupmBlasHandle_t handle; 1234 1235 PetscFunctionBegin; 1236 PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m)); 1237 PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n)); 1238 PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k)); 1239 if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS); 1240 1241 // we may end up with SEQDENSE as one of the arguments 1242 // REVIEW ME: how? and why is it not B and C???????? 1243 PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm)); 1244 PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm)); 1245 if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A)); 1246 if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B)); 1247 PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n)); 1248 PetscCall(GetHandles_(&dctx, &handle)); 1249 1250 PetscCall(PetscLogGpuTimeBegin()); 1251 { 1252 const auto one = cupmScalarCast(1.0); 1253 const auto zero = cupmScalarCast(0.0); 1254 const auto da = DeviceArrayRead(dctx, A); 1255 const auto db = DeviceArrayRead(dctx, B); 1256 const auto dc = DeviceArrayWrite(dctx, C); 1257 PetscInt alda, blda, clda; 1258 1259 PetscCall(MatDenseGetLDA(A, &alda)); 1260 PetscCall(MatDenseGetLDA(B, &blda)); 1261 PetscCall(MatDenseGetLDA(C, &clda)); 1262 PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda)); 1263 } 1264 PetscCall(PetscLogGpuTimeEnd()); 1265 1266 PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1))); 1267 if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A)); 1268 if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 1269 PetscFunctionReturn(PETSC_SUCCESS); 1270 } 1271 1272 template <device::cupm::DeviceType T> 1273 inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept 1274 { 1275 const auto m = A->rmap->n; 1276 const auto n = A->cmap->n; 1277 1278 PetscFunctionBegin; 1279 PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)"); 1280 // The two matrices must have the same copy implementation to be eligible for fast copy 1281 if (A->ops->copy == B->ops->copy) { 1282 PetscDeviceContext dctx; 1283 cupmStream_t stream; 1284 1285 PetscCall(GetHandles_(&dctx, &stream)); 1286 PetscCall(PetscLogGpuTimeBegin()); 1287 { 1288 const auto va = DeviceArrayRead(dctx, A); 1289 const auto vb = DeviceArrayWrite(dctx, B); 1290 // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets 1291 // lda! 1292 const auto lda_a = MatIMPLCast(A)->lda; 1293 const auto lda_b = MatIMPLCast(B)->lda; 1294 1295 if (lda_a > m || lda_b > m) { 1296 PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME()); 1297 PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME()); 1298 PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream)); 1299 } else { 1300 PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream)); 1301 } 1302 } 1303 PetscCall(PetscLogGpuTimeEnd()); 1304 } else { 1305 PetscCall(MatCopy_Basic(A, B, str)); 1306 } 1307 PetscFunctionReturn(PETSC_SUCCESS); 1308 } 1309 1310 template <device::cupm::DeviceType T> 1311 inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept 1312 { 1313 PetscDeviceContext dctx; 1314 cupmStream_t stream; 1315 1316 PetscFunctionBegin; 1317 PetscCall(GetHandles_(&dctx, &stream)); 1318 PetscCall(PetscLogGpuTimeBegin()); 1319 { 1320 const auto va = DeviceArrayWrite(dctx, m); 1321 const auto lda = MatIMPLCast(m)->lda; 1322 const auto ma = m->rmap->n; 1323 const auto na = m->cmap->n; 1324 1325 if (lda > ma) { 1326 PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream)); 1327 } else { 1328 PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream)); 1329 } 1330 } 1331 PetscCall(PetscLogGpuTimeEnd()); 1332 PetscFunctionReturn(PETSC_SUCCESS); 1333 } 1334 1335 namespace detail 1336 { 1337 1338 // ========================================================================================== 1339 // SubMatIndexFunctor 1340 // 1341 // Iterator which permutes a linear index range into matrix indices for am nrows x ncols 1342 // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for 1343 // the i'th sequential entry in the matrix. 1344 // ========================================================================================== 1345 template <typename T> 1346 struct SubMatIndexFunctor { 1347 PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); } 1348 1349 PetscInt nrows; 1350 PetscInt ncols; 1351 PetscInt lda; 1352 }; 1353 1354 template <typename Iterator> 1355 struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>> { 1356 using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>>; 1357 1358 using iterator = typename base_type::iterator; 1359 1360 constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept : 1361 base_type{ 1362 std::move(first), std::move(last), {nrows, ncols, lda} 1363 } 1364 { 1365 } 1366 1367 PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); } 1368 }; 1369 1370 namespace 1371 { 1372 1373 template <typename T> 1374 PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept 1375 { 1376 const auto nrows = rend - rstart; 1377 const auto ncols = cend - cstart; 1378 const auto dptr = thrust::device_pointer_cast(ptr); 1379 1380 return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda}; 1381 } 1382 1383 } // namespace 1384 1385 } // namespace detail 1386 1387 template <device::cupm::DeviceType T> 1388 inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept 1389 { 1390 const auto m = A->rmap->n; 1391 const auto n = A->cmap->n; 1392 const auto N = m * n; 1393 PetscDeviceContext dctx; 1394 1395 PetscFunctionBegin; 1396 PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n)); 1397 PetscCall(GetHandles_(&dctx)); 1398 { 1399 const auto da = DeviceArrayReadWrite(dctx, A); 1400 const auto lda = MatIMPLCast(A)->lda; 1401 1402 if (lda > m) { 1403 cupmStream_t stream; 1404 1405 PetscCall(GetHandlesFrom_(dctx, &stream)); 1406 // clang-format off 1407 PetscCallThrust( 1408 const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data()); 1409 1410 THRUST_CALL( 1411 thrust::transform, 1412 stream, 1413 sub_mat.begin(), sub_mat.end(), sub_mat.begin(), 1414 device::cupm::functors::make_times_equals(alpha) 1415 ) 1416 ); 1417 // clang-format on 1418 } else { 1419 const auto cu_alpha = cupmScalarCast(alpha); 1420 cupmBlasHandle_t handle; 1421 1422 PetscCall(GetHandlesFrom_(dctx, &handle)); 1423 PetscCall(PetscLogGpuTimeBegin()); 1424 PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1)); 1425 PetscCall(PetscLogGpuTimeEnd()); 1426 } 1427 } 1428 PetscCall(PetscLogGpuFlops(N)); 1429 PetscFunctionReturn(PETSC_SUCCESS); 1430 } 1431 1432 template <device::cupm::DeviceType T> 1433 inline PetscErrorCode MatDense_Seq_CUPM<T>::Shift(Mat A, PetscScalar alpha) noexcept 1434 { 1435 const auto m = A->rmap->n; 1436 const auto n = A->cmap->n; 1437 PetscDeviceContext dctx; 1438 1439 PetscFunctionBegin; 1440 PetscCall(GetHandles_(&dctx)); 1441 PetscCall(PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n)); 1442 PetscCall(DiagonalUnaryTransform(A, 0, m, n, dctx, device::cupm::functors::make_plus_equals(alpha))); 1443 PetscFunctionReturn(PETSC_SUCCESS); 1444 } 1445 1446 template <device::cupm::DeviceType T> 1447 inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept 1448 { 1449 const auto m_x = X->rmap->n, m_y = Y->rmap->n; 1450 const auto n_x = X->cmap->n, n_y = Y->cmap->n; 1451 const auto N = m_x * n_x; 1452 PetscDeviceContext dctx; 1453 1454 PetscFunctionBegin; 1455 if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS); 1456 PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y)); 1457 PetscCall(GetHandles_(&dctx)); 1458 { 1459 const auto dx = DeviceArrayRead(dctx, X); 1460 const auto dy = DeviceArrayReadWrite(dctx, Y); 1461 const auto lda_x = MatIMPLCast(X)->lda; 1462 const auto lda_y = MatIMPLCast(Y)->lda; 1463 1464 if (lda_x > m_x || lda_y > m_x) { 1465 cupmStream_t stream; 1466 1467 PetscCall(GetHandlesFrom_(dctx, &stream)); 1468 // clang-format off 1469 PetscCallThrust( 1470 const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data()); 1471 const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data()); 1472 1473 THRUST_CALL( 1474 thrust::transform, 1475 stream, 1476 sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(), 1477 device::cupm::functors::make_axpy(alpha) 1478 ); 1479 ); 1480 // clang-format on 1481 } else { 1482 const auto cu_alpha = cupmScalarCast(alpha); 1483 cupmBlasHandle_t handle; 1484 1485 PetscCall(GetHandlesFrom_(dctx, &handle)); 1486 PetscCall(PetscLogGpuTimeBegin()); 1487 PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1)); 1488 PetscCall(PetscLogGpuTimeEnd()); 1489 } 1490 } 1491 PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0))); 1492 PetscFunctionReturn(PETSC_SUCCESS); 1493 } 1494 1495 template <device::cupm::DeviceType T> 1496 inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept 1497 { 1498 const auto hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt; 1499 PetscDeviceContext dctx; 1500 1501 PetscFunctionBegin; 1502 PetscCall(GetHandles_(&dctx)); 1503 // do not call SetPreallocation() yet, we call it afterwards?? 1504 PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false)); 1505 PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt)); 1506 if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN)); 1507 // allocate memory if needed 1508 if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr)); 1509 PetscFunctionReturn(PETSC_SUCCESS); 1510 } 1511 1512 template <device::cupm::DeviceType T> 1513 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept 1514 { 1515 PetscBool device; 1516 1517 PetscFunctionBegin; 1518 PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device)); 1519 if (device) { 1520 const auto m = A->rmap->n; 1521 const auto n = A->cmap->n; 1522 PetscDeviceContext dctx; 1523 1524 PetscCall(GetHandles_(&dctx)); 1525 { 1526 const auto a = DeviceArrayWrite(dctx, A); 1527 PetscInt lda; 1528 1529 PetscCall(MatDenseGetLDA(A, &lda)); 1530 if (lda > m) { 1531 for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda)); 1532 } else { 1533 PetscInt mn; 1534 1535 PetscCall(PetscIntMultError(m, n, &mn)); 1536 PetscCall(PetscRandomGetValues(rng, mn, a)); 1537 } 1538 } 1539 } else { 1540 PetscCall(MatSetRandom_SeqDense(A, rng)); 1541 } 1542 PetscFunctionReturn(PETSC_SUCCESS); 1543 } 1544 1545 // ========================================================================================== 1546 1547 template <device::cupm::DeviceType T> 1548 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept 1549 { 1550 const auto offloadmask = A->offloadmask; 1551 const auto n = A->rmap->n; 1552 const auto col_offset = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; }; 1553 PetscBool viscupm; 1554 PetscDeviceContext dctx; 1555 cupmStream_t stream; 1556 1557 PetscFunctionBegin; 1558 PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), "")); 1559 PetscCall(GetHandles_(&dctx, &stream)); 1560 if (viscupm && !v->boundtocpu) { 1561 const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v); 1562 1563 // update device data 1564 if (PetscOffloadDevice(offloadmask)) { 1565 PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream)); 1566 } else { 1567 PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream)); 1568 } 1569 } else { 1570 PetscScalar *x; 1571 1572 // update host data 1573 PetscCall(VecGetArrayWrite(v, &x)); 1574 if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) { 1575 PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n)); 1576 } else if (PetscOffloadDevice(offloadmask)) { 1577 PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream)); 1578 } 1579 PetscCall(VecRestoreArrayWrite(v, &x)); 1580 } 1581 PetscFunctionReturn(PETSC_SUCCESS); 1582 } 1583 1584 template <device::cupm::DeviceType T> 1585 template <PetscMemoryAccessMode access> 1586 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept 1587 { 1588 using namespace vec::cupm; 1589 const auto mimpl = MatIMPLCast(A); 1590 PetscDeviceContext dctx; 1591 1592 PetscFunctionBegin; 1593 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 1594 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 1595 mimpl->vecinuse = col + 1; 1596 if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec)); 1597 PetscCall(GetHandles_(&dctx)); 1598 PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx)); 1599 PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda))); 1600 if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec)); 1601 *v = mimpl->cvec; 1602 PetscFunctionReturn(PETSC_SUCCESS); 1603 } 1604 1605 template <device::cupm::DeviceType T> 1606 template <PetscMemoryAccessMode access> 1607 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept 1608 { 1609 using namespace vec::cupm; 1610 const auto mimpl = MatIMPLCast(A); 1611 const auto cvec = mimpl->cvec; 1612 PetscDeviceContext dctx; 1613 1614 PetscFunctionBegin; 1615 PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first"); 1616 PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector"); 1617 mimpl->vecinuse = 0; 1618 if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec)); 1619 PetscCall(VecCUPMResetArrayAsync<T>(cvec)); 1620 PetscCall(GetHandles_(&dctx)); 1621 PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx)); 1622 if (v) *v = nullptr; 1623 PetscFunctionReturn(PETSC_SUCCESS); 1624 } 1625 1626 // ========================================================================================== 1627 1628 template <device::cupm::DeviceType T> 1629 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept 1630 { 1631 Mat fact = nullptr; 1632 PetscDeviceContext dctx; 1633 1634 PetscFunctionBegin; 1635 PetscCall(GetHandles_(&dctx)); 1636 PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false)); 1637 fact->factortype = ftype; 1638 switch (ftype) { 1639 case MAT_FACTOR_LU: 1640 case MAT_FACTOR_ILU: // fall-through 1641 fact->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense; 1642 fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense; 1643 break; 1644 case MAT_FACTOR_CHOLESKY: 1645 case MAT_FACTOR_ICC: // fall-through 1646 fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense; 1647 break; 1648 case MAT_FACTOR_QR: { 1649 const auto pobj = PetscObjectCast(fact); 1650 1651 PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense)); 1652 PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense)); 1653 } break; 1654 case MAT_FACTOR_NONE: 1655 case MAT_FACTOR_ILUDT: // fall-through 1656 case MAT_FACTOR_NUM_TYPES: // fall-through 1657 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]); 1658 } 1659 PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype)); 1660 PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU)); 1661 PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU)); 1662 PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY)); 1663 PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC)); 1664 *fact_out = fact; 1665 PetscFunctionReturn(PETSC_SUCCESS); 1666 } 1667 1668 template <device::cupm::DeviceType T> 1669 inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept 1670 { 1671 const auto mimpl = MatIMPLCast(A); 1672 const auto mcu = MatCUPMCast(A); 1673 const auto n = static_cast<cupmBlasInt_t>(A->cmap->n); 1674 cupmSolverHandle_t handle; 1675 PetscDeviceContext dctx; 1676 cupmStream_t stream; 1677 1678 PetscFunctionBegin; 1679 #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC) 1680 // HIP appears to have this by default?? 1681 PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher"); 1682 #endif 1683 if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS); 1684 PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]); 1685 // spd 1686 PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName()); 1687 1688 PetscCall(GetHandles_(&dctx, &handle, &stream)); 1689 { 1690 const auto da = DeviceArrayReadWrite(dctx, A); 1691 const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda); 1692 cupmBlasInt_t il; 1693 1694 PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il)); 1695 if (il > mcu->d_fact_lwork) { 1696 mcu->d_fact_lwork = il; 1697 PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream)); 1698 PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream)); 1699 } 1700 PetscCall(PetscLogGpuTimeBegin()); 1701 PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info)); 1702 PetscCall(PetscLogGpuTimeEnd()); 1703 } 1704 PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream)); 1705 // TODO (write cuda kernel) 1706 PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE)); 1707 PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0)); 1708 1709 A->ops->solve = nullptr; 1710 A->ops->solvetranspose = nullptr; 1711 A->ops->matsolve = nullptr; 1712 A->factortype = MAT_FACTOR_NONE; 1713 1714 PetscCall(PetscFree(A->solvertype)); 1715 PetscFunctionReturn(PETSC_SUCCESS); 1716 } 1717 1718 // ========================================================================================== 1719 1720 template <device::cupm::DeviceType T> 1721 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept 1722 { 1723 const auto mimpl = MatIMPLCast(A); 1724 const auto array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; }; 1725 const auto n = rend - rbegin; 1726 const auto m = cend - cbegin; 1727 auto &cmat = mimpl->cmat; 1728 PetscDeviceContext dctx; 1729 1730 PetscFunctionBegin; 1731 PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first"); 1732 PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first"); 1733 mimpl->matinuse = cbegin + 1; 1734 1735 PetscCall(GetHandles_(&dctx)); 1736 PetscCall(HostToDevice_(A, dctx)); 1737 1738 if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat)); 1739 { 1740 const auto device_array = array_offset(MatCUPMCast(A)->d_v); 1741 1742 if (cmat) { 1743 PetscCall(PlaceArray(cmat, device_array)); 1744 } else { 1745 PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx)); 1746 } 1747 } 1748 PetscCall(MatDenseSetLDA(cmat, mimpl->lda)); 1749 // place CPU array if present but do not copy any data 1750 if (const auto host_array = mimpl->v) { 1751 cmat->offloadmask = PETSC_OFFLOAD_GPU; 1752 PetscCall(MatDensePlaceArray(cmat, array_offset(host_array))); 1753 } 1754 1755 cmat->offloadmask = A->offloadmask; 1756 *mat = cmat; 1757 PetscFunctionReturn(PETSC_SUCCESS); 1758 } 1759 1760 template <device::cupm::DeviceType T> 1761 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept 1762 { 1763 const auto mimpl = MatIMPLCast(A); 1764 const auto cmat = mimpl->cmat; 1765 const auto reset = static_cast<bool>(mimpl->v); 1766 bool copy, was_offload_host; 1767 1768 PetscFunctionBegin; 1769 PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first"); 1770 PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix"); 1771 PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()"); 1772 mimpl->matinuse = 0; 1773 1774 // calls to ResetArray may change it, so save it here 1775 was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU; 1776 if (was_offload_host && !reset) { 1777 copy = true; 1778 PetscCall(MatSeqDenseSetPreallocation(A, nullptr)); 1779 } else { 1780 copy = false; 1781 } 1782 1783 PetscCall(ResetArray(cmat)); 1784 if (reset) PetscCall(MatDenseResetArray(cmat)); 1785 if (copy) { 1786 PetscDeviceContext dctx; 1787 1788 PetscCall(GetHandles_(&dctx)); 1789 PetscCall(DeviceToHost_(A, dctx)); 1790 } else { 1791 A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU; 1792 } 1793 1794 cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 1795 *m = nullptr; 1796 PetscFunctionReturn(PETSC_SUCCESS); 1797 } 1798 1799 // ========================================================================================== 1800 1801 namespace 1802 { 1803 1804 template <device::cupm::DeviceType T> 1805 inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept 1806 { 1807 PetscFunctionBegin; 1808 if (TA) { 1809 if (TB) { 1810 PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C)); 1811 } else { 1812 PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C)); 1813 } 1814 } else { 1815 if (TB) { 1816 PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C)); 1817 } else { 1818 PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C)); 1819 } 1820 } 1821 PetscFunctionReturn(PETSC_SUCCESS); 1822 } 1823 1824 template <device::cupm::DeviceType T> 1825 inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept 1826 { 1827 PetscFunctionBegin; 1828 for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) { 1829 PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor)); 1830 PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor)); 1831 } 1832 PetscFunctionReturn(PETSC_SUCCESS); 1833 } 1834 1835 } // anonymous namespace 1836 1837 } // namespace impl 1838 1839 } // namespace cupm 1840 1841 } // namespace mat 1842 1843 } // namespace Petsc 1844 1845 #endif // PETSCMATSEQDENSECUPM_HPP 1846