xref: /petsc/src/mat/impls/dense/seq/cupm/matseqdensecupm.hpp (revision 98d129c30f3ee9fdddc40fdbc5a989b7be64f888)
1 #pragma once
2 
3 #include <petsc/private/matdensecupmimpl.h> /*I <petscmat.h> I*/
4 #include <../src/mat/impls/dense/seq/dense.h>
5 
6 #include <petsc/private/deviceimpl.h> // PetscDeviceContextGetOptionalNullContext_Internal()
7 #include <petsc/private/randomimpl.h> // _p_PetscRandom
8 #include <petsc/private/vecimpl.h>    // _p_Vec
9 #include <petsc/private/cupmobject.hpp>
10 #include <petsc/private/cupmsolverinterface.hpp>
11 
12 #include <petsc/private/cpp/type_traits.hpp> // PetscObjectCast()
13 #include <petsc/private/cpp/utility.hpp>     // util::exchange()
14 
15 #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp> // for VecSeq_CUPM
16 
17 namespace Petsc
18 {
19 
20 namespace mat
21 {
22 
23 namespace cupm
24 {
25 
26 namespace impl
27 {
28 
29 template <device::cupm::DeviceType T>
30 class MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
31 public:
32   MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);
33 
34 private:
35   struct Mat_SeqDenseCUPM {
36     PetscScalar *d_v;           // pointer to the matrix on the GPU
37     PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
38     bool         d_user_alloc;
39     bool         d_unplaced_user_alloc;
40     // factorization support
41     cupmBlasInt_t *d_fact_ipiv;  // device pivots
42     cupmScalar_t  *d_fact_tau;   // device QR tau vector
43     cupmBlasInt_t *d_fact_info;  // device info
44     cupmScalar_t  *d_fact_work;  // device workspace
45     cupmBlasInt_t  d_fact_lwork; // size of device workspace
46     // workspace
47     Vec workvec;
48   };
49 
50   static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;
51 
52   static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
53   static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;
54 
55   static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;
56 
57   template <typename Derived>
58   struct SolveCommon;
59   struct SolveQR;
60   struct SolveCholesky;
61   struct SolveLU;
62 
63   template <typename Solver, bool transpose>
64   static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
65   template <typename Solver, bool transpose>
66   static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
67   template <bool transpose, bool hermitian>
68   static PetscErrorCode MatMultAddColumnRange_Dispatch_(Mat, Vec, Vec, Vec, PetscInt, PetscInt) noexcept;
69   template <bool transpose, bool hermitian>
70   static PetscErrorCode MatMultColumnRange_Dispatch_(Mat, Vec, Vec, PetscInt, PetscInt) noexcept;
71   template <bool transpose, bool hermitian>
72   static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;
73 
74   template <bool to_host>
75   static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;
76 
77   PETSC_NODISCARD static constexpr MatType       MATIMPLCUPM_() noexcept;
78   PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;
79 
80 public:
81   PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;
82 
83   // define these by hand since they don't fit the above mold
84   PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
85   PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;
86 
87   static PetscErrorCode Create(Mat) noexcept;
88   static PetscErrorCode Destroy(Mat) noexcept;
89   static PetscErrorCode SetUp(Mat) noexcept;
90   static PetscErrorCode Reset(Mat) noexcept;
91 
92   static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
93   static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
94   static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;
95 
96   template <PetscMemType, PetscMemoryAccessMode>
97   static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
98   template <PetscMemType, PetscMemoryAccessMode>
99   static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
100   template <PetscMemoryAccessMode>
101   static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
102   template <PetscMemoryAccessMode>
103   static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;
104 
105 private:
106   template <PetscMemType mtype, PetscMemoryAccessMode mode>
107   static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
108   {
109     PetscDeviceContext dctx;
110 
111     PetscFunctionBegin;
112     PetscCall(GetHandles_(&dctx));
113     PetscCall(GetArray<mtype, mode>(m, p, dctx));
114     PetscFunctionReturn(PETSC_SUCCESS);
115   }
116 
117   template <PetscMemType mtype, PetscMemoryAccessMode mode>
118   static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
119   {
120     PetscDeviceContext dctx;
121 
122     PetscFunctionBegin;
123     PetscCall(GetHandles_(&dctx));
124     PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
125     PetscFunctionReturn(PETSC_SUCCESS);
126   }
127 
128   template <PetscMemoryAccessMode mode>
129   static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
130   {
131     PetscDeviceContext dctx;
132 
133     PetscFunctionBegin;
134     PetscCall(GetHandles_(&dctx));
135     PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
136     PetscFunctionReturn(PETSC_SUCCESS);
137   }
138 
139   template <PetscMemoryAccessMode mode>
140   static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
141   {
142     PetscDeviceContext dctx;
143 
144     PetscFunctionBegin;
145     PetscCall(GetHandles_(&dctx));
146     PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
147     PetscFunctionReturn(PETSC_SUCCESS);
148   }
149 
150 public:
151   static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
152   static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
153   static PetscErrorCode ResetArray(Mat) noexcept;
154 
155   template <bool transpose_A, bool transpose_B>
156   static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
157   static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
158   static PetscErrorCode ZeroEntries(Mat) noexcept;
159   static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
160   static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
161   static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
162   static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;
163 
164   static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
165   template <PetscMemoryAccessMode>
166   static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
167   template <PetscMemoryAccessMode>
168   static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;
169 
170   static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
171   static PetscErrorCode InvertFactors(Mat) noexcept;
172 
173   static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
174   static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
175 };
176 
177 } // namespace impl
178 
179 namespace
180 {
181 
182 // Declare this here so that the functions below can make use of it
183 template <device::cupm::DeviceType T>
184 inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
185 {
186   PetscFunctionBegin;
187   PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 } // anonymous namespace
192 
193 namespace impl
194 {
195 
196 // ==========================================================================================
197 // MatDense_Seq_CUPM - Private API - Utility
198 // ==========================================================================================
199 
200 template <device::cupm::DeviceType T>
201 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
202 {
203   const auto   mcu   = MatCUPMCast(m);
204   const auto   nrows = m->rmap->n;
205   const auto   ncols = m->cmap->n;
206   auto        &lda   = MatIMPLCast(m)->lda;
207   cupmStream_t stream;
208 
209   PetscFunctionBegin;
210   PetscCheckTypeName(m, MATSEQDENSECUPM());
211   PetscValidDeviceContext(dctx, 2);
212   PetscCall(checkCupmBlasIntCast(nrows));
213   PetscCall(checkCupmBlasIntCast(ncols));
214   PetscCall(GetHandlesFrom_(dctx, &stream));
215   if (lda <= 0) lda = nrows;
216   if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
217   if (user_device_array) {
218     mcu->d_user_alloc = PETSC_TRUE;
219     mcu->d_v          = user_device_array;
220   } else {
221     std::size_t size;
222 
223     mcu->d_user_alloc = PETSC_FALSE;
224     size              = lda * ncols;
225     PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
226     PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
227   }
228   m->offloadmask = PETSC_OFFLOAD_GPU;
229   PetscFunctionReturn(PETSC_SUCCESS);
230 }
231 
232 template <device::cupm::DeviceType T>
233 inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
234 {
235   const auto nrows = m->rmap->n;
236   const auto ncols = m->cmap->n;
237   const auto copy  = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;
238 
239   PetscFunctionBegin;
240   PetscCheckTypeName(m, MATSEQDENSECUPM());
241   if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
242   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
243   if (copy) {
244     const auto   mcu = MatCUPMCast(m);
245     cupmStream_t stream;
246 
247     // Allocate GPU memory if not present
248     if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr));
249     PetscCall(GetHandlesFrom_(dctx, &stream));
250     PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
251     {
252       const auto mimpl = MatIMPLCast(m);
253       const auto lda   = mimpl->lda;
254       const auto src   = mimpl->v;
255       const auto dest  = mcu->d_v;
256 
257       if (lda > nrows) {
258         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
259       } else {
260         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
261       }
262     }
263     PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
264     // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
265     m->offloadmask = PETSC_OFFLOAD_BOTH;
266   }
267   PetscFunctionReturn(PETSC_SUCCESS);
268 }
269 
270 template <device::cupm::DeviceType T>
271 inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
272 {
273   const auto nrows = m->rmap->n;
274   const auto ncols = m->cmap->n;
275   const auto copy  = m->offloadmask == PETSC_OFFLOAD_GPU;
276 
277   PetscFunctionBegin;
278   PetscCheckTypeName(m, MATSEQDENSECUPM());
279   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
280   if (copy) {
281     const auto   mimpl = MatIMPLCast(m);
282     cupmStream_t stream;
283 
284     // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
285     if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
286     PetscCall(GetHandlesFrom_(dctx, &stream));
287     PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
288     {
289       const auto lda  = mimpl->lda;
290       const auto dest = mimpl->v;
291       const auto src  = MatCUPMCast(m)->d_v;
292 
293       if (lda > nrows) {
294         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
295       } else {
296         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
297       }
298     }
299     PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
300     // order is important, MatSeqDenseSetPreallocation() might set offloadmask
301     m->offloadmask = PETSC_OFFLOAD_BOTH;
302   }
303   PetscFunctionReturn(PETSC_SUCCESS);
304 }
305 
306 template <device::cupm::DeviceType T>
307 inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
308 {
309   PetscFunctionBegin;
310   if (PetscDefined(USE_DEBUG)) {
311     cupmBlasInt_t info = 0;
312 
313     PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
314     if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
315     static_assert(std::is_same<decltype(info), int>::value, "");
316     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
317     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
318   }
319   PetscFunctionReturn(PETSC_SUCCESS);
320 }
321 
322 // ==========================================================================================
323 // MatDense_Seq_CUPM - Private API - Solver Dispatch
324 // ==========================================================================================
325 
326 // specific solvers called through the dispatch_() family of functions
327 template <device::cupm::DeviceType T>
328 template <typename Derived>
329 struct MatDense_Seq_CUPM<T>::SolveCommon {
330   using derived_type = Derived;
331 
332   template <typename F>
333   static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
334   {
335     cupmBlasInt_t lwork;
336 
337     PetscFunctionBegin;
338     PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
339     if (lwork > mcu->d_fact_lwork) {
340       mcu->d_fact_lwork = lwork;
341       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
342       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
343     }
344     PetscFunctionReturn(PETSC_SUCCESS);
345   }
346 
347   static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
348   {
349     const auto mcu = MatCUPMCast(A);
350 
351     PetscFunctionBegin;
352     PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
353     A->factortype             = derived_type::MATFACTORTYPE();
354     A->ops->solve             = MatSolve_Factored_Dispatch_<derived_type, false>;
355     A->ops->solvetranspose    = MatSolve_Factored_Dispatch_<derived_type, true>;
356     A->ops->matsolve          = MatMatSolve_Factored_Dispatch_<derived_type, false>;
357     A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;
358 
359     PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
360     if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
361     PetscFunctionReturn(PETSC_SUCCESS);
362   }
363 };
364 
365 template <device::cupm::DeviceType T>
366 struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
367   using base_type = SolveCommon<SolveLU>;
368 
369   static constexpr const char   *NAME() noexcept { return "LU"; }
370   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }
371 
372   static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
373   {
374     const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
375     const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
376     cupmStream_t       stream;
377     cupmSolverHandle_t handle;
378     PetscDeviceContext dctx;
379 
380     PetscFunctionBegin;
381     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
382     PetscCall(GetHandles_(&dctx, &handle, &stream));
383     PetscCall(base_type::FactorPrepare(A, stream));
384     {
385       const auto mcu = MatCUPMCast(A);
386       const auto da  = DeviceArrayReadWrite(dctx, A);
387       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
388 
389       // clang-format off
390       PetscCall(
391         base_type::ResizeFactLwork(
392           mcu, stream,
393           [&](cupmBlasInt_t *fact_lwork)
394           {
395             return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
396           }
397         )
398       );
399       // clang-format on
400       if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
401 
402       PetscCall(PetscLogGpuTimeBegin());
403       PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
404       PetscCall(PetscLogGpuTimeEnd());
405       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
406     }
407     PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
408     PetscFunctionReturn(PETSC_SUCCESS);
409   }
410 
411   template <bool transpose>
412   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
413   {
414     const auto         mcu       = MatCUPMCast(A);
415     const auto         fact_info = mcu->d_fact_info;
416     const auto         fact_ipiv = mcu->d_fact_ipiv;
417     cupmSolverHandle_t handle;
418 
419     PetscFunctionBegin;
420     PetscCall(GetHandlesFrom_(dctx, &handle));
421     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
422     PetscCall(PetscLogGpuTimeBegin());
423     {
424       constexpr auto op  = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
425       const auto     da  = DeviceArrayRead(dctx, A);
426       const auto     lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
427 
428       // clang-format off
429       PetscCall(
430         base_type::ResizeFactLwork(
431           mcu, stream,
432           [&](cupmBlasInt_t *lwork)
433           {
434             return cupmSolverXgetrs_bufferSize(
435               handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
436             );
437           }
438         )
439       );
440       // clang-format on
441       PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
442       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
443     }
444     PetscCall(PetscLogGpuTimeEnd());
445     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
446     PetscFunctionReturn(PETSC_SUCCESS);
447   }
448 };
449 
450 template <device::cupm::DeviceType T>
451 struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
452   using base_type = SolveCommon<SolveCholesky>;
453 
454   static constexpr const char   *NAME() noexcept { return "Cholesky"; }
455   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }
456 
457   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
458   {
459     const auto         n = static_cast<cupmBlasInt_t>(A->rmap->n);
460     PetscDeviceContext dctx;
461     cupmSolverHandle_t handle;
462     cupmStream_t       stream;
463 
464     PetscFunctionBegin;
465     if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
466     PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
467     PetscCall(GetHandles_(&dctx, &handle, &stream));
468     PetscCall(base_type::FactorPrepare(A, stream));
469     {
470       const auto mcu = MatCUPMCast(A);
471       const auto da  = DeviceArrayReadWrite(dctx, A);
472       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
473 
474       // clang-format off
475       PetscCall(
476         base_type::ResizeFactLwork(
477           mcu, stream,
478           [&](cupmBlasInt_t *fact_lwork)
479           {
480             return cupmSolverXpotrf_bufferSize(
481               handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
482             );
483           }
484         )
485       );
486       // clang-format on
487       PetscCall(PetscLogGpuTimeBegin());
488       PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
489       PetscCall(PetscLogGpuTimeEnd());
490       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
491     }
492     PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
493 
494 #if 0
495     // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
496     // and *hetr* routines. The code below should work, and it can be activated when *sytrs
497     // routines will be available
498     if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
499     if (!mcu->d_fact_lwork) {
500       PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
501       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
502     }
503     if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
504     PetscCall(PetscLogGpuTimeBegin());
505     PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
506     PetscCall(PetscLogGpuTimeEnd());
507 #endif
508     PetscFunctionReturn(PETSC_SUCCESS);
509   }
510 
511   template <bool transpose>
512   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
513   {
514     const auto         mcu       = MatCUPMCast(A);
515     const auto         fact_info = mcu->d_fact_info;
516     cupmSolverHandle_t handle;
517 
518     PetscFunctionBegin;
519     PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
520     PetscCall(GetHandlesFrom_(dctx, &handle));
521     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
522     PetscCall(PetscLogGpuTimeBegin());
523     {
524       const auto da  = DeviceArrayRead(dctx, A);
525       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
526 
527       // clang-format off
528       PetscCall(
529         base_type::ResizeFactLwork(
530           mcu, stream,
531           [&](cupmBlasInt_t *lwork)
532           {
533             return cupmSolverXpotrs_bufferSize(
534               handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
535             );
536           }
537         )
538       );
539       // clang-format on
540       PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
541       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
542     }
543     PetscCall(PetscLogGpuTimeEnd());
544     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
545     PetscFunctionReturn(PETSC_SUCCESS);
546   }
547 };
548 
549 template <device::cupm::DeviceType T>
550 struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
551   using base_type = SolveCommon<SolveQR>;
552 
553   static constexpr const char   *NAME() noexcept { return "QR"; }
554   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }
555 
556   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
557   {
558     const auto         m     = static_cast<cupmBlasInt_t>(A->rmap->n);
559     const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
560     const auto         min   = std::min(m, n);
561     const auto         mimpl = MatIMPLCast(A);
562     cupmStream_t       stream;
563     cupmSolverHandle_t handle;
564     PetscDeviceContext dctx;
565 
566     PetscFunctionBegin;
567     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
568     PetscCall(GetHandles_(&dctx, &handle, &stream));
569     PetscCall(base_type::FactorPrepare(A, stream));
570     mimpl->rank = min;
571     {
572       const auto mcu = MatCUPMCast(A);
573       const auto da  = DeviceArrayReadWrite(dctx, A);
574       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
575 
576       if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
577       if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
578       // clang-format off
579       PetscCall(
580         base_type::ResizeFactLwork(
581           mcu, stream,
582           [&](cupmBlasInt_t *fact_lwork)
583           {
584             return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
585           }
586         )
587       );
588       // clang-format on
589       PetscCall(PetscLogGpuTimeBegin());
590       PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
591       PetscCall(PetscLogGpuTimeEnd());
592       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
593     }
594     PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
595     PetscFunctionReturn(PETSC_SUCCESS);
596   }
597 
598   template <bool transpose>
599   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
600   {
601     const auto         mimpl      = MatIMPLCast(A);
602     const auto         rank       = static_cast<cupmBlasInt_t>(mimpl->rank);
603     const auto         mcu        = MatCUPMCast(A);
604     const auto         fact_info  = mcu->d_fact_info;
605     const auto         fact_tau   = mcu->d_fact_tau;
606     const auto         fact_work  = mcu->d_fact_work;
607     const auto         fact_lwork = mcu->d_fact_lwork;
608     cupmSolverHandle_t solver_handle;
609     cupmBlasHandle_t   blas_handle;
610 
611     PetscFunctionBegin;
612     PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
613     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
614     PetscCall(PetscLogGpuTimeBegin());
615     {
616       const auto da  = DeviceArrayRead(dctx, A);
617       const auto one = cupmScalarCast(1.0);
618       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
619 
620       if (transpose) {
621         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
622         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
623         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
624       } else {
625         constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;
626 
627         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
628         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
629         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
630       }
631     }
632     PetscCall(PetscLogGpuTimeEnd());
633     PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
634     PetscFunctionReturn(PETSC_SUCCESS);
635   }
636 };
637 
638 template <device::cupm::DeviceType T>
639 template <typename Solver, bool transpose>
640 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
641 {
642   using namespace vec::cupm;
643   const auto         pobj_A  = PetscObjectCast(A);
644   const auto         m       = static_cast<cupmBlasInt_t>(A->rmap->n);
645   const auto         k       = static_cast<cupmBlasInt_t>(A->cmap->n);
646   auto              &workvec = MatCUPMCast(A)->workvec;
647   PetscScalar       *y_array = nullptr;
648   PetscDeviceContext dctx;
649   PetscBool          xiscupm, yiscupm, aiscupm;
650   bool               use_y_array_directly;
651   cupmStream_t       stream;
652 
653   PetscFunctionBegin;
654   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
655   PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
656   PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
657   PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
658   PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
659   PetscCall(GetHandles_(&dctx, &stream));
660   use_y_array_directly = yiscupm && (k >= m);
661   {
662     const PetscScalar *x_array;
663     const auto         xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
664     const auto         copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
665 
666     if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
667     // The logic here is to try to minimize the amount of memory copying:
668     //
669     // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
670     // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
671     // data in order to copy it into the y array. So the array x will be wherever the data
672     // already is so that only one memcpy is performed
673     if (xisdevice) {
674       PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
675     } else {
676       PetscCall(VecGetArrayRead(x, &x_array));
677     }
678     PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
679     PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
680     if (xisdevice) {
681       PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
682     } else {
683       PetscCall(VecRestoreArrayRead(x, &x_array));
684     }
685   }
686 
687   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
688   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
689   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
690 
691   if (use_y_array_directly) {
692     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
693   } else {
694     const auto   copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
695     PetscScalar *yv;
696 
697     // The logic here is that the data is not yet in either y's GPU array or its CPU array.
698     // There is nothing in the interface to say where the user would like it to end up. So we
699     // choose the GPU, because it is the faster option
700     if (yiscupm) {
701       PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
702     } else {
703       PetscCall(VecGetArray(y, &yv));
704     }
705     PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
706     if (yiscupm) {
707       PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
708     } else {
709       PetscCall(VecRestoreArray(y, &yv));
710     }
711     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
712   }
713   PetscFunctionReturn(PETSC_SUCCESS);
714 }
715 
716 template <device::cupm::DeviceType T>
717 template <typename Solver, bool transpose>
718 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
719 {
720   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
721   const auto         k = static_cast<cupmBlasInt_t>(A->cmap->n);
722   cupmBlasInt_t      nrhs, ldb, ldx, ldy;
723   PetscScalar       *y;
724   PetscBool          biscupm, xiscupm, aiscupm;
725   PetscDeviceContext dctx;
726   cupmStream_t       stream;
727 
728   PetscFunctionBegin;
729   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
730   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
731   PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
732   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
733   PetscCall(GetHandles_(&dctx, &stream));
734   {
735     PetscInt n;
736 
737     PetscCall(MatGetSize(B, nullptr, &n));
738     PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
739     PetscCall(MatDenseGetLDA(B, &n));
740     PetscCall(PetscCUPMBlasIntCast(n, &ldb));
741     PetscCall(MatDenseGetLDA(X, &n));
742     PetscCall(PetscCUPMBlasIntCast(n, &ldx));
743   }
744   {
745     // The logic here is to try to minimize the amount of memory copying:
746     //
747     // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
748     // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
749     // get the data in order to copy it into the y array. So the array b will be wherever the
750     // data already is so that only one memcpy is performed
751     const auto         bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
752     const auto         copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
753     const PetscScalar *b;
754 
755     if (bisdevice) {
756       b = DeviceArrayRead(dctx, B);
757     } else if (biscupm) {
758       b = HostArrayRead(dctx, B);
759     } else {
760       PetscCall(MatDenseGetArrayRead(B, &b));
761     }
762 
763     if (ldx < m || !xiscupm) {
764       // X's array cannot serve as the array (too small or not on device), B's array cannot
765       // serve as the array (const), so allocate a new array
766       ldy = m;
767       PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
768     } else {
769       // X's array should serve as the array
770       ldy = ldx;
771       y   = DeviceArrayWrite(dctx, X);
772     }
773     PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
774     if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
775   }
776 
777   // convert to CUPM twice??????????????????????????????????
778   // but A should already be CUPM??????????????????????????????????????
779   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
780   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
781   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
782 
783   if (ldx < m || !xiscupm) {
784     const auto   copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
785     PetscScalar *x;
786 
787     // The logic here is that the data is not yet in either X's GPU array or its CPU
788     // array. There is nothing in the interface to say where the user would like it to end up.
789     // So we choose the GPU, because it is the faster option
790     if (xiscupm) {
791       x = DeviceArrayWrite(dctx, X);
792     } else {
793       PetscCall(MatDenseGetArray(X, &x));
794     }
795     PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
796     if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
797     PetscCallCUPM(cupmFreeAsync(y, stream));
798   }
799   PetscFunctionReturn(PETSC_SUCCESS);
800 }
801 
802 template <device::cupm::DeviceType T>
803 template <bool transpose, bool hermitian>
804 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAddColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end) noexcept
805 {
806   const auto         m   = static_cast<cupmBlasInt_t>(A->rmap->n);
807   const auto         n   = static_cast<cupmBlasInt_t>(c_end - c_start);
808   const auto         lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
809   cupmBlasHandle_t   handle;
810   PetscDeviceContext dctx;
811 
812   PetscFunctionBegin;
813   if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add
814   if (!m || !n) {
815     // mult only
816     if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0));
817     PetscFunctionReturn(PETSC_SUCCESS);
818   }
819   PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
820   PetscCall(GetHandles_(&dctx, &handle));
821   {
822     constexpr auto op   = transpose ? (hermitian ? CUPMBLAS_OP_C : CUPMBLAS_OP_T) : CUPMBLAS_OP_N;
823     const auto     one  = cupmScalarCast(1.0);
824     const auto     zero = cupmScalarCast(0.0);
825     const auto     da   = DeviceArrayRead(dctx, A);
826     const auto     dxx  = VecSeq_CUPM::DeviceArrayRead(dctx, xx);
827     const auto     dzz  = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz);
828 
829     PetscCall(PetscLogGpuTimeBegin());
830     PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata() + c_start * lda, lda, dxx.cupmdata() + (transpose ? 0 : c_start), 1, (yy ? &one : &zero), dzz.cupmdata() + (transpose ? c_start : 0), 1));
831     PetscCall(PetscLogGpuTimeEnd());
832   }
833   PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
834   PetscFunctionReturn(PETSC_SUCCESS);
835 }
836 
837 template <device::cupm::DeviceType T>
838 template <bool transpose, bool hermitian>
839 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end) noexcept
840 {
841   PetscFunctionBegin;
842   PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, nullptr, yy, c_start, c_end));
843   PetscFunctionReturn(PETSC_SUCCESS);
844 }
845 
846 template <device::cupm::DeviceType T>
847 template <bool transpose, bool hermitian>
848 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
849 {
850   PetscFunctionBegin;
851   PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, yy, zz, 0, A->cmap->n));
852   PetscFunctionReturn(PETSC_SUCCESS);
853 }
854 
855 // ==========================================================================================
856 // MatDense_Seq_CUPM - Private API - Conversion Dispatch
857 // ==========================================================================================
858 
859 template <device::cupm::DeviceType T>
860 template <bool to_host>
861 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
862 {
863   PetscFunctionBegin;
864   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
865     // TODO these cases should be optimized
866     PetscCall(MatConvert_Basic(M, type, reuse, newmat));
867   } else {
868     const auto B    = *newmat;
869     const auto pobj = PetscObjectCast(B);
870 
871     if (to_host) {
872       PetscCall(BindToCPU(B, PETSC_TRUE));
873       PetscCall(Reset(B));
874     } else {
875       PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
876     }
877 
878     PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
879     PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
880     // cvec might be the wrong VecType, destroy and rebuild it if necessary
881     // REVIEW ME: this is possibly very inefficient
882     PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));
883 
884     MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
885     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
886     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
887     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
888     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
889     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
890     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
891     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
892     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
893     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
894     MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
895     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);
896 
897     if (to_host) {
898       B->offloadmask = PETSC_OFFLOAD_CPU;
899     } else {
900       Mat_SeqDenseCUPM *mcu;
901 
902       PetscCall(PetscNew(&mcu));
903       B->spptr       = mcu;
904       B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
905       PetscCall(BindToCPU(B, PETSC_FALSE));
906     }
907 
908     MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
909     MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
910   }
911   PetscFunctionReturn(PETSC_SUCCESS);
912 }
913 
914 // ==========================================================================================
915 // MatDense_Seq_CUPM - Public API
916 // ==========================================================================================
917 
918 template <device::cupm::DeviceType T>
919 inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
920 {
921   return MATSEQDENSECUPM();
922 }
923 
924 template <device::cupm::DeviceType T>
925 inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
926 {
927   return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
928 }
929 
930 template <device::cupm::DeviceType T>
931 inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
932 {
933   return static_cast<Mat_SeqDense *>(m->data);
934 }
935 
936 template <device::cupm::DeviceType T>
937 inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
938 {
939   return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
940 }
941 
942 template <device::cupm::DeviceType T>
943 inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
944 {
945   return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
946 }
947 
948 // ==========================================================================================
949 
950 // MatCreate_SeqDenseCUPM()
951 template <device::cupm::DeviceType T>
952 inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
953 {
954   PetscFunctionBegin;
955   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
956   PetscCall(MatCreate_SeqDense(A));
957   PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
958   PetscFunctionReturn(PETSC_SUCCESS);
959 }
960 
961 template <device::cupm::DeviceType T>
962 inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
963 {
964   PetscFunctionBegin;
965   // prevent copying back data if we own the data pointer
966   if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
967   PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
968   PetscCall(MatDestroy_SeqDense(A));
969   PetscFunctionReturn(PETSC_SUCCESS);
970 }
971 
972 // obj->ops->setup()
973 template <device::cupm::DeviceType T>
974 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
975 {
976   PetscFunctionBegin;
977   PetscCall(PetscLayoutSetUp(A->rmap));
978   PetscCall(PetscLayoutSetUp(A->cmap));
979   if (!A->preallocated) {
980     PetscDeviceContext dctx;
981 
982     PetscCall(GetHandles_(&dctx));
983     PetscCall(SetPreallocation(A, dctx, nullptr));
984   }
985   PetscFunctionReturn(PETSC_SUCCESS);
986 }
987 
988 template <device::cupm::DeviceType T>
989 inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
990 {
991   PetscFunctionBegin;
992   if (const auto mcu = MatCUPMCast(A)) {
993     cupmStream_t stream;
994 
995     PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
996     PetscCall(GetHandles_(&stream));
997     if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
998     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
999     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
1000     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
1001     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1002     PetscCall(VecDestroy(&mcu->workvec));
1003     PetscCall(PetscFree(A->spptr /* mcu */));
1004   }
1005   PetscFunctionReturn(PETSC_SUCCESS);
1006 }
1007 
1008 // ==========================================================================================
1009 
1010 template <device::cupm::DeviceType T>
1011 inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
1012 {
1013   const auto mimpl = MatIMPLCast(A);
1014   const auto pobj  = PetscObjectCast(A);
1015 
1016   PetscFunctionBegin;
1017   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1018   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1019   A->boundtocpu = to_host;
1020   PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
1021   if (to_host) {
1022     PetscDeviceContext dctx;
1023 
1024     // make sure we have an up-to-date copy on the CPU
1025     PetscCall(GetHandles_(&dctx));
1026     PetscCall(DeviceToHost_(A, dctx));
1027   } else {
1028     PetscBool iscupm;
1029 
1030     if (auto &cvec = mimpl->cvec) {
1031       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
1032       if (!iscupm) PetscCall(VecDestroy(&cvec));
1033     }
1034     if (auto &cmat = mimpl->cmat) {
1035       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
1036       if (!iscupm) PetscCall(MatDestroy(&cmat));
1037     }
1038   }
1039 
1040   // ============================================================
1041   // Composed ops
1042   // ============================================================
1043   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1044   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
1045   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1046   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1047   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1048   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1049   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1050   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1051   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1052   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1053   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1054   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
1055   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
1056   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1057   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1058   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
1059   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
1060   MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
1061   MatComposeOp_CUPM(to_host, pobj, "MatMultAddColumnRange_C", MatMultAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>);
1062   MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeColumnRange_C", MatMultHermitianTransposeColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
1063   MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeAddColumnRange_C", MatMultHermitianTransposeAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
1064   // always the same
1065   PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));
1066 
1067   // ============================================================
1068   // Function pointer ops
1069   // ============================================================
1070   MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
1071   MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>(A, xx, nullptr, yy); });
1072   MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>(A, xx, nullptr, yy); });
1073   MatSetOp_CUPM(to_host, A, multhermitiantranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>(A, xx, nullptr, yy); });
1074   MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>);
1075   MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>);
1076   MatSetOp_CUPM(to_host, A, multhermitiantransposeadd, MatMultHermitianTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>);
1077   MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
1078   MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
1079   MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
1080   MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
1081   MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
1082   MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
1083   MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
1084   MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
1085   MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
1086   MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
1087   MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
1088   MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
1089   MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
1090   MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal);
1091   // seemingly always the same
1092   A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1093 
1094   if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
1095   PetscFunctionReturn(PETSC_SUCCESS);
1096 }
1097 
1098 template <device::cupm::DeviceType T>
1099 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1100 {
1101   PetscFunctionBegin;
1102   PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
1103   PetscFunctionReturn(PETSC_SUCCESS);
1104 }
1105 
1106 template <device::cupm::DeviceType T>
1107 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1108 {
1109   PetscFunctionBegin;
1110   PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
1111   PetscFunctionReturn(PETSC_SUCCESS);
1112 }
1113 
1114 // ==========================================================================================
1115 
1116 template <device::cupm::DeviceType T>
1117 template <PetscMemType mtype, PetscMemoryAccessMode access>
1118 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1119 {
1120   constexpr auto hostmem     = PetscMemTypeHost(mtype);
1121   constexpr auto read_access = PetscMemoryAccessRead(access);
1122 
1123   PetscFunctionBegin;
1124   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1125   PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
1126   if (hostmem) {
1127     if (read_access) {
1128       PetscCall(DeviceToHost_(m, dctx));
1129     } else if (!MatIMPLCast(m)->v) {
1130       // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
1131       PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
1132     }
1133     *array = MatIMPLCast(m)->v;
1134   } else {
1135     if (read_access) {
1136       PetscCall(HostToDevice_(m, dctx));
1137     } else if (!MatCUPMCast(m)->d_v) {
1138       // write-only
1139       PetscCall(SetPreallocation(m, dctx, nullptr));
1140     }
1141     *array = MatCUPMCast(m)->d_v;
1142   }
1143   if (PetscMemoryAccessWrite(access)) {
1144     m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1145     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1146   }
1147   PetscFunctionReturn(PETSC_SUCCESS);
1148 }
1149 
1150 template <device::cupm::DeviceType T>
1151 template <PetscMemType mtype, PetscMemoryAccessMode access>
1152 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
1153 {
1154   PetscFunctionBegin;
1155   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1156   if (PetscMemoryAccessWrite(access)) {
1157     // WRITE or READ_WRITE
1158     m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1159     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1160   }
1161   if (array) {
1162     PetscCall(CheckPointerMatchesMemType_(*array, mtype));
1163     *array = nullptr;
1164   }
1165   PetscFunctionReturn(PETSC_SUCCESS);
1166 }
1167 
1168 template <device::cupm::DeviceType T>
1169 template <PetscMemoryAccessMode access>
1170 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
1171 {
1172   PetscFunctionBegin;
1173   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1174   if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
1175   PetscFunctionReturn(PETSC_SUCCESS);
1176 }
1177 
1178 template <device::cupm::DeviceType T>
1179 template <PetscMemoryAccessMode access>
1180 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1181 {
1182   PetscFunctionBegin;
1183   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1184   PetscFunctionReturn(PETSC_SUCCESS);
1185 }
1186 
1187 // ==========================================================================================
1188 
1189 template <device::cupm::DeviceType T>
1190 inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
1191 {
1192   const auto mimpl = MatIMPLCast(A);
1193   const auto mcu   = MatCUPMCast(A);
1194 
1195   PetscFunctionBegin;
1196   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1197   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1198   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1199   if (mimpl->v) {
1200     PetscDeviceContext dctx;
1201 
1202     PetscCall(GetHandles_(&dctx));
1203     PetscCall(HostToDevice_(A, dctx));
1204   }
1205   mcu->unplacedarray         = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
1206   mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
1207   PetscFunctionReturn(PETSC_SUCCESS);
1208 }
1209 
1210 template <device::cupm::DeviceType T>
1211 inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
1212 {
1213   const auto mimpl = MatIMPLCast(A);
1214   const auto mcu   = MatCUPMCast(A);
1215 
1216   PetscFunctionBegin;
1217   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1218   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1219   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1220   if (!mcu->d_user_alloc) {
1221     cupmStream_t stream;
1222 
1223     PetscCall(GetHandles_(&stream));
1224     PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1225   }
1226   mcu->d_v          = const_cast<PetscScalar *>(array);
1227   mcu->d_user_alloc = PETSC_FALSE;
1228   PetscFunctionReturn(PETSC_SUCCESS);
1229 }
1230 
1231 template <device::cupm::DeviceType T>
1232 inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
1233 {
1234   const auto mimpl = MatIMPLCast(A);
1235   const auto mcu   = MatCUPMCast(A);
1236 
1237   PetscFunctionBegin;
1238   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1239   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1240   if (mimpl->v) {
1241     PetscDeviceContext dctx;
1242 
1243     PetscCall(GetHandles_(&dctx));
1244     PetscCall(HostToDevice_(A, dctx));
1245   }
1246   mcu->d_v          = util::exchange(mcu->unplacedarray, nullptr);
1247   mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
1248   PetscFunctionReturn(PETSC_SUCCESS);
1249 }
1250 
1251 // ==========================================================================================
1252 
1253 template <device::cupm::DeviceType T>
1254 template <bool transpose_A, bool transpose_B>
1255 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
1256 {
1257   cupmBlasInt_t      m, n, k;
1258   PetscBool          Aiscupm, Biscupm;
1259   PetscDeviceContext dctx;
1260   cupmBlasHandle_t   handle;
1261 
1262   PetscFunctionBegin;
1263   PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
1264   PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
1265   PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
1266   if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);
1267 
1268   // we may end up with SEQDENSE as one of the arguments
1269   // REVIEW ME: how? and why is it not B and C????????
1270   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
1271   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
1272   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
1273   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
1274   PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
1275   PetscCall(GetHandles_(&dctx, &handle));
1276 
1277   PetscCall(PetscLogGpuTimeBegin());
1278   {
1279     const auto one  = cupmScalarCast(1.0);
1280     const auto zero = cupmScalarCast(0.0);
1281     const auto da   = DeviceArrayRead(dctx, A);
1282     const auto db   = DeviceArrayRead(dctx, B);
1283     const auto dc   = DeviceArrayWrite(dctx, C);
1284     PetscInt   alda, blda, clda;
1285 
1286     PetscCall(MatDenseGetLDA(A, &alda));
1287     PetscCall(MatDenseGetLDA(B, &blda));
1288     PetscCall(MatDenseGetLDA(C, &clda));
1289     PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
1290   }
1291   PetscCall(PetscLogGpuTimeEnd());
1292 
1293   PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
1294   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1295   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1296   PetscFunctionReturn(PETSC_SUCCESS);
1297 }
1298 
1299 template <device::cupm::DeviceType T>
1300 inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
1301 {
1302   const auto m = A->rmap->n;
1303   const auto n = A->cmap->n;
1304 
1305   PetscFunctionBegin;
1306   PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
1307   // The two matrices must have the same copy implementation to be eligible for fast copy
1308   if (A->ops->copy == B->ops->copy) {
1309     PetscDeviceContext dctx;
1310     cupmStream_t       stream;
1311 
1312     PetscCall(GetHandles_(&dctx, &stream));
1313     PetscCall(PetscLogGpuTimeBegin());
1314     {
1315       const auto va = DeviceArrayRead(dctx, A);
1316       const auto vb = DeviceArrayWrite(dctx, B);
1317       // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
1318       // lda!
1319       const auto lda_a = MatIMPLCast(A)->lda;
1320       const auto lda_b = MatIMPLCast(B)->lda;
1321 
1322       if (lda_a > m || lda_b > m) {
1323         PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
1324         PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
1325         PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
1326       } else {
1327         PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
1328       }
1329     }
1330     PetscCall(PetscLogGpuTimeEnd());
1331   } else {
1332     PetscCall(MatCopy_Basic(A, B, str));
1333   }
1334   PetscFunctionReturn(PETSC_SUCCESS);
1335 }
1336 
1337 template <device::cupm::DeviceType T>
1338 inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1339 {
1340   PetscDeviceContext dctx;
1341   cupmStream_t       stream;
1342 
1343   PetscFunctionBegin;
1344   PetscCall(GetHandles_(&dctx, &stream));
1345   PetscCall(PetscLogGpuTimeBegin());
1346   {
1347     const auto va  = DeviceArrayWrite(dctx, m);
1348     const auto lda = MatIMPLCast(m)->lda;
1349     const auto ma  = m->rmap->n;
1350     const auto na  = m->cmap->n;
1351 
1352     if (lda > ma) {
1353       PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1354     } else {
1355       PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1356     }
1357   }
1358   PetscCall(PetscLogGpuTimeEnd());
1359   PetscFunctionReturn(PETSC_SUCCESS);
1360 }
1361 
1362 namespace detail
1363 {
1364 
1365 // ==========================================================================================
1366 // SubMatIndexFunctor
1367 //
1368 // Iterator which permutes a linear index range into matrix indices for am nrows x ncols
1369 // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1370 // the i'th sequential entry in the matrix.
1371 // ==========================================================================================
1372 template <typename T>
1373 struct SubMatIndexFunctor {
1374   PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }
1375 
1376   PetscInt nrows;
1377   PetscInt ncols;
1378   PetscInt lda;
1379 };
1380 
1381 template <typename Iterator>
1382 struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>> {
1383   using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>>;
1384 
1385   using iterator = typename base_type::iterator;
1386 
1387   constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
1388     base_type{
1389       std::move(first), std::move(last), {nrows, ncols, lda}
1390   }
1391   {
1392   }
1393 
1394   PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
1395 };
1396 
1397 namespace
1398 {
1399 
1400 template <typename T>
1401 PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
1402 {
1403   const auto nrows = rend - rstart;
1404   const auto ncols = cend - cstart;
1405   const auto dptr  = thrust::device_pointer_cast(ptr);
1406 
1407   return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
1408 }
1409 
1410 } // namespace
1411 
1412 } // namespace detail
1413 
1414 template <device::cupm::DeviceType T>
1415 inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
1416 {
1417   const auto         m = A->rmap->n;
1418   const auto         n = A->cmap->n;
1419   const auto         N = m * n;
1420   PetscDeviceContext dctx;
1421 
1422   PetscFunctionBegin;
1423   PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1424   PetscCall(GetHandles_(&dctx));
1425   {
1426     const auto da  = DeviceArrayReadWrite(dctx, A);
1427     const auto lda = MatIMPLCast(A)->lda;
1428 
1429     if (lda > m) {
1430       cupmStream_t stream;
1431 
1432       PetscCall(GetHandlesFrom_(dctx, &stream));
1433       // clang-format off
1434       PetscCallThrust(
1435         const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());
1436 
1437         THRUST_CALL(
1438           thrust::transform,
1439           stream,
1440           sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1441           device::cupm::functors::make_times_equals(alpha)
1442         )
1443       );
1444       // clang-format on
1445     } else {
1446       const auto       cu_alpha = cupmScalarCast(alpha);
1447       cupmBlasHandle_t handle;
1448 
1449       PetscCall(GetHandlesFrom_(dctx, &handle));
1450       PetscCall(PetscLogGpuTimeBegin());
1451       PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
1452       PetscCall(PetscLogGpuTimeEnd());
1453     }
1454   }
1455   PetscCall(PetscLogGpuFlops(N));
1456   PetscFunctionReturn(PETSC_SUCCESS);
1457 }
1458 
1459 template <device::cupm::DeviceType T>
1460 inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
1461 {
1462   const auto         m_x = X->rmap->n, m_y = Y->rmap->n;
1463   const auto         n_x = X->cmap->n, n_y = Y->cmap->n;
1464   const auto         N = m_x * n_x;
1465   PetscDeviceContext dctx;
1466 
1467   PetscFunctionBegin;
1468   if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
1469   PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
1470   PetscCall(GetHandles_(&dctx));
1471   {
1472     const auto dx    = DeviceArrayRead(dctx, X);
1473     const auto dy    = DeviceArrayReadWrite(dctx, Y);
1474     const auto lda_x = MatIMPLCast(X)->lda;
1475     const auto lda_y = MatIMPLCast(Y)->lda;
1476 
1477     if (lda_x > m_x || lda_y > m_x) {
1478       cupmStream_t stream;
1479 
1480       PetscCall(GetHandlesFrom_(dctx, &stream));
1481       // clang-format off
1482       PetscCallThrust(
1483         const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
1484         const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());
1485 
1486         THRUST_CALL(
1487           thrust::transform,
1488           stream,
1489           sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
1490           device::cupm::functors::make_axpy(alpha)
1491         );
1492       );
1493       // clang-format on
1494     } else {
1495       const auto       cu_alpha = cupmScalarCast(alpha);
1496       cupmBlasHandle_t handle;
1497 
1498       PetscCall(GetHandlesFrom_(dctx, &handle));
1499       PetscCall(PetscLogGpuTimeBegin());
1500       PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
1501       PetscCall(PetscLogGpuTimeEnd());
1502     }
1503   }
1504   PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
1505   PetscFunctionReturn(PETSC_SUCCESS);
1506 }
1507 
1508 template <device::cupm::DeviceType T>
1509 inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
1510 {
1511   const auto         hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
1512   PetscDeviceContext dctx;
1513 
1514   PetscFunctionBegin;
1515   PetscCall(GetHandles_(&dctx));
1516   // do not call SetPreallocation() yet, we call it afterwards??
1517   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
1518   PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
1519   if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
1520   // allocate memory if needed
1521   if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
1522   PetscFunctionReturn(PETSC_SUCCESS);
1523 }
1524 
1525 template <device::cupm::DeviceType T>
1526 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
1527 {
1528   PetscBool device;
1529 
1530   PetscFunctionBegin;
1531   PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
1532   if (device) {
1533     const auto         m = A->rmap->n;
1534     const auto         n = A->cmap->n;
1535     PetscDeviceContext dctx;
1536 
1537     PetscCall(GetHandles_(&dctx));
1538     {
1539       const auto a = DeviceArrayWrite(dctx, A);
1540       PetscInt   lda;
1541 
1542       PetscCall(MatDenseGetLDA(A, &lda));
1543       if (lda > m) {
1544         for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
1545       } else {
1546         PetscInt mn;
1547 
1548         PetscCall(PetscIntMultError(m, n, &mn));
1549         PetscCall(PetscRandomGetValues(rng, mn, a));
1550       }
1551     }
1552   } else {
1553     PetscCall(MatSetRandom_SeqDense(A, rng));
1554   }
1555   PetscFunctionReturn(PETSC_SUCCESS);
1556 }
1557 
1558 // ==========================================================================================
1559 
1560 template <device::cupm::DeviceType T>
1561 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
1562 {
1563   const auto         offloadmask = A->offloadmask;
1564   const auto         n           = A->rmap->n;
1565   const auto         col_offset  = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
1566   PetscBool          viscupm;
1567   PetscDeviceContext dctx;
1568   cupmStream_t       stream;
1569 
1570   PetscFunctionBegin;
1571   PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
1572   PetscCall(GetHandles_(&dctx, &stream));
1573   if (viscupm && !v->boundtocpu) {
1574     const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);
1575 
1576     // update device data
1577     if (PetscOffloadDevice(offloadmask)) {
1578       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
1579     } else {
1580       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
1581     }
1582   } else {
1583     PetscScalar *x;
1584 
1585     // update host data
1586     PetscCall(VecGetArrayWrite(v, &x));
1587     if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
1588       PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
1589     } else if (PetscOffloadDevice(offloadmask)) {
1590       PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
1591     }
1592     PetscCall(VecRestoreArrayWrite(v, &x));
1593   }
1594   PetscFunctionReturn(PETSC_SUCCESS);
1595 }
1596 
1597 template <device::cupm::DeviceType T>
1598 template <PetscMemoryAccessMode access>
1599 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
1600 {
1601   using namespace vec::cupm;
1602   const auto         mimpl = MatIMPLCast(A);
1603   PetscDeviceContext dctx;
1604 
1605   PetscFunctionBegin;
1606   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1607   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1608   mimpl->vecinuse = col + 1;
1609   if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
1610   PetscCall(GetHandles_(&dctx));
1611   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1612   PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
1613   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
1614   *v = mimpl->cvec;
1615   PetscFunctionReturn(PETSC_SUCCESS);
1616 }
1617 
1618 template <device::cupm::DeviceType T>
1619 template <PetscMemoryAccessMode access>
1620 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
1621 {
1622   using namespace vec::cupm;
1623   const auto         mimpl = MatIMPLCast(A);
1624   const auto         cvec  = mimpl->cvec;
1625   PetscDeviceContext dctx;
1626 
1627   PetscFunctionBegin;
1628   PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1629   PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1630   mimpl->vecinuse = 0;
1631   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
1632   PetscCall(VecCUPMResetArrayAsync<T>(cvec));
1633   PetscCall(GetHandles_(&dctx));
1634   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1635   if (v) *v = nullptr;
1636   PetscFunctionReturn(PETSC_SUCCESS);
1637 }
1638 
1639 // ==========================================================================================
1640 
1641 template <device::cupm::DeviceType T>
1642 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
1643 {
1644   Mat                fact = nullptr;
1645   PetscDeviceContext dctx;
1646 
1647   PetscFunctionBegin;
1648   PetscCall(GetHandles_(&dctx));
1649   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
1650   fact->factortype = ftype;
1651   switch (ftype) {
1652   case MAT_FACTOR_LU:
1653   case MAT_FACTOR_ILU: // fall-through
1654     fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
1655     fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1656     break;
1657   case MAT_FACTOR_CHOLESKY:
1658   case MAT_FACTOR_ICC: // fall-through
1659     fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1660     break;
1661   case MAT_FACTOR_QR: {
1662     const auto pobj = PetscObjectCast(fact);
1663 
1664     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
1665     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1666   } break;
1667   case MAT_FACTOR_NONE:
1668   case MAT_FACTOR_ILUDT:     // fall-through
1669   case MAT_FACTOR_NUM_TYPES: // fall-through
1670     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
1671   }
1672   PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
1673   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
1674   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
1675   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
1676   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
1677   *fact_out = fact;
1678   PetscFunctionReturn(PETSC_SUCCESS);
1679 }
1680 
1681 template <device::cupm::DeviceType T>
1682 inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
1683 {
1684   const auto         mimpl = MatIMPLCast(A);
1685   const auto         mcu   = MatCUPMCast(A);
1686   const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
1687   cupmSolverHandle_t handle;
1688   PetscDeviceContext dctx;
1689   cupmStream_t       stream;
1690 
1691   PetscFunctionBegin;
1692 #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
1693   // HIP appears to have this by default??
1694   PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
1695 #endif
1696   if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1697   PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
1698   // spd
1699   PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());
1700 
1701   PetscCall(GetHandles_(&dctx, &handle, &stream));
1702   {
1703     const auto    da  = DeviceArrayReadWrite(dctx, A);
1704     const auto    lda = static_cast<cupmBlasInt_t>(mimpl->lda);
1705     cupmBlasInt_t il;
1706 
1707     PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
1708     if (il > mcu->d_fact_lwork) {
1709       mcu->d_fact_lwork = il;
1710       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1711       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
1712     }
1713     PetscCall(PetscLogGpuTimeBegin());
1714     PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
1715     PetscCall(PetscLogGpuTimeEnd());
1716   }
1717   PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
1718   // TODO (write cuda kernel)
1719   PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
1720   PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
1721 
1722   A->ops->solve          = nullptr;
1723   A->ops->solvetranspose = nullptr;
1724   A->ops->matsolve       = nullptr;
1725   A->factortype          = MAT_FACTOR_NONE;
1726 
1727   PetscCall(PetscFree(A->solvertype));
1728   PetscFunctionReturn(PETSC_SUCCESS);
1729 }
1730 
1731 // ==========================================================================================
1732 
1733 template <device::cupm::DeviceType T>
1734 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
1735 {
1736   const auto         mimpl        = MatIMPLCast(A);
1737   const auto         array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
1738   const auto         n            = rend - rbegin;
1739   const auto         m            = cend - cbegin;
1740   auto              &cmat         = mimpl->cmat;
1741   PetscDeviceContext dctx;
1742 
1743   PetscFunctionBegin;
1744   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1745   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1746   mimpl->matinuse = cbegin + 1;
1747 
1748   PetscCall(GetHandles_(&dctx));
1749   PetscCall(HostToDevice_(A, dctx));
1750 
1751   if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
1752   {
1753     const auto device_array = array_offset(MatCUPMCast(A)->d_v);
1754 
1755     if (cmat) {
1756       PetscCall(PlaceArray(cmat, device_array));
1757     } else {
1758       PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
1759     }
1760   }
1761   PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
1762   // place CPU array if present but do not copy any data
1763   if (const auto host_array = mimpl->v) {
1764     cmat->offloadmask = PETSC_OFFLOAD_GPU;
1765     PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
1766   }
1767 
1768   cmat->offloadmask = A->offloadmask;
1769   *mat              = cmat;
1770   PetscFunctionReturn(PETSC_SUCCESS);
1771 }
1772 
1773 template <device::cupm::DeviceType T>
1774 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
1775 {
1776   const auto mimpl = MatIMPLCast(A);
1777   const auto cmat  = mimpl->cmat;
1778   const auto reset = static_cast<bool>(mimpl->v);
1779   bool       copy, was_offload_host;
1780 
1781   PetscFunctionBegin;
1782   PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1783   PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1784   PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1785   mimpl->matinuse = 0;
1786 
1787   // calls to ResetArray may change it, so save it here
1788   was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
1789   if (was_offload_host && !reset) {
1790     copy = true;
1791     PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
1792   } else {
1793     copy = false;
1794   }
1795 
1796   PetscCall(ResetArray(cmat));
1797   if (reset) PetscCall(MatDenseResetArray(cmat));
1798   if (copy) {
1799     PetscDeviceContext dctx;
1800 
1801     PetscCall(GetHandles_(&dctx));
1802     PetscCall(DeviceToHost_(A, dctx));
1803   } else {
1804     A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1805   }
1806 
1807   cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1808   *m                = nullptr;
1809   PetscFunctionReturn(PETSC_SUCCESS);
1810 }
1811 
1812 // ==========================================================================================
1813 
1814 namespace
1815 {
1816 
1817 template <device::cupm::DeviceType T>
1818 inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1819 {
1820   PetscFunctionBegin;
1821   if (TA) {
1822     if (TB) {
1823       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1824     } else {
1825       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1826     }
1827   } else {
1828     if (TB) {
1829       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1830     } else {
1831       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1832     }
1833   }
1834   PetscFunctionReturn(PETSC_SUCCESS);
1835 }
1836 
1837 template <device::cupm::DeviceType T>
1838 inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
1839 {
1840   PetscFunctionBegin;
1841   for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
1842     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
1843     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
1844   }
1845   PetscFunctionReturn(PETSC_SUCCESS);
1846 }
1847 
1848 } // anonymous namespace
1849 
1850 } // namespace impl
1851 
1852 } // namespace cupm
1853 
1854 } // namespace mat
1855 
1856 } // namespace Petsc
1857