xref: /petsc/src/mat/impls/dense/seq/cupm/matseqdensecupm.hpp (revision 66af8762ec03dbef0e079729eb2a1734a35ed7ff)
1 #pragma once
2 
3 #include <petsc/private/matdensecupmimpl.h> /*I <petscmat.h> I*/
4 #include <../src/mat/impls/dense/seq/dense.h>
5 
6 #include <petsc/private/deviceimpl.h> // PetscDeviceContextGetOptionalNullContext_Internal()
7 #include <petsc/private/randomimpl.h> // _p_PetscRandom
8 #include <petsc/private/vecimpl.h>    // _p_Vec
9 #include <petsc/private/cupmobject.hpp>
10 #include <petsc/private/cupmsolverinterface.hpp>
11 
12 #include <petsc/private/cpp/type_traits.hpp> // PetscObjectCast()
13 #include <petsc/private/cpp/utility.hpp>     // util::exchange()
14 
15 #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp> // for VecSeq_CUPM
16 
17 namespace Petsc
18 {
19 
20 namespace mat
21 {
22 
23 namespace cupm
24 {
25 
26 namespace impl
27 {
28 
29 template <device::cupm::DeviceType T>
30 class MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
31 public:
32   MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);
33 
34 private:
35   struct Mat_SeqDenseCUPM {
36     PetscScalar *d_v;           // pointer to the matrix on the GPU
37     PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
38     bool         d_user_alloc;
39     bool         d_unplaced_user_alloc;
40     // factorization support
41     cupmBlasInt_t *d_fact_ipiv;  // device pivots
42     cupmScalar_t  *d_fact_tau;   // device QR tau vector
43     cupmBlasInt_t *d_fact_info;  // device info
44     cupmScalar_t  *d_fact_work;  // device workspace
45     cupmBlasInt_t  d_fact_lwork; // size of device workspace
46     // workspace
47     Vec workvec;
48   };
49 
50   static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;
51 
52   static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
53   static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;
54 
55   static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;
56 
57   template <typename Derived>
58   struct SolveCommon;
59   struct SolveQR;
60   struct SolveCholesky;
61   struct SolveLU;
62 
63   template <typename Solver, bool transpose>
64   static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
65   template <typename Solver, bool transpose>
66   static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
67   template <bool transpose>
68   static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;
69 
70   template <bool to_host>
71   static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;
72 
73   PETSC_NODISCARD static constexpr MatType       MATIMPLCUPM_() noexcept;
74   PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;
75 
76 public:
77   PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;
78 
79   // define these by hand since they don't fit the above mold
80   PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
81   PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;
82 
83   static PetscErrorCode Create(Mat) noexcept;
84   static PetscErrorCode Destroy(Mat) noexcept;
85   static PetscErrorCode SetUp(Mat) noexcept;
86   static PetscErrorCode Reset(Mat) noexcept;
87 
88   static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
89   static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
90   static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;
91 
92   template <PetscMemType, PetscMemoryAccessMode>
93   static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
94   template <PetscMemType, PetscMemoryAccessMode>
95   static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
96   template <PetscMemoryAccessMode>
97   static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
98   template <PetscMemoryAccessMode>
99   static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;
100 
101 private:
102   template <PetscMemType mtype, PetscMemoryAccessMode mode>
103   static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
104   {
105     PetscDeviceContext dctx;
106 
107     PetscFunctionBegin;
108     PetscCall(GetHandles_(&dctx));
109     PetscCall(GetArray<mtype, mode>(m, p, dctx));
110     PetscFunctionReturn(PETSC_SUCCESS);
111   }
112 
113   template <PetscMemType mtype, PetscMemoryAccessMode mode>
114   static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
115   {
116     PetscDeviceContext dctx;
117 
118     PetscFunctionBegin;
119     PetscCall(GetHandles_(&dctx));
120     PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
121     PetscFunctionReturn(PETSC_SUCCESS);
122   }
123 
124   template <PetscMemoryAccessMode mode>
125   static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
126   {
127     PetscDeviceContext dctx;
128 
129     PetscFunctionBegin;
130     PetscCall(GetHandles_(&dctx));
131     PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
132     PetscFunctionReturn(PETSC_SUCCESS);
133   }
134 
135   template <PetscMemoryAccessMode mode>
136   static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
137   {
138     PetscDeviceContext dctx;
139 
140     PetscFunctionBegin;
141     PetscCall(GetHandles_(&dctx));
142     PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
143     PetscFunctionReturn(PETSC_SUCCESS);
144   }
145 
146 public:
147   static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
148   static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
149   static PetscErrorCode ResetArray(Mat) noexcept;
150 
151   template <bool transpose_A, bool transpose_B>
152   static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
153   static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
154   static PetscErrorCode ZeroEntries(Mat) noexcept;
155   static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
156   static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
157   static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
158   static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;
159 
160   static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
161   template <PetscMemoryAccessMode>
162   static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
163   template <PetscMemoryAccessMode>
164   static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;
165 
166   static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
167   static PetscErrorCode InvertFactors(Mat) noexcept;
168 
169   static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
170   static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
171 };
172 
173 } // namespace impl
174 
175 namespace
176 {
177 
178 // Declare this here so that the functions below can make use of it
179 template <device::cupm::DeviceType T>
180 inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
181 {
182   PetscFunctionBegin;
183   PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
184   PetscFunctionReturn(PETSC_SUCCESS);
185 }
186 
187 } // anonymous namespace
188 
189 namespace impl
190 {
191 
192 // ==========================================================================================
193 // MatDense_Seq_CUPM - Private API - Utility
194 // ==========================================================================================
195 
196 template <device::cupm::DeviceType T>
197 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
198 {
199   const auto   mcu   = MatCUPMCast(m);
200   const auto   nrows = m->rmap->n;
201   const auto   ncols = m->cmap->n;
202   auto        &lda   = MatIMPLCast(m)->lda;
203   cupmStream_t stream;
204 
205   PetscFunctionBegin;
206   PetscCheckTypeName(m, MATSEQDENSECUPM());
207   PetscValidDeviceContext(dctx, 2);
208   PetscCall(checkCupmBlasIntCast(nrows));
209   PetscCall(checkCupmBlasIntCast(ncols));
210   PetscCall(GetHandlesFrom_(dctx, &stream));
211   if (lda <= 0) lda = nrows;
212   if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
213   if (user_device_array) {
214     mcu->d_user_alloc = PETSC_TRUE;
215     mcu->d_v          = user_device_array;
216   } else {
217     PetscInt size;
218 
219     mcu->d_user_alloc = PETSC_FALSE;
220     PetscCall(PetscIntMultError(lda, ncols, &size));
221     PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
222     PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
223   }
224   m->offloadmask = PETSC_OFFLOAD_GPU;
225   PetscFunctionReturn(PETSC_SUCCESS);
226 }
227 
228 template <device::cupm::DeviceType T>
229 inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
230 {
231   const auto nrows = m->rmap->n;
232   const auto ncols = m->cmap->n;
233   const auto copy  = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;
234 
235   PetscFunctionBegin;
236   PetscCheckTypeName(m, MATSEQDENSECUPM());
237   if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
238   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
239   if (copy) {
240     const auto   mcu = MatCUPMCast(m);
241     cupmStream_t stream;
242 
243     // Allocate GPU memory if not present
244     if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr));
245     PetscCall(GetHandlesFrom_(dctx, &stream));
246     PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
247     {
248       const auto mimpl = MatIMPLCast(m);
249       const auto lda   = mimpl->lda;
250       const auto src   = mimpl->v;
251       const auto dest  = mcu->d_v;
252 
253       if (lda > nrows) {
254         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
255       } else {
256         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
257       }
258     }
259     PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
260     // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
261     m->offloadmask = PETSC_OFFLOAD_BOTH;
262   }
263   PetscFunctionReturn(PETSC_SUCCESS);
264 }
265 
266 template <device::cupm::DeviceType T>
267 inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
268 {
269   const auto nrows = m->rmap->n;
270   const auto ncols = m->cmap->n;
271   const auto copy  = m->offloadmask == PETSC_OFFLOAD_GPU;
272 
273   PetscFunctionBegin;
274   PetscCheckTypeName(m, MATSEQDENSECUPM());
275   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
276   if (copy) {
277     const auto   mimpl = MatIMPLCast(m);
278     cupmStream_t stream;
279 
280     // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
281     if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
282     PetscCall(GetHandlesFrom_(dctx, &stream));
283     PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
284     {
285       const auto lda  = mimpl->lda;
286       const auto dest = mimpl->v;
287       const auto src  = MatCUPMCast(m)->d_v;
288 
289       if (lda > nrows) {
290         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
291       } else {
292         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
293       }
294     }
295     PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
296     // order is important, MatSeqDenseSetPreallocation() might set offloadmask
297     m->offloadmask = PETSC_OFFLOAD_BOTH;
298   }
299   PetscFunctionReturn(PETSC_SUCCESS);
300 }
301 
302 template <device::cupm::DeviceType T>
303 inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
304 {
305   PetscFunctionBegin;
306   if (PetscDefined(USE_DEBUG)) {
307     cupmBlasInt_t info = 0;
308 
309     PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
310     if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
311     static_assert(std::is_same<decltype(info), int>::value, "");
312     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
313     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
314   }
315   PetscFunctionReturn(PETSC_SUCCESS);
316 }
317 
318 // ==========================================================================================
319 // MatDense_Seq_CUPM - Private API - Solver Dispatch
320 // ==========================================================================================
321 
322 // specific solvers called through the dispatch_() family of functions
323 template <device::cupm::DeviceType T>
324 template <typename Derived>
325 struct MatDense_Seq_CUPM<T>::SolveCommon {
326   using derived_type = Derived;
327 
328   template <typename F>
329   static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
330   {
331     cupmBlasInt_t lwork;
332 
333     PetscFunctionBegin;
334     PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
335     if (lwork > mcu->d_fact_lwork) {
336       mcu->d_fact_lwork = lwork;
337       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
338       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
339     }
340     PetscFunctionReturn(PETSC_SUCCESS);
341   }
342 
343   static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
344   {
345     const auto mcu = MatCUPMCast(A);
346 
347     PetscFunctionBegin;
348     PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
349     A->factortype             = derived_type::MATFACTORTYPE();
350     A->ops->solve             = MatSolve_Factored_Dispatch_<derived_type, false>;
351     A->ops->solvetranspose    = MatSolve_Factored_Dispatch_<derived_type, true>;
352     A->ops->matsolve          = MatMatSolve_Factored_Dispatch_<derived_type, false>;
353     A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;
354 
355     PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
356     if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
357     PetscFunctionReturn(PETSC_SUCCESS);
358   }
359 };
360 
361 template <device::cupm::DeviceType T>
362 struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
363   using base_type = SolveCommon<SolveLU>;
364 
365   static constexpr const char   *NAME() noexcept { return "LU"; }
366   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }
367 
368   static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
369   {
370     const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
371     const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
372     cupmStream_t       stream;
373     cupmSolverHandle_t handle;
374     PetscDeviceContext dctx;
375 
376     PetscFunctionBegin;
377     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
378     PetscCall(GetHandles_(&dctx, &handle, &stream));
379     PetscCall(base_type::FactorPrepare(A, stream));
380     {
381       const auto mcu = MatCUPMCast(A);
382       const auto da  = DeviceArrayReadWrite(dctx, A);
383       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
384 
385       // clang-format off
386       PetscCall(
387         base_type::ResizeFactLwork(
388           mcu, stream,
389           [&](cupmBlasInt_t *fact_lwork)
390           {
391             return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
392           }
393         )
394       );
395       // clang-format on
396       if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
397 
398       PetscCall(PetscLogGpuTimeBegin());
399       PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
400       PetscCall(PetscLogGpuTimeEnd());
401       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
402     }
403     PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
404     PetscFunctionReturn(PETSC_SUCCESS);
405   }
406 
407   template <bool transpose>
408   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
409   {
410     const auto         mcu       = MatCUPMCast(A);
411     const auto         fact_info = mcu->d_fact_info;
412     const auto         fact_ipiv = mcu->d_fact_ipiv;
413     cupmSolverHandle_t handle;
414 
415     PetscFunctionBegin;
416     PetscCall(GetHandlesFrom_(dctx, &handle));
417     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
418     PetscCall(PetscLogGpuTimeBegin());
419     {
420       constexpr auto op  = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
421       const auto     da  = DeviceArrayRead(dctx, A);
422       const auto     lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
423 
424       // clang-format off
425       PetscCall(
426         base_type::ResizeFactLwork(
427           mcu, stream,
428           [&](cupmBlasInt_t *lwork)
429           {
430             return cupmSolverXgetrs_bufferSize(
431               handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
432             );
433           }
434         )
435       );
436       // clang-format on
437       PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
438       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
439     }
440     PetscCall(PetscLogGpuTimeEnd());
441     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
442     PetscFunctionReturn(PETSC_SUCCESS);
443   }
444 };
445 
446 template <device::cupm::DeviceType T>
447 struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
448   using base_type = SolveCommon<SolveCholesky>;
449 
450   static constexpr const char   *NAME() noexcept { return "Cholesky"; }
451   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }
452 
453   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
454   {
455     const auto         n = static_cast<cupmBlasInt_t>(A->rmap->n);
456     PetscDeviceContext dctx;
457     cupmSolverHandle_t handle;
458     cupmStream_t       stream;
459 
460     PetscFunctionBegin;
461     if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
462     PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
463     PetscCall(GetHandles_(&dctx, &handle, &stream));
464     PetscCall(base_type::FactorPrepare(A, stream));
465     {
466       const auto mcu = MatCUPMCast(A);
467       const auto da  = DeviceArrayReadWrite(dctx, A);
468       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
469 
470       // clang-format off
471       PetscCall(
472         base_type::ResizeFactLwork(
473           mcu, stream,
474           [&](cupmBlasInt_t *fact_lwork)
475           {
476             return cupmSolverXpotrf_bufferSize(
477               handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
478             );
479           }
480         )
481       );
482       // clang-format on
483       PetscCall(PetscLogGpuTimeBegin());
484       PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
485       PetscCall(PetscLogGpuTimeEnd());
486       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
487     }
488     PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
489 
490 #if 0
491     // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
492     // and *hetr* routines. The code below should work, and it can be activated when *sytrs
493     // routines will be available
494     if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
495     if (!mcu->d_fact_lwork) {
496       PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
497       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
498     }
499     if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
500     PetscCall(PetscLogGpuTimeBegin());
501     PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
502     PetscCall(PetscLogGpuTimeEnd());
503 #endif
504     PetscFunctionReturn(PETSC_SUCCESS);
505   }
506 
507   template <bool transpose>
508   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
509   {
510     const auto         mcu       = MatCUPMCast(A);
511     const auto         fact_info = mcu->d_fact_info;
512     cupmSolverHandle_t handle;
513 
514     PetscFunctionBegin;
515     PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
516     PetscCall(GetHandlesFrom_(dctx, &handle));
517     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
518     PetscCall(PetscLogGpuTimeBegin());
519     {
520       const auto da  = DeviceArrayRead(dctx, A);
521       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
522 
523       // clang-format off
524       PetscCall(
525         base_type::ResizeFactLwork(
526           mcu, stream,
527           [&](cupmBlasInt_t *lwork)
528           {
529             return cupmSolverXpotrs_bufferSize(
530               handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
531             );
532           }
533         )
534       );
535       // clang-format on
536       PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
537       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
538     }
539     PetscCall(PetscLogGpuTimeEnd());
540     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
541     PetscFunctionReturn(PETSC_SUCCESS);
542   }
543 };
544 
545 template <device::cupm::DeviceType T>
546 struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
547   using base_type = SolveCommon<SolveQR>;
548 
549   static constexpr const char   *NAME() noexcept { return "QR"; }
550   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }
551 
552   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
553   {
554     const auto         m     = static_cast<cupmBlasInt_t>(A->rmap->n);
555     const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
556     const auto         min   = std::min(m, n);
557     const auto         mimpl = MatIMPLCast(A);
558     cupmStream_t       stream;
559     cupmSolverHandle_t handle;
560     PetscDeviceContext dctx;
561 
562     PetscFunctionBegin;
563     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
564     PetscCall(GetHandles_(&dctx, &handle, &stream));
565     PetscCall(base_type::FactorPrepare(A, stream));
566     mimpl->rank = min;
567     {
568       const auto mcu = MatCUPMCast(A);
569       const auto da  = DeviceArrayReadWrite(dctx, A);
570       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
571 
572       if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
573       if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
574       // clang-format off
575       PetscCall(
576         base_type::ResizeFactLwork(
577           mcu, stream,
578           [&](cupmBlasInt_t *fact_lwork)
579           {
580             return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
581           }
582         )
583       );
584       // clang-format on
585       PetscCall(PetscLogGpuTimeBegin());
586       PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
587       PetscCall(PetscLogGpuTimeEnd());
588       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
589     }
590     PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
591     PetscFunctionReturn(PETSC_SUCCESS);
592   }
593 
594   template <bool transpose>
595   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
596   {
597     const auto         mimpl      = MatIMPLCast(A);
598     const auto         rank       = static_cast<cupmBlasInt_t>(mimpl->rank);
599     const auto         mcu        = MatCUPMCast(A);
600     const auto         fact_info  = mcu->d_fact_info;
601     const auto         fact_tau   = mcu->d_fact_tau;
602     const auto         fact_work  = mcu->d_fact_work;
603     const auto         fact_lwork = mcu->d_fact_lwork;
604     cupmSolverHandle_t solver_handle;
605     cupmBlasHandle_t   blas_handle;
606 
607     PetscFunctionBegin;
608     PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
609     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
610     PetscCall(PetscLogGpuTimeBegin());
611     {
612       const auto da  = DeviceArrayRead(dctx, A);
613       const auto one = cupmScalarCast(1.0);
614       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
615 
616       if (transpose) {
617         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
618         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
619         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
620       } else {
621         constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;
622 
623         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
624         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
625         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
626       }
627     }
628     PetscCall(PetscLogGpuTimeEnd());
629     PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
630     PetscFunctionReturn(PETSC_SUCCESS);
631   }
632 };
633 
634 template <device::cupm::DeviceType T>
635 template <typename Solver, bool transpose>
636 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
637 {
638   using namespace vec::cupm;
639   const auto         pobj_A  = PetscObjectCast(A);
640   const auto         m       = static_cast<cupmBlasInt_t>(A->rmap->n);
641   const auto         k       = static_cast<cupmBlasInt_t>(A->cmap->n);
642   auto              &workvec = MatCUPMCast(A)->workvec;
643   PetscScalar       *y_array = nullptr;
644   PetscDeviceContext dctx;
645   PetscBool          xiscupm, yiscupm, aiscupm;
646   bool               use_y_array_directly;
647   cupmStream_t       stream;
648 
649   PetscFunctionBegin;
650   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
651   PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
652   PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
653   PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
654   PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
655   PetscCall(GetHandles_(&dctx, &stream));
656   use_y_array_directly = yiscupm && (k >= m);
657   {
658     const PetscScalar *x_array;
659     const auto         xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
660     const auto         copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
661 
662     if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
663     // The logic here is to try to minimize the amount of memory copying:
664     //
665     // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
666     // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
667     // data in order to copy it into the y array. So the array x will be wherever the data
668     // already is so that only one memcpy is performed
669     if (xisdevice) {
670       PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
671     } else {
672       PetscCall(VecGetArrayRead(x, &x_array));
673     }
674     PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
675     PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
676     if (xisdevice) {
677       PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
678     } else {
679       PetscCall(VecRestoreArrayRead(x, &x_array));
680     }
681   }
682 
683   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
684   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
685   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
686 
687   if (use_y_array_directly) {
688     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
689   } else {
690     const auto   copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
691     PetscScalar *yv;
692 
693     // The logic here is that the data is not yet in either y's GPU array or its CPU array.
694     // There is nothing in the interface to say where the user would like it to end up. So we
695     // choose the GPU, because it is the faster option
696     if (yiscupm) {
697       PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
698     } else {
699       PetscCall(VecGetArray(y, &yv));
700     }
701     PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
702     if (yiscupm) {
703       PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
704     } else {
705       PetscCall(VecRestoreArray(y, &yv));
706     }
707     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
708   }
709   PetscFunctionReturn(PETSC_SUCCESS);
710 }
711 
712 template <device::cupm::DeviceType T>
713 template <typename Solver, bool transpose>
714 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
715 {
716   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
717   const auto         k = static_cast<cupmBlasInt_t>(A->cmap->n);
718   cupmBlasInt_t      nrhs, ldb, ldx, ldy;
719   PetscScalar       *y;
720   PetscBool          biscupm, xiscupm, aiscupm;
721   PetscDeviceContext dctx;
722   cupmStream_t       stream;
723 
724   PetscFunctionBegin;
725   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
726   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
727   PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
728   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
729   PetscCall(GetHandles_(&dctx, &stream));
730   {
731     PetscInt n;
732 
733     PetscCall(MatGetSize(B, nullptr, &n));
734     PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
735     PetscCall(MatDenseGetLDA(B, &n));
736     PetscCall(PetscCUPMBlasIntCast(n, &ldb));
737     PetscCall(MatDenseGetLDA(X, &n));
738     PetscCall(PetscCUPMBlasIntCast(n, &ldx));
739   }
740   {
741     // The logic here is to try to minimize the amount of memory copying:
742     //
743     // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
744     // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
745     // get the data in order to copy it into the y array. So the array b will be wherever the
746     // data already is so that only one memcpy is performed
747     const auto         bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
748     const auto         copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
749     const PetscScalar *b;
750 
751     if (bisdevice) {
752       b = DeviceArrayRead(dctx, B);
753     } else if (biscupm) {
754       b = HostArrayRead(dctx, B);
755     } else {
756       PetscCall(MatDenseGetArrayRead(B, &b));
757     }
758 
759     if (ldx < m || !xiscupm) {
760       // X's array cannot serve as the array (too small or not on device), B's array cannot
761       // serve as the array (const), so allocate a new array
762       ldy = m;
763       PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
764     } else {
765       // X's array should serve as the array
766       ldy = ldx;
767       y   = DeviceArrayWrite(dctx, X);
768     }
769     PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
770     if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
771   }
772 
773   // convert to CUPM twice??????????????????????????????????
774   // but A should already be CUPM??????????????????????????????????????
775   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
776   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
777   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
778 
779   if (ldx < m || !xiscupm) {
780     const auto   copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
781     PetscScalar *x;
782 
783     // The logic here is that the data is not yet in either X's GPU array or its CPU
784     // array. There is nothing in the interface to say where the user would like it to end up.
785     // So we choose the GPU, because it is the faster option
786     if (xiscupm) {
787       x = DeviceArrayWrite(dctx, X);
788     } else {
789       PetscCall(MatDenseGetArray(X, &x));
790     }
791     PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
792     if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
793     PetscCallCUPM(cupmFreeAsync(y, stream));
794   }
795   PetscFunctionReturn(PETSC_SUCCESS);
796 }
797 
798 template <device::cupm::DeviceType T>
799 template <bool transpose>
800 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
801 {
802   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
803   const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
804   cupmBlasHandle_t   handle;
805   PetscDeviceContext dctx;
806 
807   PetscFunctionBegin;
808   if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add
809   if (!m || !n) {
810     // mult only
811     if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0));
812     PetscFunctionReturn(PETSC_SUCCESS);
813   }
814   PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
815   PetscCall(GetHandles_(&dctx, &handle));
816   {
817     constexpr auto op   = transpose ? CUPMBLAS_OP_T : CUPMBLAS_OP_N;
818     const auto     one  = cupmScalarCast(1.0);
819     const auto     zero = cupmScalarCast(0.0);
820     const auto     da   = DeviceArrayRead(dctx, A);
821     const auto     dxx  = VecSeq_CUPM::DeviceArrayRead(dctx, xx);
822     const auto     dzz  = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz);
823 
824     PetscCall(PetscLogGpuTimeBegin());
825     PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata(), static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda), dxx.cupmdata(), 1, (yy ? &one : &zero), dzz.cupmdata(), 1));
826     PetscCall(PetscLogGpuTimeEnd());
827   }
828   PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
829   PetscFunctionReturn(PETSC_SUCCESS);
830 }
831 
832 // ==========================================================================================
833 // MatDense_Seq_CUPM - Private API - Conversion Dispatch
834 // ==========================================================================================
835 
836 template <device::cupm::DeviceType T>
837 template <bool to_host>
838 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
839 {
840   PetscFunctionBegin;
841   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
842     // TODO these cases should be optimized
843     PetscCall(MatConvert_Basic(M, type, reuse, newmat));
844   } else {
845     const auto B    = *newmat;
846     const auto pobj = PetscObjectCast(B);
847 
848     if (to_host) {
849       PetscCall(BindToCPU(B, PETSC_TRUE));
850       PetscCall(Reset(B));
851     } else {
852       PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
853     }
854 
855     PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
856     PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
857     // cvec might be the wrong VecType, destroy and rebuild it if necessary
858     // REVIEW ME: this is possibly very inefficient
859     PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));
860 
861     MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
862     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
863     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
864     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
865     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
866     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
867     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
868     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
869     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
870     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
871     MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
872     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);
873 
874     if (to_host) {
875       B->offloadmask = PETSC_OFFLOAD_CPU;
876     } else {
877       Mat_SeqDenseCUPM *mcu;
878 
879       PetscCall(PetscNew(&mcu));
880       B->spptr       = mcu;
881       B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
882       PetscCall(BindToCPU(B, PETSC_FALSE));
883     }
884 
885     MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
886     MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
887   }
888   PetscFunctionReturn(PETSC_SUCCESS);
889 }
890 
891 // ==========================================================================================
892 // MatDense_Seq_CUPM - Public API
893 // ==========================================================================================
894 
895 template <device::cupm::DeviceType T>
896 inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
897 {
898   return MATSEQDENSECUPM();
899 }
900 
901 template <device::cupm::DeviceType T>
902 inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
903 {
904   return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
905 }
906 
907 template <device::cupm::DeviceType T>
908 inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
909 {
910   return static_cast<Mat_SeqDense *>(m->data);
911 }
912 
913 template <device::cupm::DeviceType T>
914 inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
915 {
916   return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
917 }
918 
919 template <device::cupm::DeviceType T>
920 inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
921 {
922   return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
923 }
924 
925 // ==========================================================================================
926 
927 // MatCreate_SeqDenseCUPM()
928 template <device::cupm::DeviceType T>
929 inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
930 {
931   PetscFunctionBegin;
932   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
933   PetscCall(MatCreate_SeqDense(A));
934   PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
935   PetscFunctionReturn(PETSC_SUCCESS);
936 }
937 
938 template <device::cupm::DeviceType T>
939 inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
940 {
941   PetscFunctionBegin;
942   // prevent copying back data if we own the data pointer
943   if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
944   PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
945   PetscCall(MatDestroy_SeqDense(A));
946   PetscFunctionReturn(PETSC_SUCCESS);
947 }
948 
949 // obj->ops->setup()
950 template <device::cupm::DeviceType T>
951 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
952 {
953   PetscFunctionBegin;
954   PetscCall(PetscLayoutSetUp(A->rmap));
955   PetscCall(PetscLayoutSetUp(A->cmap));
956   if (!A->preallocated) {
957     PetscDeviceContext dctx;
958 
959     PetscCall(GetHandles_(&dctx));
960     PetscCall(SetPreallocation(A, dctx, nullptr));
961   }
962   PetscFunctionReturn(PETSC_SUCCESS);
963 }
964 
965 template <device::cupm::DeviceType T>
966 inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
967 {
968   PetscFunctionBegin;
969   if (const auto mcu = MatCUPMCast(A)) {
970     cupmStream_t stream;
971 
972     PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
973     PetscCall(GetHandles_(&stream));
974     if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
975     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
976     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
977     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
978     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
979     PetscCall(VecDestroy(&mcu->workvec));
980     PetscCall(PetscFree(A->spptr /* mcu */));
981   }
982   PetscFunctionReturn(PETSC_SUCCESS);
983 }
984 
985 // ==========================================================================================
986 
987 template <device::cupm::DeviceType T>
988 inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
989 {
990   const auto mimpl = MatIMPLCast(A);
991   const auto pobj  = PetscObjectCast(A);
992 
993   PetscFunctionBegin;
994   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
995   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
996   A->boundtocpu = to_host;
997   PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
998   if (to_host) {
999     PetscDeviceContext dctx;
1000 
1001     // make sure we have an up-to-date copy on the CPU
1002     PetscCall(GetHandles_(&dctx));
1003     PetscCall(DeviceToHost_(A, dctx));
1004   } else {
1005     PetscBool iscupm;
1006 
1007     if (auto &cvec = mimpl->cvec) {
1008       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
1009       if (!iscupm) PetscCall(VecDestroy(&cvec));
1010     }
1011     if (auto &cmat = mimpl->cmat) {
1012       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
1013       if (!iscupm) PetscCall(MatDestroy(&cmat));
1014     }
1015   }
1016 
1017   // ============================================================
1018   // Composed ops
1019   // ============================================================
1020   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1021   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
1022   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1023   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1024   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1025   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1026   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1027   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1028   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1029   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1030   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1031   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
1032   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
1033   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1034   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1035   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
1036   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
1037   MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
1038   // always the same
1039   PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));
1040 
1041   // ============================================================
1042   // Function pointer ops
1043   // ============================================================
1044   MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
1045   MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false>(A, xx, nullptr, yy); });
1046   MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true>(A, xx, nullptr, yy); });
1047   MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false>);
1048   MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true>);
1049   MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
1050   MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
1051   MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
1052   MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
1053   MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
1054   MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
1055   MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
1056   MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
1057   MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
1058   MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
1059   MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
1060   MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
1061   MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
1062   MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal);
1063   // seemingly always the same
1064   A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1065 
1066   if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
1067   PetscFunctionReturn(PETSC_SUCCESS);
1068 }
1069 
1070 template <device::cupm::DeviceType T>
1071 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1072 {
1073   PetscFunctionBegin;
1074   PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
1075   PetscFunctionReturn(PETSC_SUCCESS);
1076 }
1077 
1078 template <device::cupm::DeviceType T>
1079 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1080 {
1081   PetscFunctionBegin;
1082   PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
1083   PetscFunctionReturn(PETSC_SUCCESS);
1084 }
1085 
1086 // ==========================================================================================
1087 
1088 template <device::cupm::DeviceType T>
1089 template <PetscMemType mtype, PetscMemoryAccessMode access>
1090 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1091 {
1092   constexpr auto hostmem     = PetscMemTypeHost(mtype);
1093   constexpr auto read_access = PetscMemoryAccessRead(access);
1094 
1095   PetscFunctionBegin;
1096   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1097   PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
1098   if (hostmem) {
1099     if (read_access) {
1100       PetscCall(DeviceToHost_(m, dctx));
1101     } else if (!MatIMPLCast(m)->v) {
1102       // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
1103       PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
1104     }
1105     *array = MatIMPLCast(m)->v;
1106   } else {
1107     if (read_access) {
1108       PetscCall(HostToDevice_(m, dctx));
1109     } else if (!MatCUPMCast(m)->d_v) {
1110       // write-only
1111       PetscCall(SetPreallocation(m, dctx, nullptr));
1112     }
1113     *array = MatCUPMCast(m)->d_v;
1114   }
1115   if (PetscMemoryAccessWrite(access)) {
1116     m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1117     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1118   }
1119   PetscFunctionReturn(PETSC_SUCCESS);
1120 }
1121 
1122 template <device::cupm::DeviceType T>
1123 template <PetscMemType mtype, PetscMemoryAccessMode access>
1124 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
1125 {
1126   PetscFunctionBegin;
1127   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1128   if (PetscMemoryAccessWrite(access)) {
1129     // WRITE or READ_WRITE
1130     m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1131     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1132   }
1133   if (array) {
1134     PetscCall(CheckPointerMatchesMemType_(*array, mtype));
1135     *array = nullptr;
1136   }
1137   PetscFunctionReturn(PETSC_SUCCESS);
1138 }
1139 
1140 template <device::cupm::DeviceType T>
1141 template <PetscMemoryAccessMode access>
1142 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
1143 {
1144   PetscFunctionBegin;
1145   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1146   if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
1147   PetscFunctionReturn(PETSC_SUCCESS);
1148 }
1149 
1150 template <device::cupm::DeviceType T>
1151 template <PetscMemoryAccessMode access>
1152 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1153 {
1154   PetscFunctionBegin;
1155   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1156   PetscFunctionReturn(PETSC_SUCCESS);
1157 }
1158 
1159 // ==========================================================================================
1160 
1161 template <device::cupm::DeviceType T>
1162 inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
1163 {
1164   const auto mimpl = MatIMPLCast(A);
1165   const auto mcu   = MatCUPMCast(A);
1166 
1167   PetscFunctionBegin;
1168   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1169   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1170   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1171   if (mimpl->v) {
1172     PetscDeviceContext dctx;
1173 
1174     PetscCall(GetHandles_(&dctx));
1175     PetscCall(HostToDevice_(A, dctx));
1176   }
1177   mcu->unplacedarray         = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
1178   mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
1179   PetscFunctionReturn(PETSC_SUCCESS);
1180 }
1181 
1182 template <device::cupm::DeviceType T>
1183 inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
1184 {
1185   const auto mimpl = MatIMPLCast(A);
1186   const auto mcu   = MatCUPMCast(A);
1187 
1188   PetscFunctionBegin;
1189   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1190   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1191   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1192   if (!mcu->d_user_alloc) {
1193     cupmStream_t stream;
1194 
1195     PetscCall(GetHandles_(&stream));
1196     PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1197   }
1198   mcu->d_v          = const_cast<PetscScalar *>(array);
1199   mcu->d_user_alloc = PETSC_FALSE;
1200   PetscFunctionReturn(PETSC_SUCCESS);
1201 }
1202 
1203 template <device::cupm::DeviceType T>
1204 inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
1205 {
1206   const auto mimpl = MatIMPLCast(A);
1207   const auto mcu   = MatCUPMCast(A);
1208 
1209   PetscFunctionBegin;
1210   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1211   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1212   if (mimpl->v) {
1213     PetscDeviceContext dctx;
1214 
1215     PetscCall(GetHandles_(&dctx));
1216     PetscCall(HostToDevice_(A, dctx));
1217   }
1218   mcu->d_v          = util::exchange(mcu->unplacedarray, nullptr);
1219   mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
1220   PetscFunctionReturn(PETSC_SUCCESS);
1221 }
1222 
1223 // ==========================================================================================
1224 
1225 template <device::cupm::DeviceType T>
1226 template <bool transpose_A, bool transpose_B>
1227 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
1228 {
1229   cupmBlasInt_t      m, n, k;
1230   PetscBool          Aiscupm, Biscupm;
1231   PetscDeviceContext dctx;
1232   cupmBlasHandle_t   handle;
1233 
1234   PetscFunctionBegin;
1235   PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
1236   PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
1237   PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
1238   if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);
1239 
1240   // we may end up with SEQDENSE as one of the arguments
1241   // REVIEW ME: how? and why is it not B and C????????
1242   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
1243   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
1244   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
1245   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
1246   PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
1247   PetscCall(GetHandles_(&dctx, &handle));
1248 
1249   PetscCall(PetscLogGpuTimeBegin());
1250   {
1251     const auto one  = cupmScalarCast(1.0);
1252     const auto zero = cupmScalarCast(0.0);
1253     const auto da   = DeviceArrayRead(dctx, A);
1254     const auto db   = DeviceArrayRead(dctx, B);
1255     const auto dc   = DeviceArrayWrite(dctx, C);
1256     PetscInt   alda, blda, clda;
1257 
1258     PetscCall(MatDenseGetLDA(A, &alda));
1259     PetscCall(MatDenseGetLDA(B, &blda));
1260     PetscCall(MatDenseGetLDA(C, &clda));
1261     PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
1262   }
1263   PetscCall(PetscLogGpuTimeEnd());
1264 
1265   PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
1266   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1267   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1268   PetscFunctionReturn(PETSC_SUCCESS);
1269 }
1270 
1271 template <device::cupm::DeviceType T>
1272 inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
1273 {
1274   const auto m = A->rmap->n;
1275   const auto n = A->cmap->n;
1276 
1277   PetscFunctionBegin;
1278   PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
1279   // The two matrices must have the same copy implementation to be eligible for fast copy
1280   if (A->ops->copy == B->ops->copy) {
1281     PetscDeviceContext dctx;
1282     cupmStream_t       stream;
1283 
1284     PetscCall(GetHandles_(&dctx, &stream));
1285     PetscCall(PetscLogGpuTimeBegin());
1286     {
1287       const auto va = DeviceArrayRead(dctx, A);
1288       const auto vb = DeviceArrayWrite(dctx, B);
1289       // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
1290       // lda!
1291       const auto lda_a = MatIMPLCast(A)->lda;
1292       const auto lda_b = MatIMPLCast(B)->lda;
1293 
1294       if (lda_a > m || lda_b > m) {
1295         PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
1296         PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
1297         PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
1298       } else {
1299         PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
1300       }
1301     }
1302     PetscCall(PetscLogGpuTimeEnd());
1303   } else {
1304     PetscCall(MatCopy_Basic(A, B, str));
1305   }
1306   PetscFunctionReturn(PETSC_SUCCESS);
1307 }
1308 
1309 template <device::cupm::DeviceType T>
1310 inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1311 {
1312   PetscDeviceContext dctx;
1313   cupmStream_t       stream;
1314 
1315   PetscFunctionBegin;
1316   PetscCall(GetHandles_(&dctx, &stream));
1317   PetscCall(PetscLogGpuTimeBegin());
1318   {
1319     const auto va  = DeviceArrayWrite(dctx, m);
1320     const auto lda = MatIMPLCast(m)->lda;
1321     const auto ma  = m->rmap->n;
1322     const auto na  = m->cmap->n;
1323 
1324     if (lda > ma) {
1325       PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1326     } else {
1327       PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1328     }
1329   }
1330   PetscCall(PetscLogGpuTimeEnd());
1331   PetscFunctionReturn(PETSC_SUCCESS);
1332 }
1333 
1334 namespace detail
1335 {
1336 
1337 // ==========================================================================================
1338 // SubMatIndexFunctor
1339 //
1340 // Iterator which permutes a linear index range into matrix indices for am nrows x ncols
1341 // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1342 // the i'th sequential entry in the matrix.
1343 // ==========================================================================================
1344 template <typename T>
1345 struct SubMatIndexFunctor {
1346   PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }
1347 
1348   PetscInt nrows;
1349   PetscInt ncols;
1350   PetscInt lda;
1351 };
1352 
1353 template <typename Iterator>
1354 struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>> {
1355   using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>>;
1356 
1357   using iterator = typename base_type::iterator;
1358 
1359   constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
1360     base_type{
1361       std::move(first), std::move(last), {nrows, ncols, lda}
1362   }
1363   {
1364   }
1365 
1366   PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
1367 };
1368 
1369 namespace
1370 {
1371 
1372 template <typename T>
1373 PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
1374 {
1375   const auto nrows = rend - rstart;
1376   const auto ncols = cend - cstart;
1377   const auto dptr  = thrust::device_pointer_cast(ptr);
1378 
1379   return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
1380 }
1381 
1382 } // namespace
1383 
1384 } // namespace detail
1385 
1386 template <device::cupm::DeviceType T>
1387 inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
1388 {
1389   const auto         m = A->rmap->n;
1390   const auto         n = A->cmap->n;
1391   const auto         N = m * n;
1392   PetscDeviceContext dctx;
1393 
1394   PetscFunctionBegin;
1395   PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1396   PetscCall(GetHandles_(&dctx));
1397   {
1398     const auto da  = DeviceArrayReadWrite(dctx, A);
1399     const auto lda = MatIMPLCast(A)->lda;
1400 
1401     if (lda > m) {
1402       cupmStream_t stream;
1403 
1404       PetscCall(GetHandlesFrom_(dctx, &stream));
1405       // clang-format off
1406       PetscCallThrust(
1407         const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());
1408 
1409         THRUST_CALL(
1410           thrust::transform,
1411           stream,
1412           sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1413           device::cupm::functors::make_times_equals(alpha)
1414         )
1415       );
1416       // clang-format on
1417     } else {
1418       const auto       cu_alpha = cupmScalarCast(alpha);
1419       cupmBlasHandle_t handle;
1420 
1421       PetscCall(GetHandlesFrom_(dctx, &handle));
1422       PetscCall(PetscLogGpuTimeBegin());
1423       PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
1424       PetscCall(PetscLogGpuTimeEnd());
1425     }
1426   }
1427   PetscCall(PetscLogGpuFlops(N));
1428   PetscFunctionReturn(PETSC_SUCCESS);
1429 }
1430 
1431 template <device::cupm::DeviceType T>
1432 inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
1433 {
1434   const auto         m_x = X->rmap->n, m_y = Y->rmap->n;
1435   const auto         n_x = X->cmap->n, n_y = Y->cmap->n;
1436   const auto         N = m_x * n_x;
1437   PetscDeviceContext dctx;
1438 
1439   PetscFunctionBegin;
1440   if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
1441   PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
1442   PetscCall(GetHandles_(&dctx));
1443   {
1444     const auto dx    = DeviceArrayRead(dctx, X);
1445     const auto dy    = DeviceArrayReadWrite(dctx, Y);
1446     const auto lda_x = MatIMPLCast(X)->lda;
1447     const auto lda_y = MatIMPLCast(Y)->lda;
1448 
1449     if (lda_x > m_x || lda_y > m_x) {
1450       cupmStream_t stream;
1451 
1452       PetscCall(GetHandlesFrom_(dctx, &stream));
1453       // clang-format off
1454       PetscCallThrust(
1455         const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
1456         const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());
1457 
1458         THRUST_CALL(
1459           thrust::transform,
1460           stream,
1461           sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
1462           device::cupm::functors::make_axpy(alpha)
1463         );
1464       );
1465       // clang-format on
1466     } else {
1467       const auto       cu_alpha = cupmScalarCast(alpha);
1468       cupmBlasHandle_t handle;
1469 
1470       PetscCall(GetHandlesFrom_(dctx, &handle));
1471       PetscCall(PetscLogGpuTimeBegin());
1472       PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
1473       PetscCall(PetscLogGpuTimeEnd());
1474     }
1475   }
1476   PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
1477   PetscFunctionReturn(PETSC_SUCCESS);
1478 }
1479 
1480 template <device::cupm::DeviceType T>
1481 inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
1482 {
1483   const auto         hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
1484   PetscDeviceContext dctx;
1485 
1486   PetscFunctionBegin;
1487   PetscCall(GetHandles_(&dctx));
1488   // do not call SetPreallocation() yet, we call it afterwards??
1489   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
1490   PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
1491   if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
1492   // allocate memory if needed
1493   if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
1494   PetscFunctionReturn(PETSC_SUCCESS);
1495 }
1496 
1497 template <device::cupm::DeviceType T>
1498 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
1499 {
1500   PetscBool device;
1501 
1502   PetscFunctionBegin;
1503   PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
1504   if (device) {
1505     const auto         m = A->rmap->n;
1506     const auto         n = A->cmap->n;
1507     PetscDeviceContext dctx;
1508 
1509     PetscCall(GetHandles_(&dctx));
1510     {
1511       const auto a = DeviceArrayWrite(dctx, A);
1512       PetscInt   lda;
1513 
1514       PetscCall(MatDenseGetLDA(A, &lda));
1515       if (lda > m) {
1516         for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
1517       } else {
1518         PetscInt mn;
1519 
1520         PetscCall(PetscIntMultError(m, n, &mn));
1521         PetscCall(PetscRandomGetValues(rng, mn, a));
1522       }
1523     }
1524   } else {
1525     PetscCall(MatSetRandom_SeqDense(A, rng));
1526   }
1527   PetscFunctionReturn(PETSC_SUCCESS);
1528 }
1529 
1530 // ==========================================================================================
1531 
1532 template <device::cupm::DeviceType T>
1533 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
1534 {
1535   const auto         offloadmask = A->offloadmask;
1536   const auto         n           = A->rmap->n;
1537   const auto         col_offset  = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
1538   PetscBool          viscupm;
1539   PetscDeviceContext dctx;
1540   cupmStream_t       stream;
1541 
1542   PetscFunctionBegin;
1543   PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
1544   PetscCall(GetHandles_(&dctx, &stream));
1545   if (viscupm && !v->boundtocpu) {
1546     const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);
1547 
1548     // update device data
1549     if (PetscOffloadDevice(offloadmask)) {
1550       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
1551     } else {
1552       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
1553     }
1554   } else {
1555     PetscScalar *x;
1556 
1557     // update host data
1558     PetscCall(VecGetArrayWrite(v, &x));
1559     if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
1560       PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
1561     } else if (PetscOffloadDevice(offloadmask)) {
1562       PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
1563     }
1564     PetscCall(VecRestoreArrayWrite(v, &x));
1565   }
1566   PetscFunctionReturn(PETSC_SUCCESS);
1567 }
1568 
1569 template <device::cupm::DeviceType T>
1570 template <PetscMemoryAccessMode access>
1571 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
1572 {
1573   using namespace vec::cupm;
1574   const auto         mimpl = MatIMPLCast(A);
1575   PetscDeviceContext dctx;
1576 
1577   PetscFunctionBegin;
1578   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1579   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1580   mimpl->vecinuse = col + 1;
1581   if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
1582   PetscCall(GetHandles_(&dctx));
1583   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1584   PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
1585   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
1586   *v = mimpl->cvec;
1587   PetscFunctionReturn(PETSC_SUCCESS);
1588 }
1589 
1590 template <device::cupm::DeviceType T>
1591 template <PetscMemoryAccessMode access>
1592 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
1593 {
1594   using namespace vec::cupm;
1595   const auto         mimpl = MatIMPLCast(A);
1596   const auto         cvec  = mimpl->cvec;
1597   PetscDeviceContext dctx;
1598 
1599   PetscFunctionBegin;
1600   PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1601   PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1602   mimpl->vecinuse = 0;
1603   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
1604   PetscCall(VecCUPMResetArrayAsync<T>(cvec));
1605   PetscCall(GetHandles_(&dctx));
1606   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1607   if (v) *v = nullptr;
1608   PetscFunctionReturn(PETSC_SUCCESS);
1609 }
1610 
1611 // ==========================================================================================
1612 
1613 template <device::cupm::DeviceType T>
1614 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
1615 {
1616   Mat                fact = nullptr;
1617   PetscDeviceContext dctx;
1618 
1619   PetscFunctionBegin;
1620   PetscCall(GetHandles_(&dctx));
1621   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
1622   fact->factortype = ftype;
1623   switch (ftype) {
1624   case MAT_FACTOR_LU:
1625   case MAT_FACTOR_ILU: // fall-through
1626     fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
1627     fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1628     break;
1629   case MAT_FACTOR_CHOLESKY:
1630   case MAT_FACTOR_ICC: // fall-through
1631     fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1632     break;
1633   case MAT_FACTOR_QR: {
1634     const auto pobj = PetscObjectCast(fact);
1635 
1636     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
1637     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1638   } break;
1639   case MAT_FACTOR_NONE:
1640   case MAT_FACTOR_ILUDT:     // fall-through
1641   case MAT_FACTOR_NUM_TYPES: // fall-through
1642     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
1643   }
1644   PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
1645   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
1646   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
1647   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
1648   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
1649   *fact_out = fact;
1650   PetscFunctionReturn(PETSC_SUCCESS);
1651 }
1652 
1653 template <device::cupm::DeviceType T>
1654 inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
1655 {
1656   const auto         mimpl = MatIMPLCast(A);
1657   const auto         mcu   = MatCUPMCast(A);
1658   const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
1659   cupmSolverHandle_t handle;
1660   PetscDeviceContext dctx;
1661   cupmStream_t       stream;
1662 
1663   PetscFunctionBegin;
1664 #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
1665   // HIP appears to have this by default??
1666   PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
1667 #endif
1668   if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1669   PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
1670   // spd
1671   PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());
1672 
1673   PetscCall(GetHandles_(&dctx, &handle, &stream));
1674   {
1675     const auto    da  = DeviceArrayReadWrite(dctx, A);
1676     const auto    lda = static_cast<cupmBlasInt_t>(mimpl->lda);
1677     cupmBlasInt_t il;
1678 
1679     PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
1680     if (il > mcu->d_fact_lwork) {
1681       mcu->d_fact_lwork = il;
1682       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1683       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
1684     }
1685     PetscCall(PetscLogGpuTimeBegin());
1686     PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
1687     PetscCall(PetscLogGpuTimeEnd());
1688   }
1689   PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
1690   // TODO (write cuda kernel)
1691   PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
1692   PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
1693 
1694   A->ops->solve          = nullptr;
1695   A->ops->solvetranspose = nullptr;
1696   A->ops->matsolve       = nullptr;
1697   A->factortype          = MAT_FACTOR_NONE;
1698 
1699   PetscCall(PetscFree(A->solvertype));
1700   PetscFunctionReturn(PETSC_SUCCESS);
1701 }
1702 
1703 // ==========================================================================================
1704 
1705 template <device::cupm::DeviceType T>
1706 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
1707 {
1708   const auto         mimpl        = MatIMPLCast(A);
1709   const auto         array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
1710   const auto         n            = rend - rbegin;
1711   const auto         m            = cend - cbegin;
1712   auto              &cmat         = mimpl->cmat;
1713   PetscDeviceContext dctx;
1714 
1715   PetscFunctionBegin;
1716   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1717   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1718   mimpl->matinuse = cbegin + 1;
1719 
1720   PetscCall(GetHandles_(&dctx));
1721   PetscCall(HostToDevice_(A, dctx));
1722 
1723   if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
1724   {
1725     const auto device_array = array_offset(MatCUPMCast(A)->d_v);
1726 
1727     if (cmat) {
1728       PetscCall(PlaceArray(cmat, device_array));
1729     } else {
1730       PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
1731     }
1732   }
1733   PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
1734   // place CPU array if present but do not copy any data
1735   if (const auto host_array = mimpl->v) {
1736     cmat->offloadmask = PETSC_OFFLOAD_GPU;
1737     PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
1738   }
1739 
1740   cmat->offloadmask = A->offloadmask;
1741   *mat              = cmat;
1742   PetscFunctionReturn(PETSC_SUCCESS);
1743 }
1744 
1745 template <device::cupm::DeviceType T>
1746 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
1747 {
1748   const auto mimpl = MatIMPLCast(A);
1749   const auto cmat  = mimpl->cmat;
1750   const auto reset = static_cast<bool>(mimpl->v);
1751   bool       copy, was_offload_host;
1752 
1753   PetscFunctionBegin;
1754   PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1755   PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1756   PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1757   mimpl->matinuse = 0;
1758 
1759   // calls to ResetArray may change it, so save it here
1760   was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
1761   if (was_offload_host && !reset) {
1762     copy = true;
1763     PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
1764   } else {
1765     copy = false;
1766   }
1767 
1768   PetscCall(ResetArray(cmat));
1769   if (reset) PetscCall(MatDenseResetArray(cmat));
1770   if (copy) {
1771     PetscDeviceContext dctx;
1772 
1773     PetscCall(GetHandles_(&dctx));
1774     PetscCall(DeviceToHost_(A, dctx));
1775   } else {
1776     A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1777   }
1778 
1779   cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1780   *m                = nullptr;
1781   PetscFunctionReturn(PETSC_SUCCESS);
1782 }
1783 
1784 // ==========================================================================================
1785 
1786 namespace
1787 {
1788 
1789 template <device::cupm::DeviceType T>
1790 inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1791 {
1792   PetscFunctionBegin;
1793   if (TA) {
1794     if (TB) {
1795       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1796     } else {
1797       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1798     }
1799   } else {
1800     if (TB) {
1801       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1802     } else {
1803       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1804     }
1805   }
1806   PetscFunctionReturn(PETSC_SUCCESS);
1807 }
1808 
1809 template <device::cupm::DeviceType T>
1810 inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
1811 {
1812   PetscFunctionBegin;
1813   for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
1814     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
1815     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
1816   }
1817   PetscFunctionReturn(PETSC_SUCCESS);
1818 }
1819 
1820 } // anonymous namespace
1821 
1822 } // namespace impl
1823 
1824 } // namespace cupm
1825 
1826 } // namespace mat
1827 
1828 } // namespace Petsc
1829