xref: /petsc/src/mat/impls/dense/seq/cupm/matseqdensecupm.hpp (revision fbf9dbe564678ed6eff1806adbc4c4f01b9743f4)
1 #ifndef PETSCMATSEQDENSECUPM_HPP
2 #define PETSCMATSEQDENSECUPM_HPP
3 
4 #include <petsc/private/matdensecupmimpl.h> /*I <petscmat.h> I*/
5 #include <../src/mat/impls/dense/seq/dense.h>
6 
7 #include <petsc/private/deviceimpl.h> // PetscDeviceContextGetOptionalNullContext_Internal()
8 #include <petsc/private/randomimpl.h> // _p_PetscRandom
9 #include <petsc/private/vecimpl.h>    // _p_Vec
10 #include <petsc/private/cupmobject.hpp>
11 #include <petsc/private/cupmsolverinterface.hpp>
12 
13 #include <petsc/private/cpp/type_traits.hpp> // PetscObjectCast()
14 #include <petsc/private/cpp/utility.hpp>     // util::exchange()
15 
16 #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp> // for VecSeq_CUPM
17 
18 namespace Petsc
19 {
20 
21 namespace mat
22 {
23 
24 namespace cupm
25 {
26 
27 namespace impl
28 {
29 
// Sequential (single-rank) dense matrix implementation backed by CUPM (CUDA or HIP) device
// memory. Shares generic machinery with the MPI variant through the CRTP base
// MatDense_CUPM<T, Derived>; the device type T selects the backend at compile time.
template <device::cupm::DeviceType T>
class MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
public:
  MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);

private:
  // Per-matrix device-side state, stored in the Mat's spptr (accessed via MatCUPMCast())
  struct Mat_SeqDenseCUPM {
    PetscScalar *d_v;           // pointer to the matrix on the GPU
    PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
    bool         d_user_alloc;          // true when d_v is user-provided and must not be freed here
    bool         d_unplaced_user_alloc; // ownership flag for unplacedarray, saved/restored by Place/ResetArray
    // factorization support
    cupmBlasInt_t *d_fact_ipiv;  // device pivots
    cupmScalar_t  *d_fact_tau;   // device QR tau vector
    cupmBlasInt_t *d_fact_info;  // device info
    cupmScalar_t  *d_fact_work;  // device workspace
    cupmBlasInt_t  d_fact_lwork; // size of device workspace
    // workspace
    Vec workvec; // scratch vector reused by the solve paths
  };

  // (Re)allocate or adopt the device array for a matrix
  static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;

  // offload-mask-aware copies between the host and device arrays
  static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
  static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;

  // debug-build validation of the device-resident cupmSolver info flag
  static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;

  // factorization backends (LU/Cholesky/QR) sharing common setup via SolveCommon (CRTP)
  template <typename Derived>
  struct SolveCommon;
  struct SolveQR;
  struct SolveCholesky;
  struct SolveLU;

  // trampolines installed in A->ops by the factorizations
  template <typename Solver, bool transpose>
  static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
  template <typename Solver, bool transpose>
  static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
  template <bool transpose>
  static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;

  // shared implementation of the host<->device MatConvert() paths
  template <bool to_host>
  static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;

  PETSC_NODISCARD static constexpr MatType       MATIMPLCUPM_() noexcept;
  PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;

public:
  PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;

  // define these by hand since they don't fit the above mold
  PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
  PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;

  static PetscErrorCode Create(Mat) noexcept;
  static PetscErrorCode Destroy(Mat) noexcept;
  static PetscErrorCode SetUp(Mat) noexcept;
  static PetscErrorCode Reset(Mat) noexcept;

  static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
  static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
  static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;

  // array access parameterized on memory space and access mode
  template <PetscMemType, PetscMemoryAccessMode>
  static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
  template <PetscMemType, PetscMemoryAccessMode>
  static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;

private:
  // The *C_ thunks below match the C function-pointer signatures used in PETSc's function
  // tables: each one acquires a (default) device context and forwards to the real overload.
  template <PetscMemType mtype, PetscMemoryAccessMode mode>
  static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(GetArray<mtype, mode>(m, p, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  template <PetscMemType mtype, PetscMemoryAccessMode mode>
  static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  template <PetscMemoryAccessMode mode>
  static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  template <PetscMemoryAccessMode mode>
  static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
  {
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    PetscCall(GetHandles_(&dctx));
    PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

public:
  static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
  static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
  static PetscErrorCode ResetArray(Mat) noexcept;

  template <bool transpose_A, bool transpose_B>
  static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
  static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
  static PetscErrorCode ZeroEntries(Mat) noexcept;
  static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
  static PetscErrorCode Shift(Mat, PetscScalar) noexcept;
  static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
  static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
  static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;

  static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
  template <PetscMemoryAccessMode>
  static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;

  static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
  static PetscErrorCode InvertFactors(Mat) noexcept;

  static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
  static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
};
174 
175 } // namespace impl
176 
177 namespace
178 {
179 
// Declare this here so that the functions below can make use of it
//
// Convenience wrapper creating a sequential dense CUPM matrix of size m x n, with both the
// local and global dimensions set to (m, n). `data`, if non-null, is adopted as the backing
// array rather than allocating one (presumably a device pointer — the ownership/memory-space
// contract lives in CreateIMPLDenseCUPM). A null dctx means "use the default device context";
// preallocation can be deferred by passing preallocate = false.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
{
  PetscFunctionBegin;
  PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
  PetscFunctionReturn(PETSC_SUCCESS);
}
188 
189 } // anonymous namespace
190 
191 namespace impl
192 {
193 
194 // ==========================================================================================
195 // MatDense_Seq_CUPM - Private API - Utility
196 // ==========================================================================================
197 
198 template <device::cupm::DeviceType T>
199 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
200 {
201   const auto   mcu   = MatCUPMCast(m);
202   const auto   nrows = m->rmap->n;
203   const auto   ncols = m->cmap->n;
204   auto        &lda   = MatIMPLCast(m)->lda;
205   cupmStream_t stream;
206 
207   PetscFunctionBegin;
208   PetscCheckTypeName(m, MATSEQDENSECUPM());
209   PetscValidDeviceContext(dctx, 2);
210   PetscCall(checkCupmBlasIntCast(nrows));
211   PetscCall(checkCupmBlasIntCast(ncols));
212   PetscCall(GetHandlesFrom_(dctx, &stream));
213   if (lda <= 0) lda = nrows;
214   if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
215   if (user_device_array) {
216     mcu->d_user_alloc = PETSC_TRUE;
217     mcu->d_v          = user_device_array;
218   } else {
219     PetscInt size;
220 
221     mcu->d_user_alloc = PETSC_FALSE;
222     PetscCall(PetscIntMultError(lda, ncols, &size));
223     PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
224     PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
225   }
226   m->offloadmask = PETSC_OFFLOAD_GPU;
227   PetscFunctionReturn(PETSC_SUCCESS);
228 }
229 
// Ensure the device copy of m is up to date, copying from the host if the host holds the
// only valid data. No-op when the matrix is bound to the CPU or the device copy is current.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
{
  const auto nrows = m->rmap->n;
  const auto ncols = m->cmap->n;
  // copy only when the freshest (or only) data is on the host
  const auto copy  = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;

  PetscFunctionBegin;
  PetscCheckTypeName(m, MATSEQDENSECUPM());
  if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
  if (copy) {
    const auto   mcu = MatCUPMCast(m);
    cupmStream_t stream;

    // Allocate GPU memory if not present
    if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr));
    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
    {
      const auto mimpl = MatIMPLCast(m);
      const auto lda   = mimpl->lda;
      const auto src   = mimpl->v;
      const auto dest  = mcu->d_v;

      // a strided 2D copy is only needed when columns are padded (lda > nrows);
      // otherwise the whole lda*ncols block is contiguous and one memcpy suffices
      if (lda > nrows) {
        PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
      } else {
        PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
      }
    }
    PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
    // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
    m->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
267 
// Ensure the host copy of m is up to date, copying from the device if the device holds the
// only valid data (offloadmask == PETSC_OFFLOAD_GPU). Mirror image of HostToDevice_().
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
{
  const auto nrows = m->rmap->n;
  const auto ncols = m->cmap->n;
  const auto copy  = m->offloadmask == PETSC_OFFLOAD_GPU;

  PetscFunctionBegin;
  PetscCheckTypeName(m, MATSEQDENSECUPM());
  PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
  if (copy) {
    const auto   mimpl = MatIMPLCast(m);
    cupmStream_t stream;

    // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
    if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
    {
      const auto lda  = mimpl->lda;
      const auto dest = mimpl->v;
      const auto src  = MatCUPMCast(m)->d_v;

      // strided 2D copy only when columns are padded (lda > nrows), else one flat memcpy
      if (lda > nrows) {
        PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
      } else {
        PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
      }
    }
    PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
    // order is important, MatSeqDenseSetPreallocation() might set offloadmask
    m->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
303 
304 template <device::cupm::DeviceType T>
305 inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
306 {
307   PetscFunctionBegin;
308   if (PetscDefined(USE_DEBUG)) {
309     cupmBlasInt_t info = 0;
310 
311     PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
312     if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
313     static_assert(std::is_same<decltype(info), int>::value, "");
314     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
315     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
316   }
317   PetscFunctionReturn(PETSC_SUCCESS);
318 }
319 
320 // ==========================================================================================
321 // MatDense_Seq_CUPM - Private API - Solver Dispatch
322 // ==========================================================================================
323 
// specific solvers called through the dispatch_() family of functions
//
// CRTP base shared by SolveLU/SolveCholesky/SolveQR. Derived must provide NAME() and
// MATFACTORTYPE(); this base supplies workspace (re)sizing and common factor setup.
template <device::cupm::DeviceType T>
template <typename Derived>
struct MatDense_Seq_CUPM<T>::SolveCommon {
  using derived_type = Derived;

  // Grow the device factorization workspace if the queried requirement exceeds the cached
  // size. cupmSolverComputeFactLwork is a callable invoked as f(&lwork) wrapping the
  // backend's *_bufferSize query. The workspace only ever grows, so it can be reused
  // across repeated factorizations/solves.
  template <typename F>
  static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
  {
    cupmBlasInt_t lwork;

    PetscFunctionBegin;
    PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
    if (lwork > mcu->d_fact_lwork) {
      mcu->d_fact_lwork = lwork;
      PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
      PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Common pre-factorization setup: record the factor type, install the solve function
  // pointers for this factorization kind, set the solver type name, and make sure the
  // device info flag is allocated.
  static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
  {
    const auto mcu = MatCUPMCast(A);

    PetscFunctionBegin;
    PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
    A->factortype             = derived_type::MATFACTORTYPE();
    A->ops->solve             = MatSolve_Factored_Dispatch_<derived_type, false>;
    A->ops->solvetranspose    = MatSolve_Factored_Dispatch_<derived_type, true>;
    A->ops->matsolve          = MatMatSolve_Factored_Dispatch_<derived_type, false>;
    A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;

    PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
    if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};
362 
// LU factorization backend: in-place getrf on the device array, solves via getrs.
template <device::cupm::DeviceType T>
struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
  using base_type = SolveCommon<SolveLU>;

  static constexpr const char   *NAME() noexcept { return "LU"; }
  static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }

  // In-place LU factorization of A's device array (the IS/MatFactorInfo arguments are
  // ignored — pivoting is handled by the backend and stored in d_fact_ipiv).
  static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
  {
    const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
    const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
    cupmStream_t       stream;
    cupmSolverHandle_t handle;
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    // empty matrix: nothing to factor
    if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
    PetscCall(GetHandles_(&dctx, &handle, &stream));
    PetscCall(base_type::FactorPrepare(A, stream));
    {
      const auto mcu = MatCUPMCast(A);
      const auto da  = DeviceArrayReadWrite(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // make sure the workspace is large enough for getrf
      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *fact_lwork)
          {
            return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
          }
        )
      );
      // clang-format on
      if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));

      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
    }
    // standard getrf flop count
    PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Solve (or transpose-solve) with the LU factors for nrhs right-hand sides stored
  // column-major in x (leading dimension ldx); x is overwritten with the solution.
  // k is only used for logging.
  template <bool transpose>
  static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
  {
    const auto         mcu       = MatCUPMCast(A);
    const auto         fact_info = mcu->d_fact_info;
    const auto         fact_ipiv = mcu->d_fact_ipiv;
    cupmSolverHandle_t handle;

    PetscFunctionBegin;
    PetscCall(GetHandlesFrom_(dctx, &handle));
    PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
    PetscCall(PetscLogGpuTimeBegin());
    {
      constexpr auto op  = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
      const auto     da  = DeviceArrayRead(dctx, A);
      const auto     lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // getrs may need workspace too (e.g. on the HIP backend) — resize if so
      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *lwork)
          {
            return cupmSolverXgetrs_bufferSize(
              handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
            );
          }
        )
      );
      // clang-format on
      PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
      PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
    }
    PetscCall(PetscLogGpuTimeEnd());
    // two triangular solves per rhs
    PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};
447 
// Cholesky factorization backend: potrf/potrs on the lower triangle. Only supported for
// SPD matrices, since the cupmSolver libraries lack the sytrs/hetrs (indefinite) solves.
template <device::cupm::DeviceType T>
struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
  using base_type = SolveCommon<SolveCholesky>;

  static constexpr const char   *NAME() noexcept { return "Cholesky"; }
  static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }

  // In-place Cholesky factorization of A's device array (IS/MatFactorInfo ignored).
  // Rejects matrices not marked SPD because only potrf/potrs are available.
  static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
  {
    const auto         n = static_cast<cupmBlasInt_t>(A->rmap->n);
    PetscDeviceContext dctx;
    cupmSolverHandle_t handle;
    cupmStream_t       stream;

    PetscFunctionBegin;
    if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
    PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
    PetscCall(GetHandles_(&dctx, &handle, &stream));
    PetscCall(base_type::FactorPrepare(A, stream));
    {
      const auto mcu = MatCUPMCast(A);
      const auto da  = DeviceArrayReadWrite(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // size the workspace for potrf
      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *fact_lwork)
          {
            return cupmSolverXpotrf_bufferSize(
              handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
            );
          }
        )
      );
      // clang-format on
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
    }
    // standard potrf flop count
    PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

#if 0
    // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
    // and *hetr* routines. The code below should work, and it can be activated when *sytrs
    // routines will be available
    if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
    if (!mcu->d_fact_lwork) {
      PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
      PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
    }
    // NOTE(review): the condition below looks inverted — the live code everywhere else
    // allocates on "!mcu->d_fact_info"; presumably that was intended here too. Dead code
    // under #if 0, so harmless, but fix before ever enabling this path.
    if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Solve with the Cholesky factor for nrhs right-hand sides stored column-major in x
  // (overwritten with the solution). transpose is irrelevant for the symmetric case; k is
  // only used for logging.
  template <bool transpose>
  static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
  {
    const auto         mcu       = MatCUPMCast(A);
    const auto         fact_info = mcu->d_fact_info;
    cupmSolverHandle_t handle;

    PetscFunctionBegin;
    // d_fact_ipiv is only set by the (disabled) sytrf path; its presence would mean an
    // indefinite factorization, which we cannot solve with potrs
    PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
    PetscCall(GetHandlesFrom_(dctx, &handle));
    PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
    PetscCall(PetscLogGpuTimeBegin());
    {
      const auto da  = DeviceArrayRead(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

      // potrs may need workspace (backend-dependent) — resize if so
      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *lwork)
          {
            return cupmSolverXpotrs_bufferSize(
              handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
            );
          }
        )
      );
      // clang-format on
      PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
      PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
    }
    PetscCall(PetscLogGpuTimeEnd());
    // two triangular solves per rhs
    PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};
546 
// QR factorization backend: geqrf for the factorization, ormqr (apply Q) + trsm (solve
// with R) for the solves. Also usable for least-squares style solves (m != n).
template <device::cupm::DeviceType T>
struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
  using base_type = SolveCommon<SolveQR>;

  static constexpr const char   *NAME() noexcept { return "QR"; }
  static constexpr MatFactorType MATFACTORTYPE() noexcept { return "QR" ? MAT_FACTOR_QR : MAT_FACTOR_QR; }

  // In-place QR factorization of A's device array (IS/MatFactorInfo ignored). The rank is
  // recorded as min(m, n); the Householder scalars go to d_fact_tau.
  static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
  {
    const auto         m     = static_cast<cupmBlasInt_t>(A->rmap->n);
    const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
    const auto         min   = std::min(m, n);
    const auto         mimpl = MatIMPLCast(A);
    cupmStream_t       stream;
    cupmSolverHandle_t handle;
    PetscDeviceContext dctx;

    PetscFunctionBegin;
    if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
    PetscCall(GetHandles_(&dctx, &handle, &stream));
    PetscCall(base_type::FactorPrepare(A, stream));
    mimpl->rank = min;
    {
      const auto mcu = MatCUPMCast(A);
      const auto da  = DeviceArrayReadWrite(dctx, A);
      const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);

      // scratch vector and tau storage are created lazily and reused across solves
      if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
      if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
      // clang-format off
      PetscCall(
        base_type::ResizeFactLwork(
          mcu, stream,
          [&](cupmBlasInt_t *fact_lwork)
          {
            return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
          }
        )
      );
      // clang-format on
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
    }
    // standard geqrf flop count
    PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  // Solve with the QR factors for nrhs right-hand sides column-major in x (overwritten).
  // Forward solve: x <- R^-1 (Q^H x); transpose solve: x <- Q (R^-T x). k is for logging.
  template <bool transpose>
  static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
  {
    const auto         mimpl      = MatIMPLCast(A);
    const auto         rank       = static_cast<cupmBlasInt_t>(mimpl->rank);
    const auto         mcu        = MatCUPMCast(A);
    const auto         fact_info  = mcu->d_fact_info;
    const auto         fact_tau   = mcu->d_fact_tau;
    const auto         fact_work  = mcu->d_fact_work;
    const auto         fact_lwork = mcu->d_fact_lwork;
    cupmSolverHandle_t solver_handle;
    cupmBlasHandle_t   blas_handle;

    PetscFunctionBegin;
    PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
    PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
    PetscCall(PetscLogGpuTimeBegin());
    {
      const auto da  = DeviceArrayRead(dctx, A);
      const auto one = cupmScalarCast(1.0);
      const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);

      if (transpose) {
        // R^T y = x, then apply Q (OP_N).
        // NOTE(review): this path uses OP_T (plain transpose) even for complex scalars,
        // whereas the forward path uses OP_C — presumably matching MatSolveTranspose's
        // non-conjugating semantics; confirm against the host implementation.
        PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
        PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
        PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
      } else {
        // apply Q^H (conjugate transpose for complex), then solve R y = x
        constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;

        PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
        PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
        PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
      }
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
};
635 
// Generic MatSolve/MatSolveTranspose trampoline installed by FactorPrepare(): stages the
// right-hand side x into a device array, invokes Solver::Solve<transpose>() in place, then
// delivers the result to y. Copies are minimized by reading x from wherever its data
// currently lives and by solving directly in y's device array when y is long enough
// (k >= m); otherwise a persistent scratch vector of length m is used.
template <device::cupm::DeviceType T>
template <typename Solver, bool transpose>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
{
  using namespace vec::cupm;
  const auto         pobj_A  = PetscObjectCast(A);
  const auto         m       = static_cast<cupmBlasInt_t>(A->rmap->n);
  const auto         k       = static_cast<cupmBlasInt_t>(A->cmap->n);
  auto              &workvec = MatCUPMCast(A)->workvec;
  PetscScalar       *y_array = nullptr;
  PetscDeviceContext dctx;
  PetscBool          xiscupm, yiscupm, aiscupm;
  bool               use_y_array_directly;
  cupmStream_t       stream;

  PetscFunctionBegin;
  PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
  PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
  PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
  PetscCall(GetHandles_(&dctx, &stream));
  // solve in y's own device array only when it can hold the full length-m staging buffer
  use_y_array_directly = yiscupm && (k >= m);
  {
    const PetscScalar *x_array;
    const auto         xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
    const auto         copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;

    if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
    // The logic here is to try to minimize the amount of memory copying:
    //
    // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
    // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
    // data in order to copy it into the y array. So the array x will be wherever the data
    // already is so that only one memcpy is performed
    if (xisdevice) {
      PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
    } else {
      PetscCall(VecGetArrayRead(x, &x_array));
    }
    PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
    PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
    if (xisdevice) {
      PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
    } else {
      PetscCall(VecRestoreArrayRead(x, &x_array));
    }
  }

  // NOTE(review): aiscupm was asserted true above, so these MatConvert() branches are
  // effectively dead — presumably kept defensively (or for non-debug builds where the
  // assert compiles away); confirm before removing
  if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
  // in-place solve on the staged buffer: one rhs, leading dimension m
  PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
  if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));

  if (use_y_array_directly) {
    PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
  } else {
    const auto   copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
    PetscScalar *yv;

    // The logic here is that the data is not yet in either y's GPU array or its CPU array.
    // There is nothing in the interface to say where the user would like it to end up. So we
    // choose the GPU, because it is the faster option
    if (yiscupm) {
      PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
    } else {
      PetscCall(VecGetArray(y, &yv));
    }
    // only the first k entries of the length-m scratch hold the solution
    PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
    if (yiscupm) {
      PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
    } else {
      PetscCall(VecRestoreArray(y, &yv));
    }
    // NOTE(review): unlike the calls above this restore passes no dctx — presumably the
    // parameter is defaulted; confirm it targets the intended device context
    PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
713 
714 template <device::cupm::DeviceType T>
715 template <typename Solver, bool transpose>
716 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
717 {
718   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
719   const auto         k = static_cast<cupmBlasInt_t>(A->cmap->n);
720   cupmBlasInt_t      nrhs, ldb, ldx, ldy;
721   PetscScalar       *y;
722   PetscBool          biscupm, xiscupm, aiscupm;
723   PetscDeviceContext dctx;
724   cupmStream_t       stream;
725 
726   PetscFunctionBegin;
727   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
728   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
729   PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
730   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
731   PetscCall(GetHandles_(&dctx, &stream));
732   {
733     PetscInt n;
734 
735     PetscCall(MatGetSize(B, nullptr, &n));
736     PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
737     PetscCall(MatDenseGetLDA(B, &n));
738     PetscCall(PetscCUPMBlasIntCast(n, &ldb));
739     PetscCall(MatDenseGetLDA(X, &n));
740     PetscCall(PetscCUPMBlasIntCast(n, &ldx));
741   }
742   {
743     // The logic here is to try to minimize the amount of memory copying:
744     //
745     // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
746     // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
747     // get the data in order to copy it into the y array. So the array b will be wherever the
748     // data already is so that only one memcpy is performed
749     const auto         bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
750     const auto         copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
751     const PetscScalar *b;
752 
753     if (bisdevice) {
754       b = DeviceArrayRead(dctx, B);
755     } else if (biscupm) {
756       b = HostArrayRead(dctx, B);
757     } else {
758       PetscCall(MatDenseGetArrayRead(B, &b));
759     }
760 
761     if (ldx < m || !xiscupm) {
762       // X's array cannot serve as the array (too small or not on device), B's array cannot
763       // serve as the array (const), so allocate a new array
764       ldy = m;
765       PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
766     } else {
767       // X's array should serve as the array
768       ldy = ldx;
769       y   = DeviceArrayWrite(dctx, X);
770     }
771     PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
772     if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
773   }
774 
775   // convert to CUPM twice??????????????????????????????????
776   // but A should already be CUPM??????????????????????????????????????
777   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
778   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
779   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
780 
781   if (ldx < m || !xiscupm) {
782     const auto   copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
783     PetscScalar *x;
784 
785     // The logic here is that the data is not yet in either X's GPU array or its CPU
786     // array. There is nothing in the interface to say where the user would like it to end up.
787     // So we choose the GPU, because it is the faster option
788     if (xiscupm) {
789       x = DeviceArrayWrite(dctx, X);
790     } else {
791       PetscCall(MatDenseGetArray(X, &x));
792     }
793     PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
794     if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
795     PetscCallCUPM(cupmFreeAsync(y, stream));
796   }
797   PetscFunctionReturn(PETSC_SUCCESS);
798 }
799 
// Implements MatMult[Transpose]() (yy == nullptr, zz = op(A) xx) and
// MatMult[Transpose]Add() (zz = op(A) xx + yy), with op() selected by the transpose
// template parameter. See the op-table wiring in BindToCPU() for how yy is bound.
template <device::cupm::DeviceType T>
template <bool transpose>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
{
  const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
  const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
  cupmBlasHandle_t   handle;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add
  if (!m || !n) {
    // mult only
    if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
  PetscCall(GetHandles_(&dctx, &handle));
  {
    constexpr auto op   = transpose ? CUPMBLAS_OP_T : CUPMBLAS_OP_N;
    const auto     one  = cupmScalarCast(1.0);
    const auto     zero = cupmScalarCast(0.0);
    const auto     da   = DeviceArrayRead(dctx, A);
    const auto     dxx  = VecSeq_CUPM::DeviceArrayRead(dctx, xx);
    const auto     dzz  = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz);

    PetscCall(PetscLogGpuTimeBegin());
    // zz = 1.0 * op(A) * xx + beta * zz, where beta is 1 when accumulating (yy given, its
    // contents were copied into zz above) and 0 when overwriting
    PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata(), static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda), dxx.cupmdata(), 1, (yy ? &one : &zero), dzz.cupmdata(), 1));
    PetscCall(PetscLogGpuTimeEnd());
  }
  PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
  PetscFunctionReturn(PETSC_SUCCESS);
}
833 
834 // ==========================================================================================
835 // MatDense_Seq_CUPM - Private API - Conversion Dispatch
836 // ==========================================================================================
837 
// Shared implementation behind Convert_SeqDenseCUPM_SeqDense() (to_host = true) and
// Convert_SeqDense_SeqDenseCUPM() (to_host = false)
template <device::cupm::DeviceType T>
template <bool to_host>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
    // TODO these cases should be optimized
    PetscCall(MatConvert_Basic(M, type, reuse, newmat));
  } else {
    // MAT_INPLACE_MATRIX: mutate *newmat itself into the target type
    const auto B    = *newmat;
    const auto pobj = PetscObjectCast(B);

    if (to_host) {
      // sync the host copy and tear down all device-side state
      PetscCall(BindToCPU(B, PETSC_TRUE));
      PetscCall(Reset(B));
    } else {
      PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
    }

    PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
    PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
    // cvec might be the wrong VecType, destroy and rebuild it if necessary
    // REVIEW ME: this is possibly very inefficient
    PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));

    // compose the device-specific functions when converting to device; the host variant
    // passed is nullptr, so they are removed when converting to host
    MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
    MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
    MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);

    if (to_host) {
      B->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      // create the (zeroed) device-side storage struct; the actual device array is
      // allocated lazily by SetPreallocation()/the array accessors
      Mat_SeqDenseCUPM *mcu;

      PetscCall(PetscNew(&mcu));
      B->spptr       = mcu;
      B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
      PetscCall(BindToCPU(B, PETSC_FALSE));
    }

    MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
    MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
892 
893 // ==========================================================================================
894 // MatDense_Seq_CUPM - Public API
895 // ==========================================================================================
896 
// The concrete matrix type string this implementation class handles
template <device::cupm::DeviceType T>
inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
{
  return MATSEQDENSECUPM();
}
902 
// Access the device-side storage struct stashed in m->spptr (null if the device side has
// not been set up -- see the guard in Reset())
template <device::cupm::DeviceType T>
inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
{
  return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
}
908 
// Access the host-side Mat_SeqDense implementation struct stored in m->data
template <device::cupm::DeviceType T>
inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
{
  return static_cast<Mat_SeqDense *>(m->data);
}
914 
915 template <device::cupm::DeviceType T>
916 inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
917 {
918   return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
919 }
920 
921 template <device::cupm::DeviceType T>
922 inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
923 {
924   return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
925 }
926 
927 // ==========================================================================================
928 
// MatCreate_SeqDenseCUPM()
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
  // build a plain host MATSEQDENSE first, then convert it in place to the device type
  PetscCall(MatCreate_SeqDense(A));
  PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
939 
// mat->ops->destroy: convert back to the host type (which tears down device-side state)
// and destroy as a plain MATSEQDENSE
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
{
  PetscFunctionBegin;
  // prevent copying back data if we own the data pointer
  if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
  PetscCall(MatDestroy_SeqDense(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
950 
951 // obj->ops->setup()
952 template <device::cupm::DeviceType T>
953 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
954 {
955   PetscFunctionBegin;
956   PetscCall(PetscLayoutSetUp(A->rmap));
957   PetscCall(PetscLayoutSetUp(A->cmap));
958   if (!A->preallocated) {
959     PetscDeviceContext dctx;
960 
961     PetscCall(GetHandles_(&dctx));
962     PetscCall(SetPreallocation(A, dctx, nullptr));
963   }
964   PetscFunctionReturn(PETSC_SUCCESS);
965 }
966 
// Free all device-side resources attached to A: the device array (only if we allocated
// it), factorization scratch buffers, the work vector, and the storage struct itself
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
{
  PetscFunctionBegin;
  if (const auto mcu = MatCUPMCast(A)) {
    cupmStream_t stream;

    // a placed array must be reset before we may free/discard the stashed original
    PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
    PetscCall(GetHandles_(&stream));
    if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
    PetscCall(VecDestroy(&mcu->workvec));
    PetscCall(PetscFree(A->spptr /* mcu */));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
986 
987 // ==========================================================================================
988 
// mat->ops->bindtocpu: switch A between host (to_host = PETSC_TRUE) and device execution
// by swapping every composed function and function-pointer op between the MatSeqDense host
// implementation and the CUPM one
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto pobj  = PetscObjectCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  A->boundtocpu = to_host;
  PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
  if (to_host) {
    PetscDeviceContext dctx;

    // make sure we have an up-to-date copy on the CPU
    PetscCall(GetHandles_(&dctx));
    PetscCall(DeviceToHost_(A, dctx));
  } else {
    PetscBool iscupm;

    // the cached column vector/submatrix may be of the wrong (host) type after a round
    // trip; drop them so they are rebuilt with the correct type on demand
    if (auto &cvec = mimpl->cvec) {
      PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
      if (!iscupm) PetscCall(VecDestroy(&cvec));
    }
    if (auto &cmat = mimpl->cmat) {
      PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
      if (!iscupm) PetscCall(MatDestroy(&cmat));
    }
  }

  // ============================================================
  // Composed ops
  // ============================================================
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
  MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
  MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
  // always the same
  PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));

  // ============================================================
  // Function pointer ops
  // ============================================================
  MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
  MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false>(A, xx, nullptr, yy); });
  MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true>(A, xx, nullptr, yy); });
  MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false>);
  MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true>);
  MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
  MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
  MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
  MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
  MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
  MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
  MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
  MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
  MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
  MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
  MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
  MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
  MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
  // seemingly always the same
  A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;

  // propagate the binding to the cached submatrix, if any
  if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1070 
// Device -> host conversion entry point (composed as MatConvert_seqdensecupm_seqdense_C)
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1078 
// Host -> device conversion entry point (used by Create() and friends)
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
{
  PetscFunctionBegin;
  PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1086 
1087 // ==========================================================================================
1088 
// Return in *array a pointer to m's data in the requested memory space: accesses that read
// first sync that space with the other; pure WRITE accesses only ensure an allocation
// exists. Accesses that write mark that space as the sole up-to-date copy.
template <device::cupm::DeviceType T>
template <PetscMemType mtype, PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
{
  constexpr auto hostmem     = PetscMemTypeHost(mtype);
  constexpr auto read_access = PetscMemoryAccessRead(access);

  PetscFunctionBegin;
  static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
  PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
  if (hostmem) {
    if (read_access) {
      // caller will read: ensure the host copy is current
      PetscCall(DeviceToHost_(m, dctx));
    } else if (!MatIMPLCast(m)->v) {
      // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
      PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
    }
    *array = MatIMPLCast(m)->v;
  } else {
    if (read_access) {
      // caller will read: ensure the device copy is current
      PetscCall(HostToDevice_(m, dctx));
    } else if (!MatCUPMCast(m)->d_v) {
      // write-only
      PetscCall(SetPreallocation(m, dctx, nullptr));
    }
    *array = MatCUPMCast(m)->d_v;
  }
  if (PetscMemoryAccessWrite(access)) {
    // the returned pointer may be written through, so this space now holds the only
    // authoritative copy
    m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
    PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1122 
1123 template <device::cupm::DeviceType T>
1124 template <PetscMemType mtype, PetscMemoryAccessMode access>
1125 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
1126 {
1127   PetscFunctionBegin;
1128   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1129   if (PetscMemoryAccessWrite(access)) {
1130     // WRITE or READ_WRITE
1131     m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1132     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1133   }
1134   if (array) {
1135     PetscCall(CheckPointerMatchesMemType_(*array, mtype));
1136     *array = nullptr;
1137   }
1138   PetscFunctionReturn(PETSC_SUCCESS);
1139 }
1140 
// Device-memory GetArray() that also reports the memory type of the returned pointer
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
  if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
  PetscFunctionReturn(PETSC_SUCCESS);
}
1150 
// Counterpart to GetArrayAndMemType()
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
{
  PetscFunctionBegin;
  PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1159 
1160 // ==========================================================================================
1161 
// MatDenseCUPMPlaceArray(): temporarily substitute the user-provided device array for the
// matrix's own. The original array (and its ownership flag) is stashed in unplacedarray
// until ResetArray() restores it.
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
  if (mimpl->v) {
    PetscDeviceContext dctx;

    // flush any pending host data into the current device array before swapping it out
    PetscCall(GetHandles_(&dctx));
    PetscCall(HostToDevice_(A, dctx));
  }
  // swap in the user array; a placed array is always treated as user-allocated so it is
  // never freed by us
  mcu->unplacedarray         = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
  mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
  PetscFunctionReturn(PETSC_SUCCESS);
}
1182 
// MatDenseCUPMReplaceArray(): permanently replace the matrix's device array with the given
// one. The old array is freed if we owned it; d_user_alloc is cleared, so the replacement
// will be freed by Reset() (i.e. ownership transfers to the matrix).
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
  if (!mcu->d_user_alloc) {
    cupmStream_t stream;

    PetscCall(GetHandles_(&stream));
    PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
  }
  mcu->d_v          = const_cast<PetscScalar *>(array);
  mcu->d_user_alloc = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1203 
// MatDenseCUPMResetArray(): undo a previous PlaceArray(), restoring the stashed original
// device array and its ownership flag
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
{
  const auto mimpl = MatIMPLCast(A);
  const auto mcu   = MatCUPMCast(A);

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  if (mimpl->v) {
    PetscDeviceContext dctx;

    // flush any pending host data into the placed array before it is swapped back out
    PetscCall(GetHandles_(&dctx));
    PetscCall(HostToDevice_(A, dctx));
  }
  mcu->d_v          = util::exchange(mcu->unplacedarray, nullptr);
  mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1223 
1224 // ==========================================================================================
1225 
// C = op(A) * op(B) via a single cupmBlasXgemm, with op() chosen by the
// transpose_A/transpose_B template parameters
template <device::cupm::DeviceType T>
template <bool transpose_A, bool transpose_B>
inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
{
  cupmBlasInt_t      m, n, k;
  PetscBool          Aiscupm, Biscupm;
  PetscDeviceContext dctx;
  cupmBlasHandle_t   handle;

  PetscFunctionBegin;
  PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
  PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
  PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
  // NOTE(review): when k == 0 but m, n > 0 this early return leaves C unmodified rather
  // than zeroed -- confirm that is intended
  if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);

  // we may end up with SEQDENSE as one of the arguments
  // REVIEW ME: how? and why is it not B and C????????
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
  PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
  PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
  PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
  PetscCall(GetHandles_(&dctx, &handle));

  PetscCall(PetscLogGpuTimeBegin());
  {
    const auto one  = cupmScalarCast(1.0);
    const auto zero = cupmScalarCast(0.0);
    const auto da   = DeviceArrayRead(dctx, A);
    const auto db   = DeviceArrayRead(dctx, B);
    const auto dc   = DeviceArrayWrite(dctx, C);
    PetscInt   alda, blda, clda;

    PetscCall(MatDenseGetLDA(A, &alda));
    PetscCall(MatDenseGetLDA(B, &blda));
    PetscCall(MatDenseGetLDA(C, &clda));
    // C = 1.0 * op(A) * op(B) + 0.0 * C
    PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
  }
  PetscCall(PetscLogGpuTimeEnd());

  PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
  // undo the in-place conversions so the caller's matrices come back in their original type
  if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
  if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1271 
// B = A, performed entirely on the device when both matrices share this copy
// implementation, otherwise via the generic MatCopy_Basic() fallback
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
{
  const auto m = A->rmap->n;
  const auto n = A->cmap->n;

  PetscFunctionBegin;
  PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
  // The two matrices must have the same copy implementation to be eligible for fast copy
  if (A->ops->copy == B->ops->copy) {
    PetscDeviceContext dctx;
    cupmStream_t       stream;

    PetscCall(GetHandles_(&dctx, &stream));
    PetscCall(PetscLogGpuTimeBegin());
    {
      const auto va = DeviceArrayRead(dctx, A);
      const auto vb = DeviceArrayWrite(dctx, B);
      // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
      // lda!
      const auto lda_a = MatIMPLCast(A)->lda;
      const auto lda_b = MatIMPLCast(B)->lda;

      if (lda_a > m || lda_b > m) {
        // at least one matrix has padded rows: strided 2-D copy, column by column
        PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
        PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
        PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
      } else {
        // both contiguous: single flat copy
        PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
      }
    }
    PetscCall(PetscLogGpuTimeEnd());
  } else {
    PetscCall(MatCopy_Basic(A, B, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1309 
1310 template <device::cupm::DeviceType T>
1311 inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1312 {
1313   PetscDeviceContext dctx;
1314   cupmStream_t       stream;
1315 
1316   PetscFunctionBegin;
1317   PetscCall(GetHandles_(&dctx, &stream));
1318   PetscCall(PetscLogGpuTimeBegin());
1319   {
1320     const auto va  = DeviceArrayWrite(dctx, m);
1321     const auto lda = MatIMPLCast(m)->lda;
1322     const auto ma  = m->rmap->n;
1323     const auto na  = m->cmap->n;
1324 
1325     if (lda > ma) {
1326       PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1327     } else {
1328       PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1329     }
1330   }
1331   PetscCall(PetscLogGpuTimeEnd());
1332   PetscFunctionReturn(PETSC_SUCCESS);
1333 }
1334 
1335 namespace detail
1336 {
1337 
1338 // ==========================================================================================
1339 // SubMatIndexFunctor
1340 //
1341 // Iterator which permutes a linear index range into matrix indices for am nrows x ncols
1342 // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1343 // the i'th sequential entry in the matrix.
1344 // ==========================================================================================
1345 template <typename T>
1346 struct SubMatIndexFunctor {
1347   PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }
1348 
1349   PetscInt nrows;
1350   PetscInt ncols;
1351   PetscInt lda;
1352 };
1353 
1354 template <typename Iterator>
1355 struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>> {
1356   using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>>;
1357 
1358   using iterator = typename base_type::iterator;
1359 
1360   constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
1361     base_type{
1362       std::move(first), std::move(last), {nrows, ncols, lda}
1363   }
1364   {
1365   }
1366 
1367   PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
1368 };
1369 
1370 namespace
1371 {
1372 
1373 template <typename T>
1374 PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
1375 {
1376   const auto nrows = rend - rstart;
1377   const auto ncols = cend - cstart;
1378   const auto dptr  = thrust::device_pointer_cast(ptr);
1379 
1380   return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
1381 }
1382 
1383 } // namespace
1384 
1385 } // namespace detail
1386 
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
{
  // Scale every entry of A by alpha, entirely on the device
  const auto         m = A->rmap->n;
  const auto         n = A->cmap->n;
  const auto         N = m * n;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
  PetscCall(GetHandles_(&dctx));
  {
    const auto da  = DeviceArrayReadWrite(dctx, A);
    const auto lda = MatIMPLCast(A)->lda;

    if (lda > m) {
      // padded columns: the data is not contiguous, so iterate only over the valid
      // m x n entries via the submatrix permutation iterator and scale with thrust
      cupmStream_t stream;

      PetscCall(GetHandlesFrom_(dctx, &stream));
      // clang-format off
      PetscCallThrust(
        const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());

        THRUST_CALL(
          thrust::transform,
          stream,
          sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
          device::cupm::functors::make_times_equals(alpha)
        )
      );
      // clang-format on
    } else {
      // contiguous storage: a single BLAS scal over all N entries suffices
      const auto       cu_alpha = cupmScalarCast(alpha);
      cupmBlasHandle_t handle;

      PetscCall(GetHandlesFrom_(dctx, &handle));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  // one multiply per entry
  PetscCall(PetscLogGpuFlops(N));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1431 
1432 template <device::cupm::DeviceType T>
1433 inline PetscErrorCode MatDense_Seq_CUPM<T>::Shift(Mat A, PetscScalar alpha) noexcept
1434 {
1435   const auto         m = A->rmap->n;
1436   const auto         n = A->cmap->n;
1437   PetscDeviceContext dctx;
1438 
1439   PetscFunctionBegin;
1440   PetscCall(GetHandles_(&dctx));
1441   PetscCall(PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1442   PetscCall(DiagonalUnaryTransform(A, 0, m, n, dctx, device::cupm::functors::make_plus_equals(alpha)));
1443   PetscFunctionReturn(PETSC_SUCCESS);
1444 }
1445 
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
{
  // Y += alpha * X, entry-wise, entirely on the device (MatStructure is ignored: both
  // operands are dense)
  const auto         m_x = X->rmap->n, m_y = Y->rmap->n;
  const auto         n_x = X->cmap->n, n_y = Y->cmap->n;
  const auto         N = m_x * n_x;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  // nothing to do for an empty matrix or a zero scaling factor
  if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
  PetscCall(GetHandles_(&dctx));
  {
    const auto dx    = DeviceArrayRead(dctx, X);
    const auto dy    = DeviceArrayReadWrite(dctx, Y);
    const auto lda_x = MatIMPLCast(X)->lda;
    const auto lda_y = MatIMPLCast(Y)->lda;

    // NOTE(review): lda_y is compared against m_x, which presumes X and Y have equal row
    // counts (presumably enforced by the caller) -- TODO confirm
    if (lda_x > m_x || lda_y > m_x) {
      // at least one operand has padded columns: walk only the valid entries of each
      // matrix via submatrix permutation iterators and apply axpy element-wise
      cupmStream_t stream;

      PetscCall(GetHandlesFrom_(dctx, &stream));
      // clang-format off
      PetscCallThrust(
        const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
        const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());

        THRUST_CALL(
          thrust::transform,
          stream,
          sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
          device::cupm::functors::make_axpy(alpha)
        );
      );
      // clang-format on
    } else {
      // both operands contiguous: a single flat BLAS axpy over all N entries
      const auto       cu_alpha = cupmScalarCast(alpha);
      cupmBlasHandle_t handle;

      PetscCall(GetHandlesFrom_(dctx, &handle));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1494 
1495 template <device::cupm::DeviceType T>
1496 inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
1497 {
1498   const auto         hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
1499   PetscDeviceContext dctx;
1500 
1501   PetscFunctionBegin;
1502   PetscCall(GetHandles_(&dctx));
1503   // do not call SetPreallocation() yet, we call it afterwards??
1504   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
1505   PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
1506   if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
1507   // allocate memory if needed
1508   if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
1509   PetscFunctionReturn(PETSC_SUCCESS);
1510 }
1511 
1512 template <device::cupm::DeviceType T>
1513 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
1514 {
1515   PetscBool device;
1516 
1517   PetscFunctionBegin;
1518   PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
1519   if (device) {
1520     const auto         m = A->rmap->n;
1521     const auto         n = A->cmap->n;
1522     PetscDeviceContext dctx;
1523 
1524     PetscCall(GetHandles_(&dctx));
1525     {
1526       const auto a = DeviceArrayWrite(dctx, A);
1527       PetscInt   lda;
1528 
1529       PetscCall(MatDenseGetLDA(A, &lda));
1530       if (lda > m) {
1531         for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
1532       } else {
1533         PetscInt mn;
1534 
1535         PetscCall(PetscIntMultError(m, n, &mn));
1536         PetscCall(PetscRandomGetValues(rng, mn, a));
1537       }
1538     }
1539   } else {
1540     PetscCall(MatSetRandom_SeqDense(A, rng));
1541   }
1542   PetscFunctionReturn(PETSC_SUCCESS);
1543 }
1544 
1545 // ==========================================================================================
1546 
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
{
  // Copy column `col` of A into the vector v, choosing the memory space of the copy
  // based on v's type and A's current offload state
  const auto         offloadmask = A->offloadmask;
  const auto         n           = A->rmap->n;
  // start of column `col` in a column-major array with leading dimension lda
  const auto         col_offset  = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
  PetscBool          viscupm;
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
  PetscCall(GetHandles_(&dctx, &stream));
  if (viscupm && !v->boundtocpu) {
    // v is a CUPM vector living on the device: write directly into its device array
    const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);

    // update device data
    if (PetscOffloadDevice(offloadmask)) {
      PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
    } else {
      // A's values are only current on the host
      PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
    }
  } else {
    PetscScalar *x;

    // update host data
    PetscCall(VecGetArrayWrite(v, &x));
    if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
      PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
    } else if (PetscOffloadDevice(offloadmask)) {
      PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
    }
    PetscCall(VecRestoreArrayWrite(v, &x));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1583 
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
{
  // Lend out the internal column vector aliasing column `col` of A's device array. The
  // borrow must be returned via RestoreColumnVec() before any other column/submatrix
  // access; `access` selects read/write/read-write semantics for the underlying array.
  using namespace vec::cupm;
  const auto         mimpl = MatIMPLCast(A);
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  // sentinel marking the borrow; col + 1 so that column 0 is still truthy
  mimpl->vecinuse = col + 1;
  // the column vector is created lazily on first use and then reused
  if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
  PetscCall(GetHandles_(&dctx));
  // stash the raw device pointer (released again in RestoreColumnVec())
  PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
  // point cvec at the start of column `col` (column-major, leading dimension lda)
  PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
  if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
  *v = mimpl->cvec;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1604 
template <device::cupm::DeviceType T>
template <PetscMemoryAccessMode access>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
{
  // Return the column vector borrowed via GetColumnVec(): undo the read-lock and placed
  // array, release the stashed device pointer, and clear the in-use sentinel. `access`
  // must match the mode used when the column was taken.
  using namespace vec::cupm;
  const auto         mimpl = MatIMPLCast(A);
  const auto         cvec  = mimpl->cvec;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
  PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
  mimpl->vecinuse = 0;
  if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
  // detach cvec from A's device array before releasing the array itself
  PetscCall(VecCUPMResetArrayAsync<T>(cvec));
  PetscCall(GetHandles_(&dctx));
  PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
  if (v) *v = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1625 
1626 // ==========================================================================================
1627 
1628 template <device::cupm::DeviceType T>
1629 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
1630 {
1631   Mat                fact = nullptr;
1632   PetscDeviceContext dctx;
1633 
1634   PetscFunctionBegin;
1635   PetscCall(GetHandles_(&dctx));
1636   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
1637   fact->factortype = ftype;
1638   switch (ftype) {
1639   case MAT_FACTOR_LU:
1640   case MAT_FACTOR_ILU: // fall-through
1641     fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
1642     fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1643     break;
1644   case MAT_FACTOR_CHOLESKY:
1645   case MAT_FACTOR_ICC: // fall-through
1646     fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1647     break;
1648   case MAT_FACTOR_QR: {
1649     const auto pobj = PetscObjectCast(fact);
1650 
1651     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
1652     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1653   } break;
1654   case MAT_FACTOR_NONE:
1655   case MAT_FACTOR_ILUDT:     // fall-through
1656   case MAT_FACTOR_NUM_TYPES: // fall-through
1657     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
1658   }
1659   PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
1660   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
1661   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
1662   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
1663   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
1664   *fact_out = fact;
1665   PetscFunctionReturn(PETSC_SUCCESS);
1666 }
1667 
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
{
  // Replace the Cholesky factor stored in A with the explicit inverse of the original
  // matrix, computed in place on the device via potri, then reset A's factor state.
  const auto         mimpl = MatIMPLCast(A);
  const auto         mcu   = MatCUPMCast(A);
  const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
  cupmSolverHandle_t handle;
  PetscDeviceContext dctx;
  cupmStream_t       stream;

  PetscFunctionBegin;
#if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
  // HIP appears to have this by default??
  PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
#endif
  // empty matrix: nothing to invert
  if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
  // spd
  // pivots are only allocated for the non-SPD (sytrf) path, which has no *sytri here
  PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());

  PetscCall(GetHandles_(&dctx, &handle, &stream));
  {
    const auto    da  = DeviceArrayReadWrite(dctx, A);
    const auto    lda = static_cast<cupmBlasInt_t>(mimpl->lda);
    cupmBlasInt_t il;

    // grow the cached device workspace if potri needs more than what previous
    // factorization steps left behind
    PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
    if (il > mcu->d_fact_lwork) {
      mcu->d_fact_lwork = il;
      PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
      PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
    }
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
    PetscCall(PetscLogGpuTimeEnd());
  }
  PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
  // TODO (write cuda kernel)
  // potri only fills the lower triangle; mirror it into the upper triangle
  PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
  PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

  // A now holds an explicit inverse, not a factorization: clear the solve hooks and
  // factor metadata
  A->ops->solve          = nullptr;
  A->ops->solvetranspose = nullptr;
  A->ops->matsolve       = nullptr;
  A->factortype          = MAT_FACTOR_NONE;

  PetscCall(PetscFree(A->solvertype));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1717 
1718 // ==========================================================================================
1719 
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
{
  // Lend out a dense submatrix view covering rows [rbegin, rend) and columns
  // [cbegin, cend) of A, aliasing A's device (and, if present, host) storage. Must be
  // returned via RestoreSubMatrix() before any other column/submatrix access.
  const auto         mimpl        = MatIMPLCast(A);
  // offset of entry (rbegin, cbegin) in a column-major array with leading dimension lda
  const auto         array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
  const auto         n            = rend - rbegin; // number of rows in the view
  const auto         m            = cend - cbegin; // number of columns in the view
  auto              &cmat         = mimpl->cmat;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
  PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
  // sentinel marking the borrow; cbegin + 1 so that a view starting at column 0 is truthy
  mimpl->matinuse = cbegin + 1;

  PetscCall(GetHandles_(&dctx));
  // make sure the device copy is current before aliasing it
  PetscCall(HostToDevice_(A, dctx));

  // the cached view matrix is only reusable if its dimensions match
  if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
  {
    const auto device_array = array_offset(MatCUPMCast(A)->d_v);

    if (cmat) {
      PetscCall(PlaceArray(cmat, device_array));
    } else {
      PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
    }
  }
  PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
  // place CPU array if present but do not copy any data
  if (const auto host_array = mimpl->v) {
    // NOTE(review): temporarily marking the view as GPU-current presumably stops
    // MatDensePlaceArray() from syncing data to the host -- confirm against its impl
    cmat->offloadmask = PETSC_OFFLOAD_GPU;
    PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
  }

  // the view inherits A's notion of which copy is up to date
  cmat->offloadmask = A->offloadmask;
  *mat              = cmat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1759 
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
{
  // Return the submatrix view borrowed via GetSubMatrix(): detach the placed device/host
  // arrays and propagate the view's offload state back to A.
  const auto mimpl = MatIMPLCast(A);
  const auto cmat  = mimpl->cmat;
  // a host array was placed on the view iff A itself has host storage
  const auto reset = static_cast<bool>(mimpl->v);
  bool       copy, was_offload_host;

  PetscFunctionBegin;
  PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
  PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
  PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
  mimpl->matinuse = 0;

  // calls to ResetArray may change it, so save it here
  was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
  if (was_offload_host && !reset) {
    // the view was modified on the host but A has no host storage to alias: allocate it
    // so the data can be brought back from the device below
    copy = true;
    PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
  } else {
    copy = false;
  }

  PetscCall(ResetArray(cmat));
  if (reset) PetscCall(MatDenseResetArray(cmat));
  if (copy) {
    PetscDeviceContext dctx;

    PetscCall(GetHandles_(&dctx));
    PetscCall(DeviceToHost_(A, dctx));
  } else {
    // no copy needed: A is current wherever the view was last touched
    A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
  }

  // the detached view no longer aliases any storage
  cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  *m                = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1798 
1799 // ==========================================================================================
1800 
1801 namespace
1802 {
1803 
1804 template <device::cupm::DeviceType T>
1805 inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1806 {
1807   PetscFunctionBegin;
1808   if (TA) {
1809     if (TB) {
1810       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1811     } else {
1812       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1813     }
1814   } else {
1815     if (TB) {
1816       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1817     } else {
1818       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1819     }
1820   }
1821   PetscFunctionReturn(PETSC_SUCCESS);
1822 }
1823 
1824 template <device::cupm::DeviceType T>
1825 inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
1826 {
1827   PetscFunctionBegin;
1828   for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
1829     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
1830     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
1831   }
1832   PetscFunctionReturn(PETSC_SUCCESS);
1833 }
1834 
1835 } // anonymous namespace
1836 
1837 } // namespace impl
1838 
1839 } // namespace cupm
1840 
1841 } // namespace mat
1842 
1843 } // namespace Petsc
1844 
1845 #endif // PETSCMATSEQDENSECUPM_HPP
1846