xref: /petsc/src/mat/impls/dense/seq/cupm/matseqdensecupm.hpp (revision 901f93825bbaa60582af604c6700caf57884a2e1) !
1 #pragma once
2 
3 #include <petsc/private/matdensecupmimpl.h> /*I <petscmat.h> I*/
4 #include <../src/mat/impls/dense/seq/dense.h>
5 
6 #include <petsc/private/deviceimpl.h> // PetscDeviceContextGetOptionalNullContext_Internal()
7 #include <petsc/private/randomimpl.h> // _p_PetscRandom
8 #include <petsc/private/vecimpl.h>    // _p_Vec
9 #include <petsc/private/cupmobject.hpp>
10 #include <petsc/private/cupmsolverinterface.hpp>
11 
12 #include <petsc/private/cpp/type_traits.hpp> // PetscObjectCast()
13 #include <petsc/private/cpp/utility.hpp>     // util::exchange()
14 
15 #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp> // for VecSeq_CUPM
16 
17 namespace Petsc
18 {
19 
20 namespace mat
21 {
22 
23 namespace cupm
24 {
25 
26 namespace impl
27 {
28 
29 template <device::cupm::DeviceType T>
30 class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
31 public:
32   MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);
33 
34 private:
35   struct Mat_SeqDenseCUPM {
36     PetscScalar *d_v;           // pointer to the matrix on the GPU
37     PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
38     bool         d_user_alloc;
39     bool         d_unplaced_user_alloc;
40     // factorization support
41     cupmBlasInt_t *d_fact_ipiv;  // device pivots
42     cupmScalar_t  *d_fact_tau;   // device QR tau vector
43     cupmBlasInt_t *d_fact_info;  // device info
44     cupmScalar_t  *d_fact_work;  // device workspace
45     cupmBlasInt_t  d_fact_lwork; // size of device workspace
46     // workspace
47     Vec workvec;
48   };
49 
50   static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;
51 
52   static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
53   static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;
54 
55   static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;
56 
57   template <typename Derived>
58   struct SolveCommon;
59   struct SolveQR;
60   struct SolveCholesky;
61   struct SolveLU;
62 
63   template <typename Solver, bool transpose>
64   static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
65   template <typename Solver, bool transpose>
66   static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
67   template <bool transpose, bool hermitian>
68   static PetscErrorCode MatMultAddColumnRange_Dispatch_(Mat, Vec, Vec, Vec, PetscInt, PetscInt) noexcept;
69   template <bool transpose, bool hermitian>
70   static PetscErrorCode MatMultColumnRange_Dispatch_(Mat, Vec, Vec, PetscInt, PetscInt) noexcept;
71   template <bool transpose, bool hermitian>
72   static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;
73 
74   template <bool to_host>
75   static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;
76 
77   PETSC_NODISCARD static constexpr MatType       MATIMPLCUPM_() noexcept;
78   PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;
79 
80 public:
81   PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;
82 
83   // define these by hand since they don't fit the above mold
84   PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
85   PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;
86 
87   static PetscErrorCode Create(Mat) noexcept;
88   static PetscErrorCode Destroy(Mat) noexcept;
89   static PetscErrorCode SetUp(Mat) noexcept;
90   static PetscErrorCode Reset(Mat) noexcept;
91 
92   static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
93   static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
94   static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;
95 
96   template <PetscMemType, PetscMemoryAccessMode>
97   static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
98   template <PetscMemType, PetscMemoryAccessMode>
99   static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
100   template <PetscMemoryAccessMode>
101   static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
102   template <PetscMemoryAccessMode>
103   static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;
104 
105 private:
106   template <PetscMemType mtype, PetscMemoryAccessMode mode>
GetArrayC_(Mat m,PetscScalar ** p)107   static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
108   {
109     PetscDeviceContext dctx;
110 
111     PetscFunctionBegin;
112     PetscCall(GetHandles_(&dctx));
113     PetscCall(GetArray<mtype, mode>(m, p, dctx));
114     PetscFunctionReturn(PETSC_SUCCESS);
115   }
116 
117   template <PetscMemType mtype, PetscMemoryAccessMode mode>
RestoreArrayC_(Mat m,PetscScalar ** p)118   static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
119   {
120     PetscDeviceContext dctx;
121 
122     PetscFunctionBegin;
123     PetscCall(GetHandles_(&dctx));
124     PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
125     PetscFunctionReturn(PETSC_SUCCESS);
126   }
127 
128   template <PetscMemoryAccessMode mode>
GetArrayAndMemTypeC_(Mat m,PetscScalar ** p,PetscMemType * tp)129   static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
130   {
131     PetscDeviceContext dctx;
132 
133     PetscFunctionBegin;
134     PetscCall(GetHandles_(&dctx));
135     PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
136     PetscFunctionReturn(PETSC_SUCCESS);
137   }
138 
139   template <PetscMemoryAccessMode mode>
RestoreArrayAndMemTypeC_(Mat m,PetscScalar ** p)140   static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
141   {
142     PetscDeviceContext dctx;
143 
144     PetscFunctionBegin;
145     PetscCall(GetHandles_(&dctx));
146     PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
147     PetscFunctionReturn(PETSC_SUCCESS);
148   }
149 
150 public:
151   static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
152   static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
153   static PetscErrorCode ResetArray(Mat) noexcept;
154 
155   template <bool transpose_A, bool transpose_B>
156   static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
157   static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
158   static PetscErrorCode ZeroEntries(Mat) noexcept;
159   static PetscErrorCode Conjugate(Mat) noexcept;
160   static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
161   static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
162   static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
163   static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;
164 
165   static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
166   template <PetscMemoryAccessMode>
167   static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
168   template <PetscMemoryAccessMode>
169   static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;
170 
171   static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
172   static PetscErrorCode InvertFactors(Mat) noexcept;
173 
174   static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
175   static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
176 };
177 
178 } // namespace impl
179 
180 namespace
181 {
182 
183 // Declare this here so that the functions below can make use of it
184 template <device::cupm::DeviceType T>
MatCreateSeqDenseCUPM(MPI_Comm comm,PetscInt m,PetscInt n,PetscScalar * data,Mat * A,PetscDeviceContext dctx=nullptr,bool preallocate=true)185 inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
186 {
187   PetscFunctionBegin;
188   PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
189   PetscFunctionReturn(PETSC_SUCCESS);
190 }
191 
192 } // anonymous namespace
193 
194 namespace impl
195 {
196 
197 // ==========================================================================================
198 // MatDense_Seq_CUPM - Private API - Utility
199 // ==========================================================================================
200 
201 template <device::cupm::DeviceType T>
SetPreallocation_(Mat m,PetscDeviceContext dctx,PetscScalar * user_device_array)202 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
203 {
204   const auto   mcu   = MatCUPMCast(m);
205   const auto   nrows = m->rmap->n;
206   const auto   ncols = m->cmap->n;
207   auto        &lda   = MatIMPLCast(m)->lda;
208   cupmStream_t stream;
209 
210   PetscFunctionBegin;
211   PetscCheckTypeName(m, MATSEQDENSECUPM());
212   PetscValidDeviceContext(dctx, 2);
213   PetscCall(checkCupmBlasIntCast(nrows));
214   PetscCall(checkCupmBlasIntCast(ncols));
215   PetscCall(GetHandlesFrom_(dctx, &stream));
216   if (lda <= 0) lda = nrows;
217   if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
218   if (user_device_array) {
219     mcu->d_user_alloc = PETSC_TRUE;
220     mcu->d_v          = user_device_array;
221   } else {
222     std::size_t size;
223 
224     mcu->d_user_alloc = PETSC_FALSE;
225     size              = lda * ncols;
226     PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
227     PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
228   }
229   m->offloadmask = PETSC_OFFLOAD_GPU;
230   PetscFunctionReturn(PETSC_SUCCESS);
231 }
232 
233 template <device::cupm::DeviceType T>
HostToDevice_(Mat m,PetscDeviceContext dctx)234 inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
235 {
236   const auto nrows = m->rmap->n;
237   const auto ncols = m->cmap->n;
238   const auto copy  = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;
239 
240   PetscFunctionBegin;
241   PetscCheckTypeName(m, MATSEQDENSECUPM());
242   if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
243   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
244   if (copy) {
245     const auto   mcu = MatCUPMCast(m);
246     cupmStream_t stream;
247 
248     // Allocate GPU memory if not present
249     if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr));
250     PetscCall(GetHandlesFrom_(dctx, &stream));
251     PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
252     {
253       const auto mimpl = MatIMPLCast(m);
254       const auto lda   = mimpl->lda;
255       const auto src   = mimpl->v;
256       const auto dest  = mcu->d_v;
257 
258       if (lda > nrows) {
259         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
260       } else {
261         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
262       }
263     }
264     PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
265     // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
266     m->offloadmask = PETSC_OFFLOAD_BOTH;
267   }
268   PetscFunctionReturn(PETSC_SUCCESS);
269 }
270 
271 template <device::cupm::DeviceType T>
DeviceToHost_(Mat m,PetscDeviceContext dctx)272 inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
273 {
274   const auto nrows = m->rmap->n;
275   const auto ncols = m->cmap->n;
276   const auto copy  = m->offloadmask == PETSC_OFFLOAD_GPU;
277 
278   PetscFunctionBegin;
279   PetscCheckTypeName(m, MATSEQDENSECUPM());
280   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
281   if (copy) {
282     const auto   mimpl = MatIMPLCast(m);
283     cupmStream_t stream;
284 
285     // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
286     if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
287     PetscCall(GetHandlesFrom_(dctx, &stream));
288     PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
289     {
290       const auto lda  = mimpl->lda;
291       const auto dest = mimpl->v;
292       const auto src  = MatCUPMCast(m)->d_v;
293 
294       if (lda > nrows) {
295         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
296       } else {
297         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
298       }
299     }
300     PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
301     // order is important, MatSeqDenseSetPreallocation() might set offloadmask
302     m->offloadmask = PETSC_OFFLOAD_BOTH;
303   }
304   PetscFunctionReturn(PETSC_SUCCESS);
305 }
306 
307 template <device::cupm::DeviceType T>
CheckCUPMSolverInfo_(const cupmBlasInt_t * fact_info,cupmStream_t stream)308 inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
309 {
310   PetscFunctionBegin;
311   if (PetscDefined(USE_DEBUG)) {
312     cupmBlasInt_t info = 0;
313 
314     PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
315     if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
316     static_assert(std::is_same<decltype(info), int>::value, "");
317     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
318     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
319   }
320   PetscFunctionReturn(PETSC_SUCCESS);
321 }
322 
323 // ==========================================================================================
324 // MatDense_Seq_CUPM - Private API - Solver Dispatch
325 // ==========================================================================================
326 
327 // specific solvers called through the dispatch_() family of functions
328 template <device::cupm::DeviceType T>
329 template <typename Derived>
330 struct MatDense_Seq_CUPM<T>::SolveCommon {
331   using derived_type = Derived;
332 
333   template <typename F>
ResizeFactLworkPetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveCommon334   static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
335   {
336     cupmBlasInt_t lwork;
337 
338     PetscFunctionBegin;
339     PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
340     if (lwork > mcu->d_fact_lwork) {
341       mcu->d_fact_lwork = lwork;
342       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
343       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
344     }
345     PetscFunctionReturn(PETSC_SUCCESS);
346   }
347 
FactorPreparePetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveCommon348   static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
349   {
350     const auto mcu = MatCUPMCast(A);
351 
352     PetscFunctionBegin;
353     PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
354     A->factortype             = derived_type::MATFACTORTYPE();
355     A->ops->solve             = MatSolve_Factored_Dispatch_<derived_type, false>;
356     A->ops->solvetranspose    = MatSolve_Factored_Dispatch_<derived_type, true>;
357     A->ops->matsolve          = MatMatSolve_Factored_Dispatch_<derived_type, false>;
358     A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;
359 
360     PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
361     if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
362     PetscFunctionReturn(PETSC_SUCCESS);
363   }
364 };
365 
366 template <device::cupm::DeviceType T>
367 struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
368   using base_type = SolveCommon<SolveLU>;
369 
NAMEPetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveLU370   static constexpr const char   *NAME() noexcept { return "LU"; }
MATFACTORTYPEPetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveLU371   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }
372 
FactorPetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveLU373   static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
374   {
375     const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
376     const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
377     cupmStream_t       stream;
378     cupmSolverHandle_t handle;
379     PetscDeviceContext dctx;
380 
381     PetscFunctionBegin;
382     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
383     PetscCall(GetHandles_(&dctx, &handle, &stream));
384     PetscCall(base_type::FactorPrepare(A, stream));
385     {
386       const auto mcu = MatCUPMCast(A);
387       const auto da  = DeviceArrayReadWrite(dctx, A);
388       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
389 
390       // clang-format off
391       PetscCall(
392         base_type::ResizeFactLwork(
393           mcu, stream,
394           [&](cupmBlasInt_t *fact_lwork)
395           {
396             return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
397           }
398         )
399       );
400       // clang-format on
401       if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
402 
403       PetscCall(PetscLogGpuTimeBegin());
404       PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
405       PetscCall(PetscLogGpuTimeEnd());
406       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
407     }
408     PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
409     PetscFunctionReturn(PETSC_SUCCESS);
410   }
411 
412   template <bool transpose>
SolvePetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveLU413   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
414   {
415     const auto         mcu       = MatCUPMCast(A);
416     const auto         fact_info = mcu->d_fact_info;
417     const auto         fact_ipiv = mcu->d_fact_ipiv;
418     cupmSolverHandle_t handle;
419 
420     PetscFunctionBegin;
421     PetscCall(GetHandlesFrom_(dctx, &handle));
422     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
423     PetscCall(PetscLogGpuTimeBegin());
424     {
425       constexpr auto op  = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
426       const auto     da  = DeviceArrayRead(dctx, A);
427       const auto     lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
428 
429       // clang-format off
430       PetscCall(
431         base_type::ResizeFactLwork(
432           mcu, stream,
433           [&](cupmBlasInt_t *lwork)
434           {
435             return cupmSolverXgetrs_bufferSize(
436               handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
437             );
438           }
439         )
440       );
441       // clang-format on
442       PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
443       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
444     }
445     PetscCall(PetscLogGpuTimeEnd());
446     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
447     PetscFunctionReturn(PETSC_SUCCESS);
448   }
449 };
450 
451 template <device::cupm::DeviceType T>
452 struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
453   using base_type = SolveCommon<SolveCholesky>;
454 
NAMEPetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveCholesky455   static constexpr const char   *NAME() noexcept { return "Cholesky"; }
MATFACTORTYPEPetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveCholesky456   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }
457 
FactorPetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveCholesky458   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
459   {
460     const auto         n = static_cast<cupmBlasInt_t>(A->rmap->n);
461     PetscDeviceContext dctx;
462     cupmSolverHandle_t handle;
463     cupmStream_t       stream;
464 
465     PetscFunctionBegin;
466     if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
467     PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
468     PetscCall(GetHandles_(&dctx, &handle, &stream));
469     PetscCall(base_type::FactorPrepare(A, stream));
470     {
471       const auto mcu = MatCUPMCast(A);
472       const auto da  = DeviceArrayReadWrite(dctx, A);
473       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
474 
475       // clang-format off
476       PetscCall(
477         base_type::ResizeFactLwork(
478           mcu, stream,
479           [&](cupmBlasInt_t *fact_lwork)
480           {
481             return cupmSolverXpotrf_bufferSize(
482               handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
483             );
484           }
485         )
486       );
487       // clang-format on
488       PetscCall(PetscLogGpuTimeBegin());
489       PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
490       PetscCall(PetscLogGpuTimeEnd());
491       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
492     }
493     PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
494 
495 #if 0
496     // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
497     // and *hetr* routines. The code below should work, and it can be activated when *sytrs
498     // routines will be available
499     if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
500     if (!mcu->d_fact_lwork) {
501       PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
502       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
503     }
504     if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
505     PetscCall(PetscLogGpuTimeBegin());
506     PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
507     PetscCall(PetscLogGpuTimeEnd());
508 #endif
509     PetscFunctionReturn(PETSC_SUCCESS);
510   }
511 
512   template <bool transpose>
SolvePetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveCholesky513   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
514   {
515     const auto         mcu       = MatCUPMCast(A);
516     const auto         fact_info = mcu->d_fact_info;
517     cupmSolverHandle_t handle;
518 
519     PetscFunctionBegin;
520     PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
521     PetscCall(GetHandlesFrom_(dctx, &handle));
522     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
523     PetscCall(PetscLogGpuTimeBegin());
524     {
525       const auto da  = DeviceArrayRead(dctx, A);
526       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
527 
528       // clang-format off
529       PetscCall(
530         base_type::ResizeFactLwork(
531           mcu, stream,
532           [&](cupmBlasInt_t *lwork)
533           {
534             return cupmSolverXpotrs_bufferSize(
535               handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
536             );
537           }
538         )
539       );
540       // clang-format on
541       PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
542       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
543     }
544     PetscCall(PetscLogGpuTimeEnd());
545     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
546     PetscFunctionReturn(PETSC_SUCCESS);
547   }
548 };
549 
550 template <device::cupm::DeviceType T>
551 struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
552   using base_type = SolveCommon<SolveQR>;
553 
NAMEPetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveQR554   static constexpr const char   *NAME() noexcept { return "QR"; }
MATFACTORTYPEPetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveQR555   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }
556 
FactorPetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveQR557   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
558   {
559     const auto         m     = static_cast<cupmBlasInt_t>(A->rmap->n);
560     const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
561     const auto         min   = std::min(m, n);
562     const auto         mimpl = MatIMPLCast(A);
563     cupmStream_t       stream;
564     cupmSolverHandle_t handle;
565     PetscDeviceContext dctx;
566 
567     PetscFunctionBegin;
568     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
569     PetscCall(GetHandles_(&dctx, &handle, &stream));
570     PetscCall(base_type::FactorPrepare(A, stream));
571     mimpl->rank = min;
572     {
573       const auto mcu = MatCUPMCast(A);
574       const auto da  = DeviceArrayReadWrite(dctx, A);
575       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
576 
577       if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
578       if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
579       // clang-format off
580       PetscCall(
581         base_type::ResizeFactLwork(
582           mcu, stream,
583           [&](cupmBlasInt_t *fact_lwork)
584           {
585             return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
586           }
587         )
588       );
589       // clang-format on
590       PetscCall(PetscLogGpuTimeBegin());
591       PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
592       PetscCall(PetscLogGpuTimeEnd());
593       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
594     }
595     PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
596     PetscFunctionReturn(PETSC_SUCCESS);
597   }
598 
599   template <bool transpose>
SolvePetsc::mat::cupm::impl::MatDense_Seq_CUPM::SolveQR600   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
601   {
602     const auto         mimpl      = MatIMPLCast(A);
603     const auto         rank       = static_cast<cupmBlasInt_t>(mimpl->rank);
604     const auto         mcu        = MatCUPMCast(A);
605     const auto         fact_info  = mcu->d_fact_info;
606     const auto         fact_tau   = mcu->d_fact_tau;
607     const auto         fact_work  = mcu->d_fact_work;
608     const auto         fact_lwork = mcu->d_fact_lwork;
609     cupmSolverHandle_t solver_handle;
610     cupmBlasHandle_t   blas_handle;
611 
612     PetscFunctionBegin;
613     PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
614     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
615     PetscCall(PetscLogGpuTimeBegin());
616     {
617       const auto da  = DeviceArrayRead(dctx, A);
618       const auto one = cupmScalarCast(1.0);
619       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
620 
621       if (transpose) {
622         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
623         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
624         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
625       } else {
626         constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;
627 
628         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
629         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
630         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
631       }
632     }
633     PetscCall(PetscLogGpuTimeEnd());
634     PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
635     PetscFunctionReturn(PETSC_SUCCESS);
636   }
637 };
638 
639 template <device::cupm::DeviceType T>
640 template <typename Solver, bool transpose>
MatSolve_Factored_Dispatch_(Mat A,Vec x,Vec y)641 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
642 {
643   using namespace vec::cupm;
644   const auto         pobj_A  = PetscObjectCast(A);
645   const auto         m       = static_cast<cupmBlasInt_t>(A->rmap->n);
646   const auto         k       = static_cast<cupmBlasInt_t>(A->cmap->n);
647   auto              &workvec = MatCUPMCast(A)->workvec;
648   PetscScalar       *y_array = nullptr;
649   PetscDeviceContext dctx;
650   PetscBool          xiscupm, yiscupm, aiscupm;
651   bool               use_y_array_directly;
652   cupmStream_t       stream;
653 
654   PetscFunctionBegin;
655   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
656   PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
657   PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
658   PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
659   PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
660   PetscCall(GetHandles_(&dctx, &stream));
661   use_y_array_directly = yiscupm && (k >= m);
662   {
663     const PetscScalar *x_array;
664     const auto         xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
665     const auto         copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
666 
667     if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
668     // The logic here is to try to minimize the amount of memory copying:
669     //
670     // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
671     // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
672     // data in order to copy it into the y array. So the array x will be wherever the data
673     // already is so that only one memcpy is performed
674     if (xisdevice) {
675       PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
676     } else {
677       PetscCall(VecGetArrayRead(x, &x_array));
678     }
679     PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
680     PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
681     if (xisdevice) {
682       PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
683     } else {
684       PetscCall(VecRestoreArrayRead(x, &x_array));
685     }
686   }
687 
688   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
689   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
690   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
691 
692   if (use_y_array_directly) {
693     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
694   } else {
695     const auto   copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
696     PetscScalar *yv;
697 
698     // The logic here is that the data is not yet in either y's GPU array or its CPU array.
699     // There is nothing in the interface to say where the user would like it to end up. So we
700     // choose the GPU, because it is the faster option
701     if (yiscupm) {
702       PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
703     } else {
704       PetscCall(VecGetArray(y, &yv));
705     }
706     PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
707     if (yiscupm) {
708       PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
709     } else {
710       PetscCall(VecRestoreArray(y, &yv));
711     }
712     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
713   }
714   PetscFunctionReturn(PETSC_SUCCESS);
715 }
716 
717 template <device::cupm::DeviceType T>
718 template <typename Solver, bool transpose>
MatMatSolve_Factored_Dispatch_(Mat A,Mat B,Mat X)719 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
720 {
721   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
722   const auto         k = static_cast<cupmBlasInt_t>(A->cmap->n);
723   cupmBlasInt_t      nrhs, ldb, ldx, ldy;
724   PetscScalar       *y;
725   PetscBool          biscupm, xiscupm, aiscupm;
726   PetscDeviceContext dctx;
727   cupmStream_t       stream;
728 
729   PetscFunctionBegin;
730   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
731   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
732   PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
733   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
734   PetscCall(GetHandles_(&dctx, &stream));
735   {
736     PetscInt n;
737 
738     PetscCall(MatGetSize(B, nullptr, &n));
739     PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
740     PetscCall(MatDenseGetLDA(B, &n));
741     PetscCall(PetscCUPMBlasIntCast(n, &ldb));
742     PetscCall(MatDenseGetLDA(X, &n));
743     PetscCall(PetscCUPMBlasIntCast(n, &ldx));
744   }
745   {
746     // The logic here is to try to minimize the amount of memory copying:
747     //
748     // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
749     // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
750     // get the data in order to copy it into the y array. So the array b will be wherever the
751     // data already is so that only one memcpy is performed
752     const auto         bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
753     const auto         copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
754     const PetscScalar *b;
755 
756     if (bisdevice) {
757       b = DeviceArrayRead(dctx, B);
758     } else if (biscupm) {
759       b = HostArrayRead(dctx, B);
760     } else {
761       PetscCall(MatDenseGetArrayRead(B, &b));
762     }
763 
764     if (ldx < m || !xiscupm) {
765       // X's array cannot serve as the array (too small or not on device), B's array cannot
766       // serve as the array (const), so allocate a new array
767       ldy = m;
768       PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
769     } else {
770       // X's array should serve as the array
771       ldy = ldx;
772       y   = DeviceArrayWrite(dctx, X);
773     }
774     PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
775     if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
776   }
777 
778   // convert to CUPM twice??????????????????????????????????
779   // but A should already be CUPM??????????????????????????????????????
780   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
781   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
782   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
783 
784   if (ldx < m || !xiscupm) {
785     const auto   copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
786     PetscScalar *x;
787 
788     // The logic here is that the data is not yet in either X's GPU array or its CPU
789     // array. There is nothing in the interface to say where the user would like it to end up.
790     // So we choose the GPU, because it is the faster option
791     if (xiscupm) {
792       x = DeviceArrayWrite(dctx, X);
793     } else {
794       PetscCall(MatDenseGetArray(X, &x));
795     }
796     PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
797     if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
798     PetscCallCUPM(cupmFreeAsync(y, stream));
799   }
800   PetscFunctionReturn(PETSC_SUCCESS);
801 }
802 
803 template <device::cupm::DeviceType T>
804 template <bool transpose, bool hermitian>
MatMultAddColumnRange_Dispatch_(Mat A,Vec xx,Vec yy,Vec zz,PetscInt c_start,PetscInt c_end)805 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAddColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end) noexcept
806 {
807   const auto         m   = static_cast<cupmBlasInt_t>(A->rmap->n);
808   const auto         n   = static_cast<cupmBlasInt_t>(c_end - c_start);
809   const auto         lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
810   PetscBool          xiscupm, yiscupm, ziscupm;
811   cupmBlasHandle_t   handle;
812   Vec                x = xx, y = yy, z = zz;
813   PetscDeviceContext dctx;
814 
815   PetscFunctionBegin;
816   PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(xx), &xiscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
817   if (!xiscupm || xx->boundtocpu) {
818     PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(xx)), &x));
819     PetscCall(VecSetLayout(x, xx->map));
820     PetscCall(VecSetType(x, VecSeq_CUPM::VECCUPM()));
821     PetscCall(VecCopy(xx, x));
822   }
823 
824   if (yy) {
825     PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(yy), &yiscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
826     if (!yiscupm || yy->boundtocpu) {
827       PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(yy)), &y));
828       PetscCall(VecSetLayout(y, yy->map));
829       PetscCall(VecSetType(y, VecSeq_CUPM::VECCUPM()));
830       PetscCall(VecCopy(yy, y));
831     }
832   }
833 
834   if (zz != yy) {
835     PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(zz), &ziscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
836     if (!ziscupm || zz->boundtocpu) {
837       PetscCall(VecCreate(PetscObjectComm(PetscObjectCast(zz)), &z));
838       PetscCall(VecSetLayout(z, zz->map));
839       PetscCall(VecSetType(z, VecSeq_CUPM::VECCUPM()));
840     }
841   } else {
842     z = y;
843   }
844 
845   if (y && y != z) PetscCall(VecSeq_CUPM::Copy(y, z)); // mult add
846   if (!m || !n) {
847     // mult only
848     if (!y) PetscCall(VecSeq_CUPM::Set(z, 0.0));
849     PetscFunctionReturn(PETSC_SUCCESS);
850   }
851   PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
852   PetscCall(GetHandles_(&dctx, &handle));
853   {
854     constexpr auto op   = transpose ? (hermitian ? CUPMBLAS_OP_C : CUPMBLAS_OP_T) : CUPMBLAS_OP_N;
855     const auto     one  = cupmScalarCast(1.0);
856     const auto     zero = cupmScalarCast(0.0);
857     const auto     da   = DeviceArrayRead(dctx, A);
858     const auto     dxx  = VecSeq_CUPM::DeviceArrayRead(dctx, x);
859     const auto     dzz  = VecSeq_CUPM::DeviceArrayReadWrite(dctx, z);
860 
861     PetscCall(PetscLogGpuTimeBegin());
862     PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata() + c_start * lda, lda, dxx.cupmdata() + (transpose ? 0 : c_start), 1, y ? &one : &zero, dzz.cupmdata() + (transpose ? c_start : 0), 1));
863     PetscCall(PetscLogGpuTimeEnd());
864   }
865   PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
866   if (z != zz) {
867     PetscCall(VecCopy(z, zz));
868     if (z != y) PetscCall(VecDestroy(&z));
869   }
870   if (y != yy) PetscCall(VecDestroy(&y));
871   if (x != xx) PetscCall(VecDestroy(&x));
872   PetscFunctionReturn(PETSC_SUCCESS);
873 }
874 
875 template <device::cupm::DeviceType T>
876 template <bool transpose, bool hermitian>
MatMultColumnRange_Dispatch_(Mat A,Vec xx,Vec yy,PetscInt c_start,PetscInt c_end)877 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end) noexcept
878 {
879   PetscFunctionBegin;
880   PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, nullptr, yy, c_start, c_end));
881   PetscFunctionReturn(PETSC_SUCCESS);
882 }
883 
884 template <device::cupm::DeviceType T>
885 template <bool transpose, bool hermitian>
MatMultAdd_Dispatch_(Mat A,Vec xx,Vec yy,Vec zz)886 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
887 {
888   PetscFunctionBegin;
889   PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, yy, zz, 0, A->cmap->n));
890   PetscFunctionReturn(PETSC_SUCCESS);
891 }
892 
893 // ==========================================================================================
894 // MatDense_Seq_CUPM - Private API - Conversion Dispatch
895 // ==========================================================================================
896 
897 template <device::cupm::DeviceType T>
898 template <bool to_host>
Convert_Dispatch_(Mat M,MatType type,MatReuse reuse,Mat * newmat)899 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
900 {
901   PetscFunctionBegin;
902   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
903     // TODO these cases should be optimized
904     PetscCall(MatConvert_Basic(M, type, reuse, newmat));
905   } else {
906     const auto B    = *newmat;
907     const auto pobj = PetscObjectCast(B);
908 
909     if (to_host) {
910       PetscCall(BindToCPU(B, PETSC_TRUE));
911       PetscCall(Reset(B));
912     } else {
913       PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
914     }
915 
916     PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
917     PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
918     // cvec might be the wrong VecType, destroy and rebuild it if necessary
919     // REVIEW ME: this is possibly very inefficient
920     PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));
921 
922     MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
923     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
924     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
925     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
926     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
927     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
928     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
929     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
930     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
931     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
932     MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
933     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);
934 
935     if (to_host) {
936       B->offloadmask = PETSC_OFFLOAD_CPU;
937     } else {
938       Mat_SeqDenseCUPM *mcu;
939 
940       PetscCall(PetscNew(&mcu));
941       B->spptr       = mcu;
942       B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
943       PetscCall(BindToCPU(B, PETSC_FALSE));
944     }
945 
946     MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
947     MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
948   }
949   PetscFunctionReturn(PETSC_SUCCESS);
950 }
951 
952 // ==========================================================================================
953 // MatDense_Seq_CUPM - Public API
954 // ==========================================================================================
955 
956 template <device::cupm::DeviceType T>
MATIMPLCUPM_()957 inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
958 {
959   return MATSEQDENSECUPM();
960 }
961 
962 template <device::cupm::DeviceType T>
MatCUPMCast(Mat m)963 inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
964 {
965   return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
966 }
967 
968 template <device::cupm::DeviceType T>
MatIMPLCast_(Mat m)969 inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
970 {
971   return static_cast<Mat_SeqDense *>(m->data);
972 }
973 
974 template <device::cupm::DeviceType T>
MatConvert_seqdensecupm_seqdense_C()975 inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
976 {
977   return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
978 }
979 
980 template <device::cupm::DeviceType T>
MatProductSetFromOptions_seqaij_seqdensecupm_C()981 inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
982 {
983   return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
984 }
985 
986 // ==========================================================================================
987 
988 // MatCreate_SeqDenseCUPM()
989 template <device::cupm::DeviceType T>
Create(Mat A)990 inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
991 {
992   PetscFunctionBegin;
993   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
994   PetscCall(MatCreate_SeqDense(A));
995   PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
996   PetscFunctionReturn(PETSC_SUCCESS);
997 }
998 
999 template <device::cupm::DeviceType T>
Destroy(Mat A)1000 inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
1001 {
1002   PetscFunctionBegin;
1003   // prevent copying back data if we own the data pointer
1004   if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
1005   PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1006   PetscCall(MatDestroy_SeqDense(A));
1007   PetscFunctionReturn(PETSC_SUCCESS);
1008 }
1009 
1010 // obj->ops->setup()
1011 template <device::cupm::DeviceType T>
SetUp(Mat A)1012 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
1013 {
1014   PetscFunctionBegin;
1015   PetscCall(PetscLayoutSetUp(A->rmap));
1016   PetscCall(PetscLayoutSetUp(A->cmap));
1017   if (!A->preallocated) {
1018     PetscDeviceContext dctx;
1019 
1020     PetscCall(GetHandles_(&dctx));
1021     PetscCall(SetPreallocation(A, dctx, nullptr));
1022   }
1023   PetscFunctionReturn(PETSC_SUCCESS);
1024 }
1025 
1026 template <device::cupm::DeviceType T>
Reset(Mat A)1027 inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
1028 {
1029   PetscFunctionBegin;
1030   if (const auto mcu = MatCUPMCast(A)) {
1031     cupmStream_t stream;
1032 
1033     PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1034     PetscCall(GetHandles_(&stream));
1035     if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1036     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
1037     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
1038     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
1039     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1040     PetscCall(VecDestroy(&mcu->workvec));
1041     PetscCall(PetscFree(A->spptr /* mcu */));
1042   }
1043   PetscFunctionReturn(PETSC_SUCCESS);
1044 }
1045 
1046 // ==========================================================================================
1047 
1048 template <device::cupm::DeviceType T>
BindToCPU(Mat A,PetscBool to_host)1049 inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
1050 {
1051   const auto mimpl = MatIMPLCast(A);
1052   const auto pobj  = PetscObjectCast(A);
1053 
1054   PetscFunctionBegin;
1055   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1056   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1057   A->boundtocpu = to_host;
1058   PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
1059   if (to_host) {
1060     PetscDeviceContext dctx;
1061 
1062     // make sure we have an up-to-date copy on the CPU
1063     PetscCall(GetHandles_(&dctx));
1064     PetscCall(DeviceToHost_(A, dctx));
1065   } else {
1066     PetscBool iscupm;
1067 
1068     if (auto &cvec = mimpl->cvec) {
1069       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
1070       if (!iscupm) PetscCall(VecDestroy(&cvec));
1071     }
1072     if (auto &cmat = mimpl->cmat) {
1073       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
1074       if (!iscupm) PetscCall(MatDestroy(&cmat));
1075     }
1076   }
1077 
1078   // ============================================================
1079   // Composed ops
1080   // ============================================================
1081   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1082   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
1083   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1084   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1085   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1086   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1087   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1088   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1089   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1090   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1091   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1092   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
1093   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
1094   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1095   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1096   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
1097   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
1098   MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
1099   MatComposeOp_CUPM(to_host, pobj, "MatMultColumnRange_C", MatMultColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>);
1100   MatComposeOp_CUPM(to_host, pobj, "MatMultAddColumnRange_C", MatMultAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>);
1101   MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeColumnRange_C", MatMultHermitianTransposeColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
1102   MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeAddColumnRange_C", MatMultHermitianTransposeAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
1103   // always the same
1104   PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));
1105 
1106   // ============================================================
1107   // Function pointer ops
1108   // ============================================================
1109   MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
1110   MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>(A, xx, nullptr, yy); });
1111   MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>(A, xx, nullptr, yy); });
1112   MatSetOp_CUPM(to_host, A, multhermitiantranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>(A, xx, nullptr, yy); });
1113   MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>);
1114   MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>);
1115   MatSetOp_CUPM(to_host, A, multhermitiantransposeadd, MatMultHermitianTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>);
1116   MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
1117   MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
1118   MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
1119   MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
1120   MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
1121   MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
1122   MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
1123   MatSetOp_CUPM(to_host, A, conjugate, MatConjugate_SeqDense, Conjugate);
1124   MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
1125   MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
1126   MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
1127   MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
1128   MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
1129   MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
1130   MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal);
1131   // seemingly always the same
1132   A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1133 
1134   if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
1135   PetscFunctionReturn(PETSC_SUCCESS);
1136 }
1137 
1138 template <device::cupm::DeviceType T>
Convert_SeqDenseCUPM_SeqDense(Mat M,MatType type,MatReuse reuse,Mat * newmat)1139 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1140 {
1141   PetscFunctionBegin;
1142   PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
1143   PetscFunctionReturn(PETSC_SUCCESS);
1144 }
1145 
1146 template <device::cupm::DeviceType T>
Convert_SeqDense_SeqDenseCUPM(Mat M,MatType type,MatReuse reuse,Mat * newmat)1147 inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1148 {
1149   PetscFunctionBegin;
1150   PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
1151   PetscFunctionReturn(PETSC_SUCCESS);
1152 }
1153 
1154 // ==========================================================================================
1155 
1156 template <device::cupm::DeviceType T>
1157 template <PetscMemType mtype, PetscMemoryAccessMode access>
GetArray(Mat m,PetscScalar ** array,PetscDeviceContext dctx)1158 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1159 {
1160   constexpr auto hostmem     = PetscMemTypeHost(mtype);
1161   constexpr auto read_access = PetscMemoryAccessRead(access);
1162 
1163   PetscFunctionBegin;
1164   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1165   PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
1166   if (hostmem) {
1167     if (read_access) {
1168       PetscCall(DeviceToHost_(m, dctx));
1169     } else if (!MatIMPLCast(m)->v) {
1170       // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
1171       PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
1172     }
1173     *array = MatIMPLCast(m)->v;
1174   } else {
1175     if (read_access) {
1176       PetscCall(HostToDevice_(m, dctx));
1177     } else if (!MatCUPMCast(m)->d_v) {
1178       // write-only
1179       PetscCall(SetPreallocation(m, dctx, nullptr));
1180     }
1181     *array = MatCUPMCast(m)->d_v;
1182   }
1183   if (PetscMemoryAccessWrite(access)) {
1184     m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1185     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1186   }
1187   PetscFunctionReturn(PETSC_SUCCESS);
1188 }
1189 
1190 template <device::cupm::DeviceType T>
1191 template <PetscMemType mtype, PetscMemoryAccessMode access>
RestoreArray(Mat m,PetscScalar ** array,PetscDeviceContext)1192 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
1193 {
1194   PetscFunctionBegin;
1195   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1196   if (PetscMemoryAccessWrite(access)) {
1197     // WRITE or READ_WRITE
1198     m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1199     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1200   }
1201   if (array) {
1202     PetscCall(CheckPointerMatchesMemType_(*array, mtype));
1203     *array = nullptr;
1204   }
1205   PetscFunctionReturn(PETSC_SUCCESS);
1206 }
1207 
1208 template <device::cupm::DeviceType T>
1209 template <PetscMemoryAccessMode access>
GetArrayAndMemType(Mat m,PetscScalar ** array,PetscMemType * mtype,PetscDeviceContext dctx)1210 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
1211 {
1212   PetscFunctionBegin;
1213   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1214   if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
1215   PetscFunctionReturn(PETSC_SUCCESS);
1216 }
1217 
1218 template <device::cupm::DeviceType T>
1219 template <PetscMemoryAccessMode access>
RestoreArrayAndMemType(Mat m,PetscScalar ** array,PetscDeviceContext dctx)1220 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1221 {
1222   PetscFunctionBegin;
1223   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1224   PetscFunctionReturn(PETSC_SUCCESS);
1225 }
1226 
1227 // ==========================================================================================
1228 
1229 template <device::cupm::DeviceType T>
PlaceArray(Mat A,const PetscScalar * array)1230 inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
1231 {
1232   const auto mimpl = MatIMPLCast(A);
1233   const auto mcu   = MatCUPMCast(A);
1234 
1235   PetscFunctionBegin;
1236   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1237   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1238   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1239   if (mimpl->v) {
1240     PetscDeviceContext dctx;
1241 
1242     PetscCall(GetHandles_(&dctx));
1243     PetscCall(HostToDevice_(A, dctx));
1244   }
1245   mcu->unplacedarray         = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
1246   mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
1247   PetscFunctionReturn(PETSC_SUCCESS);
1248 }
1249 
1250 template <device::cupm::DeviceType T>
ReplaceArray(Mat A,const PetscScalar * array)1251 inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
1252 {
1253   const auto mimpl = MatIMPLCast(A);
1254   const auto mcu   = MatCUPMCast(A);
1255 
1256   PetscFunctionBegin;
1257   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1258   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1259   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1260   if (!mcu->d_user_alloc) {
1261     cupmStream_t stream;
1262 
1263     PetscCall(GetHandles_(&stream));
1264     PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1265   }
1266   mcu->d_v          = const_cast<PetscScalar *>(array);
1267   mcu->d_user_alloc = PETSC_FALSE;
1268   PetscFunctionReturn(PETSC_SUCCESS);
1269 }
1270 
1271 template <device::cupm::DeviceType T>
ResetArray(Mat A)1272 inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
1273 {
1274   const auto mimpl = MatIMPLCast(A);
1275   const auto mcu   = MatCUPMCast(A);
1276 
1277   PetscFunctionBegin;
1278   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1279   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1280   if (mimpl->v) {
1281     PetscDeviceContext dctx;
1282 
1283     PetscCall(GetHandles_(&dctx));
1284     PetscCall(HostToDevice_(A, dctx));
1285   }
1286   mcu->d_v          = util::exchange(mcu->unplacedarray, nullptr);
1287   mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
1288   PetscFunctionReturn(PETSC_SUCCESS);
1289 }
1290 
1291 // ==========================================================================================
1292 
1293 template <device::cupm::DeviceType T>
1294 template <bool transpose_A, bool transpose_B>
MatMatMult_Numeric_Dispatch(Mat A,Mat B,Mat C)1295 inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
1296 {
1297   cupmBlasInt_t      m, n, k;
1298   PetscBool          Aiscupm, Biscupm;
1299   PetscDeviceContext dctx;
1300   cupmBlasHandle_t   handle;
1301 
1302   PetscFunctionBegin;
1303   PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
1304   PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
1305   PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
1306   if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);
1307 
1308   // we may end up with SEQDENSE as one of the arguments
1309   // REVIEW ME: how? and why is it not B and C????????
1310   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
1311   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
1312   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
1313   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
1314   PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
1315   PetscCall(GetHandles_(&dctx, &handle));
1316 
1317   PetscCall(PetscLogGpuTimeBegin());
1318   {
1319     const auto one  = cupmScalarCast(1.0);
1320     const auto zero = cupmScalarCast(0.0);
1321     const auto da   = DeviceArrayRead(dctx, A);
1322     const auto db   = DeviceArrayRead(dctx, B);
1323     const auto dc   = DeviceArrayWrite(dctx, C);
1324     PetscInt   alda, blda, clda;
1325 
1326     PetscCall(MatDenseGetLDA(A, &alda));
1327     PetscCall(MatDenseGetLDA(B, &blda));
1328     PetscCall(MatDenseGetLDA(C, &clda));
1329     PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
1330   }
1331   PetscCall(PetscLogGpuTimeEnd());
1332 
1333   PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
1334   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1335   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1336   PetscFunctionReturn(PETSC_SUCCESS);
1337 }
1338 
1339 template <device::cupm::DeviceType T>
Copy(Mat A,Mat B,MatStructure str)1340 inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
1341 {
1342   const auto m = A->rmap->n;
1343   const auto n = A->cmap->n;
1344 
1345   PetscFunctionBegin;
1346   PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
1347   // The two matrices must have the same copy implementation to be eligible for fast copy
1348   if (A->ops->copy == B->ops->copy) {
1349     PetscDeviceContext dctx;
1350     cupmStream_t       stream;
1351 
1352     PetscCall(GetHandles_(&dctx, &stream));
1353     PetscCall(PetscLogGpuTimeBegin());
1354     {
1355       const auto va = DeviceArrayRead(dctx, A);
1356       const auto vb = DeviceArrayWrite(dctx, B);
1357       // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
1358       // lda!
1359       const auto lda_a = MatIMPLCast(A)->lda;
1360       const auto lda_b = MatIMPLCast(B)->lda;
1361 
1362       if (lda_a > m || lda_b > m) {
1363         PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
1364         PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
1365         PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
1366       } else {
1367         PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
1368       }
1369     }
1370     PetscCall(PetscLogGpuTimeEnd());
1371   } else {
1372     PetscCall(MatCopy_Basic(A, B, str));
1373   }
1374   PetscFunctionReturn(PETSC_SUCCESS);
1375 }
1376 
1377 template <device::cupm::DeviceType T>
ZeroEntries(Mat m)1378 inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1379 {
1380   PetscDeviceContext dctx;
1381   cupmStream_t       stream;
1382 
1383   PetscFunctionBegin;
1384   PetscCall(GetHandles_(&dctx, &stream));
1385   PetscCall(PetscLogGpuTimeBegin());
1386   {
1387     const auto va  = DeviceArrayWrite(dctx, m);
1388     const auto lda = MatIMPLCast(m)->lda;
1389     const auto ma  = m->rmap->n;
1390     const auto na  = m->cmap->n;
1391 
1392     if (lda > ma) {
1393       PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1394     } else {
1395       PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1396     }
1397   }
1398   PetscCall(PetscLogGpuTimeEnd());
1399   PetscFunctionReturn(PETSC_SUCCESS);
1400 }
1401 
1402 namespace detail
1403 {
1404 
1405 // ==========================================================================================
1406 // SubMatIndexFunctor
1407 //
1408 // Iterator which permutes a linear index range into matrix indices for an nrows x ncols
1409 // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1410 // the i'th sequential entry in the matrix.
1411 // ==========================================================================================
1412 template <typename T>
1413 struct SubMatIndexFunctor {
operator ()Petsc::mat::cupm::impl::detail::SubMatIndexFunctor1414   PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }
1415 
1416   PetscInt nrows;
1417   PetscInt ncols;
1418   PetscInt lda;
1419 };
1420 
1421 template <typename Iterator>
1422 struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<iter_difference_t<Iterator>>> {
1423   using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<iter_difference_t<Iterator>>>;
1424 
1425   using iterator = typename base_type::iterator;
1426 
SubMatrixIteratorPetsc::mat::cupm::impl::detail::SubMatrixIterator1427   constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
1428     base_type{
1429       std::move(first), std::move(last), {nrows, ncols, lda}
1430   }
1431   {
1432   }
1433 
endPetsc::mat::cupm::impl::detail::SubMatrixIterator1434   PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
1435 };
1436 
1437 namespace
1438 {
1439 
1440 template <typename T>
make_submat_iterator(PetscInt rstart,PetscInt rend,PetscInt cstart,PetscInt cend,PetscInt lda,T * ptr)1441 PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
1442 {
1443   const auto nrows = rend - rstart;
1444   const auto ncols = cend - cstart;
1445   const auto dptr  = thrust::device_pointer_cast(ptr);
1446 
1447   return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
1448 }
1449 
1450 } // namespace
1451 
1452 struct conjugate {
operator ()Petsc::mat::cupm::impl::detail::conjugate1453   PETSC_NODISCARD PETSC_HOSTDEVICE_INLINE_DECL PetscScalar operator()(const PetscScalar &x) const noexcept { return PetscConj(x); }
1454 };
1455 
1456 } // namespace detail
1457 
1458 template <device::cupm::DeviceType T>
Conjugate(Mat A)1459 inline PetscErrorCode MatDense_Seq_CUPM<T>::Conjugate(Mat A) noexcept
1460 {
1461   const auto         m = A->rmap->n;
1462   const auto         n = A->cmap->n;
1463   const auto         N = m * n;
1464   PetscDeviceContext dctx;
1465   cupmStream_t       stream;
1466 
1467   PetscFunctionBegin;
1468   if (PetscDefined(USE_COMPLEX)) {
1469     PetscCall(GetHandles_(&dctx, &stream));
1470     PetscCall(PetscLogGpuTimeBegin());
1471     {
1472       const auto   da  = DeviceArrayReadWrite(dctx, A);
1473       const auto   lda = MatIMPLCast(A)->lda;
1474       cupmStream_t stream;
1475       PetscCall(GetHandlesFrom_(dctx, &stream));
1476 
1477       if (lda > m) {
1478         // clang-format off
1479         PetscCallThrust(
1480           const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());
1481 
1482           THRUST_CALL(
1483             thrust::transform,
1484             stream,
1485             sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1486             detail::conjugate{}
1487           )
1488         );
1489         // clang-format on
1490       } else {
1491         // clang-format off
1492         PetscCallThrust(
1493           const auto aptr = thrust::device_pointer_cast(da.data());
1494 
1495           THRUST_CALL(
1496             thrust::transform,
1497             stream,
1498             aptr, aptr + N, aptr,
1499             detail::conjugate{}
1500           )
1501         );
1502         // clang-format on
1503       }
1504     }
1505     PetscCall(PetscLogGpuTimeEnd());
1506   }
1507   PetscFunctionReturn(PETSC_SUCCESS);
1508 }
1509 
1510 template <device::cupm::DeviceType T>
Scale(Mat A,PetscScalar alpha)1511 inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
1512 {
1513   const auto         m = A->rmap->n;
1514   const auto         n = A->cmap->n;
1515   const auto         N = m * n;
1516   PetscDeviceContext dctx;
1517 
1518   PetscFunctionBegin;
1519   PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1520   PetscCall(GetHandles_(&dctx));
1521   {
1522     const auto da  = DeviceArrayReadWrite(dctx, A);
1523     const auto lda = MatIMPLCast(A)->lda;
1524 
1525     if (lda > m) {
1526       cupmStream_t stream;
1527 
1528       PetscCall(GetHandlesFrom_(dctx, &stream));
1529       // clang-format off
1530       PetscCallThrust(
1531         const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());
1532 
1533         THRUST_CALL(
1534           thrust::transform,
1535           stream,
1536           sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1537           device::cupm::functors::make_times_equals(alpha)
1538         )
1539       );
1540       // clang-format on
1541     } else {
1542       const auto       cu_alpha = cupmScalarCast(alpha);
1543       cupmBlasHandle_t handle;
1544 
1545       PetscCall(GetHandlesFrom_(dctx, &handle));
1546       PetscCall(PetscLogGpuTimeBegin());
1547       PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
1548       PetscCall(PetscLogGpuTimeEnd());
1549     }
1550   }
1551   PetscCall(PetscLogGpuFlops(N));
1552   PetscFunctionReturn(PETSC_SUCCESS);
1553 }
1554 
1555 template <device::cupm::DeviceType T>
AXPY(Mat Y,PetscScalar alpha,Mat X,MatStructure)1556 inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
1557 {
1558   const auto         m_x = X->rmap->n, m_y = Y->rmap->n;
1559   const auto         n_x = X->cmap->n, n_y = Y->cmap->n;
1560   const auto         N = m_x * n_x;
1561   PetscDeviceContext dctx;
1562 
1563   PetscFunctionBegin;
1564   if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
1565   PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
1566   PetscCall(GetHandles_(&dctx));
1567   {
1568     const auto dx    = DeviceArrayRead(dctx, X);
1569     const auto dy    = DeviceArrayReadWrite(dctx, Y);
1570     const auto lda_x = MatIMPLCast(X)->lda;
1571     const auto lda_y = MatIMPLCast(Y)->lda;
1572 
1573     if (lda_x > m_x || lda_y > m_x) {
1574       cupmStream_t stream;
1575 
1576       PetscCall(GetHandlesFrom_(dctx, &stream));
1577       // clang-format off
1578       PetscCallThrust(
1579         const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
1580         const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());
1581 
1582         THRUST_CALL(
1583           thrust::transform,
1584           stream,
1585           sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
1586           device::cupm::functors::make_axpy(alpha)
1587         );
1588       );
1589       // clang-format on
1590     } else {
1591       const auto       cu_alpha = cupmScalarCast(alpha);
1592       cupmBlasHandle_t handle;
1593 
1594       PetscCall(GetHandlesFrom_(dctx, &handle));
1595       PetscCall(PetscLogGpuTimeBegin());
1596       PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
1597       PetscCall(PetscLogGpuTimeEnd());
1598     }
1599   }
1600   PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
1601   PetscFunctionReturn(PETSC_SUCCESS);
1602 }
1603 
1604 template <device::cupm::DeviceType T>
Duplicate(Mat A,MatDuplicateOption opt,Mat * B)1605 inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
1606 {
1607   const auto         hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
1608   PetscDeviceContext dctx;
1609 
1610   PetscFunctionBegin;
1611   PetscCall(GetHandles_(&dctx));
1612   // do not call SetPreallocation() yet, we call it afterwards??
1613   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
1614   PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
1615   if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
1616   // allocate memory if needed
1617   if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
1618   PetscFunctionReturn(PETSC_SUCCESS);
1619 }
1620 
1621 template <device::cupm::DeviceType T>
SetRandom(Mat A,PetscRandom rng)1622 inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
1623 {
1624   PetscBool device_rand_is_rander48;
1625   PetscBool device = PETSC_FALSE;
1626 
1627   PetscFunctionBegin;
1628   // CUPMObject<hip>::PETSCDEVICERAD() is PETSCRANDER48 until PetscRandom is implemented for hiprand
1629   PetscCall(PetscStrncmp(PETSCDEVICERAND(), PETSCRANDER48, sizeof(PETSCRANDER48), &device_rand_is_rander48));
1630   if (!device_rand_is_rander48) PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
1631   if (device) {
1632     const auto         m = A->rmap->n;
1633     const auto         n = A->cmap->n;
1634     PetscDeviceContext dctx;
1635 
1636     PetscCall(GetHandles_(&dctx));
1637     {
1638       const auto a = DeviceArrayWrite(dctx, A);
1639       PetscInt   lda;
1640 
1641       PetscCall(MatDenseGetLDA(A, &lda));
1642       if (lda > m) {
1643         for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
1644       } else {
1645         PetscInt mn;
1646 
1647         PetscCall(PetscIntMultError(m, n, &mn));
1648         PetscCall(PetscRandomGetValues(rng, mn, a));
1649       }
1650     }
1651   } else {
1652     PetscCall(MatSetRandom_SeqDense(A, rng));
1653   }
1654   PetscFunctionReturn(PETSC_SUCCESS);
1655 }
1656 
1657 // ==========================================================================================
1658 
1659 template <device::cupm::DeviceType T>
GetColumnVector(Mat A,Vec v,PetscInt col)1660 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
1661 {
1662   const auto         offloadmask = A->offloadmask;
1663   const auto         n           = A->rmap->n;
1664   const auto         col_offset  = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
1665   PetscBool          viscupm;
1666   PetscDeviceContext dctx;
1667   cupmStream_t       stream;
1668 
1669   PetscFunctionBegin;
1670   PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
1671   PetscCall(GetHandles_(&dctx, &stream));
1672   if (viscupm && !v->boundtocpu) {
1673     const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);
1674 
1675     // update device data
1676     if (PetscOffloadDevice(offloadmask)) {
1677       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
1678     } else {
1679       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
1680     }
1681   } else {
1682     PetscScalar *x;
1683 
1684     // update host data
1685     PetscCall(VecGetArrayWrite(v, &x));
1686     if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
1687       PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
1688     } else if (PetscOffloadDevice(offloadmask)) {
1689       PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
1690     }
1691     PetscCall(VecRestoreArrayWrite(v, &x));
1692   }
1693   PetscFunctionReturn(PETSC_SUCCESS);
1694 }
1695 
1696 template <device::cupm::DeviceType T>
1697 template <PetscMemoryAccessMode access>
GetColumnVec(Mat A,PetscInt col,Vec * v)1698 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
1699 {
1700   using namespace vec::cupm;
1701   const auto         mimpl = MatIMPLCast(A);
1702   PetscDeviceContext dctx;
1703 
1704   PetscFunctionBegin;
1705   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1706   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1707   mimpl->vecinuse = col + 1;
1708   if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
1709   PetscCall(GetHandles_(&dctx));
1710   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1711   PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
1712   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
1713   *v = mimpl->cvec;
1714   PetscFunctionReturn(PETSC_SUCCESS);
1715 }
1716 
1717 template <device::cupm::DeviceType T>
1718 template <PetscMemoryAccessMode access>
RestoreColumnVec(Mat A,PetscInt,Vec * v)1719 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
1720 {
1721   using namespace vec::cupm;
1722   const auto         mimpl = MatIMPLCast(A);
1723   const auto         cvec  = mimpl->cvec;
1724   PetscDeviceContext dctx;
1725 
1726   PetscFunctionBegin;
1727   PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1728   PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1729   mimpl->vecinuse = 0;
1730   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
1731   PetscCall(VecCUPMResetArrayAsync<T>(cvec));
1732   PetscCall(GetHandles_(&dctx));
1733   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1734   if (v) *v = nullptr;
1735   PetscFunctionReturn(PETSC_SUCCESS);
1736 }
1737 
1738 // ==========================================================================================
1739 
1740 template <device::cupm::DeviceType T>
GetFactor(Mat A,MatFactorType ftype,Mat * fact_out)1741 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
1742 {
1743   Mat                fact = nullptr;
1744   PetscDeviceContext dctx;
1745 
1746   PetscFunctionBegin;
1747   PetscCall(GetHandles_(&dctx));
1748   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
1749   fact->factortype = ftype;
1750   switch (ftype) {
1751   case MAT_FACTOR_LU:
1752   case MAT_FACTOR_ILU: // fall-through
1753     fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
1754     fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1755     break;
1756   case MAT_FACTOR_CHOLESKY:
1757   case MAT_FACTOR_ICC: // fall-through
1758     fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1759     break;
1760   case MAT_FACTOR_QR: {
1761     const auto pobj = PetscObjectCast(fact);
1762 
1763     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
1764     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1765   } break;
1766   case MAT_FACTOR_NONE:
1767   case MAT_FACTOR_ILUDT:     // fall-through
1768   case MAT_FACTOR_NUM_TYPES: // fall-through
1769     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
1770   }
1771   PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
1772   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
1773   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
1774   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
1775   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
1776   *fact_out = fact;
1777   PetscFunctionReturn(PETSC_SUCCESS);
1778 }
1779 
1780 template <device::cupm::DeviceType T>
InvertFactors(Mat A)1781 inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
1782 {
1783   const auto         mimpl = MatIMPLCast(A);
1784   const auto         mcu   = MatCUPMCast(A);
1785   const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
1786   cupmSolverHandle_t handle;
1787   PetscDeviceContext dctx;
1788   cupmStream_t       stream;
1789 
1790   PetscFunctionBegin;
1791 #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
1792   // HIP appears to have this by default??
1793   PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
1794 #endif
1795   if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1796   PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
1797   // spd
1798   PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());
1799 
1800   PetscCall(GetHandles_(&dctx, &handle, &stream));
1801   {
1802     const auto    da  = DeviceArrayReadWrite(dctx, A);
1803     const auto    lda = static_cast<cupmBlasInt_t>(mimpl->lda);
1804     cupmBlasInt_t il;
1805 
1806     PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
1807     if (il > mcu->d_fact_lwork) {
1808       mcu->d_fact_lwork = il;
1809       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1810       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
1811     }
1812     PetscCall(PetscLogGpuTimeBegin());
1813     PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
1814     PetscCall(PetscLogGpuTimeEnd());
1815   }
1816   PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
1817   // TODO (write cuda kernel)
1818   PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
1819   PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
1820 
1821   A->ops->solve          = nullptr;
1822   A->ops->solvetranspose = nullptr;
1823   A->ops->matsolve       = nullptr;
1824   A->factortype          = MAT_FACTOR_NONE;
1825 
1826   PetscCall(PetscFree(A->solvertype));
1827   PetscFunctionReturn(PETSC_SUCCESS);
1828 }
1829 
1830 // ==========================================================================================
1831 
1832 template <device::cupm::DeviceType T>
GetSubMatrix(Mat A,PetscInt rbegin,PetscInt rend,PetscInt cbegin,PetscInt cend,Mat * mat)1833 inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
1834 {
1835   const auto         mimpl        = MatIMPLCast(A);
1836   const auto         array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
1837   const auto         n            = rend - rbegin;
1838   const auto         m            = cend - cbegin;
1839   auto              &cmat         = mimpl->cmat;
1840   PetscDeviceContext dctx;
1841 
1842   PetscFunctionBegin;
1843   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1844   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1845   mimpl->matinuse = cbegin + 1;
1846 
1847   PetscCall(GetHandles_(&dctx));
1848   PetscCall(HostToDevice_(A, dctx));
1849 
1850   if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
1851   {
1852     const auto device_array = array_offset(MatCUPMCast(A)->d_v);
1853 
1854     if (cmat) {
1855       PetscCall(PlaceArray(cmat, device_array));
1856     } else {
1857       PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
1858     }
1859   }
1860   PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
1861   // place CPU array if present but do not copy any data
1862   if (const auto host_array = mimpl->v) {
1863     cmat->offloadmask = PETSC_OFFLOAD_GPU;
1864     PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
1865   }
1866 
1867   cmat->offloadmask = A->offloadmask;
1868   *mat              = cmat;
1869   PetscFunctionReturn(PETSC_SUCCESS);
1870 }
1871 
1872 template <device::cupm::DeviceType T>
RestoreSubMatrix(Mat A,Mat * m)1873 inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
1874 {
1875   const auto mimpl = MatIMPLCast(A);
1876   const auto cmat  = mimpl->cmat;
1877   const auto reset = static_cast<bool>(mimpl->v);
1878   bool       copy, was_offload_host;
1879 
1880   PetscFunctionBegin;
1881   PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1882   PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1883   PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1884   mimpl->matinuse = 0;
1885 
1886   // calls to ResetArray may change it, so save it here
1887   was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
1888   if (was_offload_host && !reset) {
1889     copy = true;
1890     PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
1891   } else {
1892     copy = false;
1893   }
1894 
1895   PetscCall(ResetArray(cmat));
1896   if (reset) PetscCall(MatDenseResetArray(cmat));
1897   if (copy) {
1898     PetscDeviceContext dctx;
1899 
1900     PetscCall(GetHandles_(&dctx));
1901     PetscCall(DeviceToHost_(A, dctx));
1902   } else {
1903     A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1904   }
1905 
1906   cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1907   *m                = nullptr;
1908   PetscFunctionReturn(PETSC_SUCCESS);
1909 }
1910 
1911 // ==========================================================================================
1912 
1913 namespace
1914 {
1915 
1916 template <device::cupm::DeviceType T>
MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A,Mat B,Mat C,PetscBool TA,PetscBool TB)1917 inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1918 {
1919   PetscFunctionBegin;
1920   if (TA) {
1921     if (TB) {
1922       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1923     } else {
1924       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1925     }
1926   } else {
1927     if (TB) {
1928       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1929     } else {
1930       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1931     }
1932   }
1933   PetscFunctionReturn(PETSC_SUCCESS);
1934 }
1935 
1936 template <device::cupm::DeviceType T>
MatSolverTypeRegister_DENSECUPM()1937 inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
1938 {
1939   PetscFunctionBegin;
1940   for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
1941     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
1942     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
1943   }
1944   PetscFunctionReturn(PETSC_SUCCESS);
1945 }
1946 
1947 } // anonymous namespace
1948 
1949 } // namespace impl
1950 
1951 } // namespace cupm
1952 
1953 } // namespace mat
1954 
1955 } // namespace Petsc
1956