xref: /petsc/include/petsc/private/matdensecupmimpl.h (revision cd871708d6ae82bd70cc1a9e2138f9b57839fe75)
1 #pragma once
2 
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 #include <petsc/private/matimpl.h> /*I <petscmat.h> I*/
5 
6 #ifdef __cplusplus
7   #include <petsc/private/deviceimpl.h>
8   #include <petsc/private/cupmsolverinterface.hpp>
9   #include <petsc/private/cupmobject.hpp>
10 
11   #include "../src/sys/objects/device/impls/cupm/cupmthrustutility.hpp"
12   #include "../src/sys/objects/device/impls/cupm/kernels.hpp"
13 
14   #include <thrust/device_vector.h>
15   #include <thrust/device_ptr.h>
16   #include <thrust/iterator/counting_iterator.h>
17   #include <thrust/iterator/transform_iterator.h>
18   #include <thrust/iterator/permutation_iterator.h>
19   #include <thrust/transform.h>
20   #include <thrust/copy.h>
21 
22 namespace Petsc
23 {
24 
25 namespace vec
26 {
27 
28 namespace cupm
29 {
30 
31 namespace impl
32 {
33 
34 template <device::cupm::DeviceType>
35 class VecSeq_CUPM;
36 template <device::cupm::DeviceType>
37 class VecMPI_CUPM;
38 
39 } // namespace impl
40 
41 } // namespace cupm
42 
43 } // namespace vec
44 
45 namespace mat
46 {
47 
48 namespace cupm
49 {
50 
51 namespace impl
52 {
53 
54 // ==========================================================================================
55 // MatDense_CUPM_Base
56 //
57 // A base class to separate out the CRTP code from the common CUPM stuff (like the composed
58 // function names).
59 // ==========================================================================================
60 
61 template <device::cupm::DeviceType T>
62 class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_CUPM_Base : protected device::cupm::impl::CUPMObject<T> {
63 public:
64   PETSC_CUPMOBJECT_HEADER(T);
65 
66   #define MatDenseCUPMComposedOpDecl(OP_NAME) \
67     PETSC_NODISCARD static constexpr const char *PetscConcat(MatDenseCUPM, OP_NAME)() noexcept \
68     { \
69       return T == device::cupm::DeviceType::CUDA ? PetscStringize(PetscConcat(MatDenseCUDA, OP_NAME)) : PetscStringize(PetscConcat(MatDenseHIP, OP_NAME)); \
70     }
71 
72   // clang-format off
73   MatDenseCUPMComposedOpDecl(GetArray_C)
74   MatDenseCUPMComposedOpDecl(GetArrayRead_C)
75   MatDenseCUPMComposedOpDecl(GetArrayWrite_C)
76   MatDenseCUPMComposedOpDecl(RestoreArray_C)
77   MatDenseCUPMComposedOpDecl(RestoreArrayRead_C)
78   MatDenseCUPMComposedOpDecl(RestoreArrayWrite_C)
79   MatDenseCUPMComposedOpDecl(PlaceArray_C)
80   MatDenseCUPMComposedOpDecl(ReplaceArray_C)
81   MatDenseCUPMComposedOpDecl(ResetArray_C)
82   MatDenseCUPMComposedOpDecl(SetPreallocation_C)
83   // clang-format on
84 
85   #undef MatDenseCUPMComposedOpDecl
86 
87     PETSC_NODISCARD static constexpr MatType MATSEQDENSECUPM() noexcept;
88   PETSC_NODISCARD static constexpr MatType       MATMPIDENSECUPM() noexcept;
89   PETSC_NODISCARD static constexpr MatType       MATDENSECUPM() noexcept;
90   PETSC_NODISCARD static constexpr MatSolverType MATSOLVERCUPM() noexcept;
91 };
92 
93 // ==========================================================================================
94 // MatDense_CUPM_Base -- Public API
95 // ==========================================================================================
96 
97 template <device::cupm::DeviceType T>
MATSEQDENSECUPM()98 inline constexpr MatType MatDense_CUPM_Base<T>::MATSEQDENSECUPM() noexcept
99 {
100   return T == device::cupm::DeviceType::CUDA ? MATSEQDENSECUDA : MATSEQDENSEHIP;
101 }
102 
103 template <device::cupm::DeviceType T>
MATMPIDENSECUPM()104 inline constexpr MatType MatDense_CUPM_Base<T>::MATMPIDENSECUPM() noexcept
105 {
106   return T == device::cupm::DeviceType::CUDA ? MATMPIDENSECUDA : MATMPIDENSEHIP;
107 }
108 
109 template <device::cupm::DeviceType T>
MATDENSECUPM()110 inline constexpr MatType MatDense_CUPM_Base<T>::MATDENSECUPM() noexcept
111 {
112   return T == device::cupm::DeviceType::CUDA ? MATDENSECUDA : MATDENSEHIP;
113 }
114 
115 template <device::cupm::DeviceType T>
MATSOLVERCUPM()116 inline constexpr MatSolverType MatDense_CUPM_Base<T>::MATSOLVERCUPM() noexcept
117 {
118   return T == device::cupm::DeviceType::CUDA ? MATSOLVERCUDA : MATSOLVERHIP;
119 }
120 
121   #define MATDENSECUPM_BASE_HEADER(T) \
122     PETSC_CUPMOBJECT_HEADER(T); \
123     using VecSeq_CUPM = ::Petsc::vec::cupm::impl::VecSeq_CUPM<T>; \
124     using VecMPI_CUPM = ::Petsc::vec::cupm::impl::VecMPI_CUPM<T>; \
125     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATSEQDENSECUPM; \
126     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATMPIDENSECUPM; \
127     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATDENSECUPM; \
128     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATSOLVERCUPM; \
129     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArray_C; \
130     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayRead_C; \
131     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayWrite_C; \
132     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArray_C; \
133     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayRead_C; \
134     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayWrite_C; \
135     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMPlaceArray_C; \
136     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMReplaceArray_C; \
137     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMResetArray_C; \
138     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMSetPreallocation_C
139 
140 // forward declare
141 template <device::cupm::DeviceType>
142 class MatDense_Seq_CUPM;
143 template <device::cupm::DeviceType>
144 class MatDense_MPI_CUPM;
145 
146 // ==========================================================================================
147 // MatDense_CUPM
148 //
149 // The true "base" class for MatDenseCUPM. The reason MatDense_CUPM and MatDense_CUPM_Base
150 // exist is to separate out the CRTP code from the non-crtp code so that the generic functions
151 // can be called via templates below.
152 // ==========================================================================================
153 
154 template <device::cupm::DeviceType T, typename Derived>
155 class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_CUPM : protected MatDense_CUPM_Base<T> {
156 private:
157   static PetscErrorCode CheckSaneSequentialMatSizes_(Mat) noexcept;
158 
159 protected:
160   MATDENSECUPM_BASE_HEADER(T);
161 
162   template <PetscMemType, PetscMemoryAccessMode>
163   class MatrixArray;
164 
165   // Cast the Mat to its host struct, i.e. return the result of (Mat_SeqDense *)m->data
166   template <typename U = Derived>
167   PETSC_NODISCARD static constexpr auto    MatIMPLCast(Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(U::MatIMPLCast_(m))
168   PETSC_NODISCARD static constexpr MatType MATIMPLCUPM() noexcept;
169 
170   static PetscErrorCode CreateIMPLDenseCUPM(MPI_Comm, PetscInt, PetscInt, PetscInt, PetscInt, PetscScalar *, Mat *, PetscDeviceContext, bool) noexcept;
171   static PetscErrorCode SetPreallocation(Mat, PetscDeviceContext, PetscScalar *) noexcept;
172 
173   template <typename F>
174   static PetscErrorCode DiagonalUnaryTransform(Mat, PetscDeviceContext, F &&) noexcept;
175 
176   static PetscErrorCode Shift(Mat, PetscScalar) noexcept;
177   static PetscErrorCode GetDiagonal(Mat, Vec) noexcept;
178 
179   PETSC_NODISCARD static auto DeviceArrayRead(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>{dctx, m})
180   PETSC_NODISCARD static auto DeviceArrayWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>{dctx, m})
181   PETSC_NODISCARD static auto DeviceArrayReadWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, m})
182   PETSC_NODISCARD static auto HostArrayRead(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>{dctx, m})
183   PETSC_NODISCARD static auto HostArrayWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>{dctx, m})
184   PETSC_NODISCARD static auto HostArrayReadWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, m})
185 };
186 
187 // ==========================================================================================
188 // MatDense_CUPM::MatrixArray
189 // ==========================================================================================
190 
191 template <device::cupm::DeviceType T, typename D>
192 template <PetscMemType MT, PetscMemoryAccessMode MA>
193 class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_CUPM<T, D>::MatrixArray : public device::cupm::impl::RestoreableArray<T, MT, MA> {
194   using base_type = device::cupm::impl::RestoreableArray<T, MT, MA>;
195 
196 public:
197   MatrixArray(PetscDeviceContext, Mat) noexcept;
198   ~MatrixArray() noexcept;
199 
200   // must declare move constructor since we declare a destructor
201   constexpr MatrixArray(MatrixArray &&) noexcept;
202 
203 private:
204   Mat m_ = nullptr;
205 };
206 
207 // ==========================================================================================
208 // MatDense_CUPM::MatrixArray -- Public API
209 // ==========================================================================================
210 
211 template <device::cupm::DeviceType T, typename D>
212 template <PetscMemType MT, PetscMemoryAccessMode MA>
MatrixArray(PetscDeviceContext dctx,Mat m)213 inline MatDense_CUPM<T, D>::MatrixArray<MT, MA>::MatrixArray(PetscDeviceContext dctx, Mat m) noexcept : base_type{dctx}, m_{m}
214 {
215   PetscFunctionBegin;
216   PetscCallAbort(PETSC_COMM_SELF, D::template GetArray<MT, MA>(m, &this->ptr_, dctx));
217   PetscFunctionReturnVoid();
218 }
219 
220 template <device::cupm::DeviceType T, typename D>
221 template <PetscMemType MT, PetscMemoryAccessMode MA>
~MatrixArray()222 inline MatDense_CUPM<T, D>::MatrixArray<MT, MA>::~MatrixArray() noexcept
223 {
224   PetscFunctionBegin;
225   PetscCallAbort(PETSC_COMM_SELF, D::template RestoreArray<MT, MA>(m_, &this->ptr_, this->dctx_));
226   PetscFunctionReturnVoid();
227 }
228 
229 template <device::cupm::DeviceType T, typename D>
230 template <PetscMemType MT, PetscMemoryAccessMode MA>
MatrixArray(MatrixArray && other)231 inline constexpr MatDense_CUPM<T, D>::MatrixArray<MT, MA>::MatrixArray(MatrixArray &&other) noexcept : base_type{std::move(other)}, m_{util::exchange(other.m_, nullptr)}
232 {
233 }
234 
235 // ==========================================================================================
236 // MatDense_CUPM -- Private API
237 // ==========================================================================================
238 
239 template <device::cupm::DeviceType T, typename D>
CheckSaneSequentialMatSizes_(Mat A)240 inline PetscErrorCode MatDense_CUPM<T, D>::CheckSaneSequentialMatSizes_(Mat A) noexcept
241 {
242   PetscFunctionBegin;
243   if (PetscDefined(USE_DEBUG)) {
244     PetscBool isseq;
245 
246     PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), D::MATSEQDENSECUPM(), &isseq));
247     if (isseq) {
248       // doing this check allows both sequential and parallel implementations to just pass in
249       // A, otherwise they would need to specify rstart, rend, and cols separately!
250       PetscCheck(A->rmap->rstart == 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Sequential matrix row start %" PetscInt_FMT " != 0?", A->rmap->rstart);
251       PetscCheck(A->rmap->rend == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Sequential matrix row end %" PetscInt_FMT " != number of rows %" PetscInt_FMT, A->rmap->rend, A->rmap->n);
252       PetscCheck(A->cmap->n == A->cmap->N, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Sequential matrix number of local columns %" PetscInt_FMT " != number of global columns %" PetscInt_FMT, A->cmap->n, A->cmap->n);
253     }
254   }
255   PetscFunctionReturn(PETSC_SUCCESS);
256 }
257 
258 // ==========================================================================================
259 // MatDense_CUPM -- Protected API
260 // ==========================================================================================
261 
262 template <device::cupm::DeviceType T, typename D>
MATIMPLCUPM()263 inline constexpr MatType MatDense_CUPM<T, D>::MATIMPLCUPM() noexcept
264 {
265   return D::MATIMPLCUPM_();
266 }
267 
268 // Common core for MatCreateSeqDenseCUPM() and MatCreateMPIDenseCUPM()
269 template <device::cupm::DeviceType T, typename D>
CreateIMPLDenseCUPM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscScalar * data,Mat * A,PetscDeviceContext dctx,bool preallocate)270 inline PetscErrorCode MatDense_CUPM<T, D>::CreateIMPLDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscScalar *data, Mat *A, PetscDeviceContext dctx, bool preallocate) noexcept
271 {
272   Mat mat;
273 
274   PetscFunctionBegin;
275   PetscAssertPointer(A, 7);
276   PetscCall(MatCreate(comm, &mat));
277   PetscCall(MatSetSizes(mat, m, n, M, N));
278   PetscCall(MatSetType(mat, D::MATIMPLCUPM()));
279   if (preallocate) {
280     PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
281     PetscCall(D::SetPreallocation(mat, dctx, data));
282   }
283   *A = mat;
284   PetscFunctionReturn(PETSC_SUCCESS);
285 }
286 
287 template <device::cupm::DeviceType T, typename D>
SetPreallocation(Mat A,PetscDeviceContext dctx,PetscScalar * device_array)288 inline PetscErrorCode MatDense_CUPM<T, D>::SetPreallocation(Mat A, PetscDeviceContext dctx, PetscScalar *device_array) noexcept
289 {
290   PetscFunctionBegin;
291   // cannot use PetscValidHeaderSpecificType(..., MATIMPLCUPM()) since the incoming matrix
292   // might be the local (sequential) matrix of a MatMPIDense_CUPM. Since this would be called
293   // from the MPI matrix'es impl MATIMPLCUPM() would return MATMPIDENSECUPM().
294   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
295   PetscCheckTypeNames(A, D::MATSEQDENSECUPM(), D::MATMPIDENSECUPM());
296   PetscCall(PetscLayoutSetUp(A->rmap));
297   PetscCall(PetscLayoutSetUp(A->cmap));
298   PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
299   PetscCall(D::SetPreallocation_(A, dctx, device_array));
300   A->preallocated = PETSC_TRUE;
301   A->assembled    = PETSC_TRUE;
302   PetscFunctionReturn(PETSC_SUCCESS);
303 }
304 
305 namespace detail
306 {
307 
308   #if CCCL_VERSION >= 3000000
309 template <class T>
310 using iter_difference_t = cuda::std::iter_difference_t<T>;
311   #else
312 template <class T>
313 using iter_difference_t = typename thrust::iterator_difference<T>::type;
314   #endif
315 
316 // ==========================================================================================
317 // MatrixIteratorBase
318 //
319 // A base class for creating thrust iterators over the local sub-matrix. This will set up the
320 // proper iterator definitions so thrust knows how to handle things properly. Template
321 // parameters are as follows:
322 //
323 // - Iterator:
324 // The type of the primary array iterator. Usually this is
325 // thrust::device_pointer<PetscScalar>::iterator.
326 //
327 // - IndexFunctor:
328 // This should be a functor which contains an operator() that when called with an index `i`,
329 // returns the i'th permuted index into the array. For example, it could return the i'th
330 // diagonal entry.
331 // ==========================================================================================
332 template <typename Iterator, typename IndexFunctor>
333 class MatrixIteratorBase {
334 public:
335   using array_iterator_type = Iterator;
336   using index_functor_type  = IndexFunctor;
337 
338   using difference_type     = iter_difference_t<array_iterator_type>;
339   using CountingIterator    = thrust::counting_iterator<difference_type>;
340   using TransformIterator   = thrust::transform_iterator<index_functor_type, CountingIterator>;
341   using PermutationIterator = thrust::permutation_iterator<array_iterator_type, TransformIterator>;
342   using iterator            = PermutationIterator; // type of the begin/end iterator
343 
MatrixIteratorBase(array_iterator_type first,array_iterator_type last,index_functor_type idx_func)344   constexpr MatrixIteratorBase(array_iterator_type first, array_iterator_type last, index_functor_type idx_func) noexcept : first{std::move(first)}, last{std::move(last)}, func{std::move(idx_func)} { }
345 
begin()346   PETSC_NODISCARD iterator begin() const noexcept
347   {
348     return PermutationIterator{
349       first, TransformIterator{CountingIterator{0}, func}
350     };
351   }
352 
353 protected:
354   array_iterator_type first;
355   array_iterator_type last;
356   index_functor_type  func;
357 };
358 
359 // ==========================================================================================
360 // StridedIndexFunctor
361 //
362 // Iterator which permutes a linear index range into strided matrix indices. Usually used to
363 // get the diagonal.
364 // ==========================================================================================
365 template <typename T>
366 struct StridedIndexFunctor {
operatorStridedIndexFunctor367   PETSC_NODISCARD PETSC_HOSTDEVICE_INLINE_DECL constexpr T operator()(const T &i) const noexcept { return stride * i; }
368 
369   T stride;
370 };
371 
372 template <typename Iterator>
373 class DiagonalIterator : public MatrixIteratorBase<Iterator, StridedIndexFunctor<iter_difference_t<Iterator>>> {
374 public:
375   using base_type = MatrixIteratorBase<Iterator, StridedIndexFunctor<iter_difference_t<Iterator>>>;
376 
377   using difference_type = typename base_type::difference_type;
378   using iterator        = typename base_type::iterator;
379 
DiagonalIterator(Iterator first,Iterator last,difference_type stride)380   constexpr DiagonalIterator(Iterator first, Iterator last, difference_type stride) noexcept : base_type{std::move(first), std::move(last), {stride}} { }
381 
end()382   PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->last - this->first + this->func.stride - 1) / this->func.stride; }
383 };
384 
385 template <typename T>
MakeDiagonalIterator(T * data,PetscInt rstart,PetscInt rend,PetscInt cols,PetscInt lda)386 inline DiagonalIterator<typename thrust::device_vector<T>::iterator> MakeDiagonalIterator(T *data, PetscInt rstart, PetscInt rend, PetscInt cols, PetscInt lda) noexcept
387 {
388   const auto        rend2 = std::min(rend, cols);
389   const std::size_t begin = rstart * lda;
390   const std::size_t end   = rend2 - rstart + rend2 * lda;
391   const auto        dptr  = thrust::device_pointer_cast(data);
392 
393   return {dptr + begin, dptr + end, lda + 1};
394 }
395 
396 } // namespace detail
397 
398 template <device::cupm::DeviceType T, typename D>
399 template <typename F>
DiagonalUnaryTransform(Mat A,PetscDeviceContext dctx,F && functor)400 inline PetscErrorCode MatDense_CUPM<T, D>::DiagonalUnaryTransform(Mat A, PetscDeviceContext dctx, F &&functor) noexcept
401 {
402   const auto rstart = A->rmap->rstart;
403   const auto rend   = A->rmap->rend;
404   const auto gcols  = A->cmap->N;
405   const auto rend2  = std::min(rend, gcols);
406 
407   PetscFunctionBegin;
408   PetscCall(CheckSaneSequentialMatSizes_(A));
409   if (rend2 > rstart) {
410     const auto   da = D::DeviceArrayReadWrite(dctx, A);
411     cupmStream_t stream;
412     PetscInt     lda;
413 
414     PetscCall(MatDenseGetLDA(A, &lda));
415     PetscCall(D::GetHandlesFrom_(dctx, &stream));
416     {
417       auto diagonal = detail::MakeDiagonalIterator(da.data(), rstart, rend, gcols, lda);
418 
419       // clang-format off
420       PetscCallThrust(
421         THRUST_CALL(
422           thrust::transform,
423           stream,
424           diagonal.begin(), diagonal.end(), diagonal.begin(),
425           std::forward<F>(functor)
426         )
427       );
428       // clang-format on
429     }
430     PetscCall(PetscLogGpuFlops(rend2 - rstart));
431   }
432   PetscFunctionReturn(PETSC_SUCCESS);
433 }
434 
435 template <device::cupm::DeviceType T, typename D>
Shift(Mat A,PetscScalar alpha)436 inline PetscErrorCode MatDense_CUPM<T, D>::Shift(Mat A, PetscScalar alpha) noexcept
437 {
438   PetscDeviceContext dctx;
439 
440   PetscFunctionBegin;
441   PetscCall(GetHandles_(&dctx));
442   PetscCall(DiagonalUnaryTransform(A, dctx, device::cupm::functors::make_plus_equals(alpha)));
443   PetscFunctionReturn(PETSC_SUCCESS);
444 }
445 
446 template <device::cupm::DeviceType T, typename D>
GetDiagonal(Mat A,Vec v)447 inline PetscErrorCode MatDense_CUPM<T, D>::GetDiagonal(Mat A, Vec v) noexcept
448 {
449   const auto         rstart = A->rmap->rstart;
450   const auto         rend   = A->rmap->rend;
451   const auto         gcols  = A->cmap->N;
452   PetscInt           lda;
453   PetscDeviceContext dctx;
454 
455   PetscFunctionBegin;
456   PetscCall(CheckSaneSequentialMatSizes_(A));
457   PetscCall(GetHandles_(&dctx));
458   PetscCall(MatDenseGetLDA(A, &lda));
459   {
460     auto dv       = VecSeq_CUPM::DeviceArrayWrite(dctx, v);
461     auto da       = D::DeviceArrayRead(dctx, A);
462     auto diagonal = detail::MakeDiagonalIterator(da.data(), rstart, rend, gcols, lda);
463     // We must have this cast outside of THRUST_CALL(). Without it, GCC 6.4 - 7.5, and 11.3.0
464     // throw spurious warnings:
465     //
466     // warning: 'MatDense_CUPM<...>::GetDiagonal(Mat, Vec)::<lambda()>' declared with greater
467     // visibility than the type of its field 'MatDense_CUPM<...>::GetDiagonal(Mat,
468     // Vec)::<lambda()>::<dv capture>' [-Wattributes]
469     // 460 |     PetscCallThrust(
470     //     |     ^~~~~~~~~~~~~~~~
471     auto         dvp = thrust::device_pointer_cast(dv.data());
472     cupmStream_t stream;
473 
474     PetscCall(GetHandlesFrom_(dctx, &stream));
475     PetscCallThrust(THRUST_CALL(thrust::copy, stream, diagonal.begin(), diagonal.end(), dvp));
476   }
477   PetscFunctionReturn(PETSC_SUCCESS);
478 }
479 
480   #define MatComposeOp_CUPM(use_host, pobj, op_str, op_host, ...) \
481     do { \
482       if (use_host) { \
483         PetscCall(PetscObjectComposeFunction(pobj, op_str, op_host)); \
484       } else { \
485         PetscCall(PetscObjectComposeFunction(pobj, op_str, __VA_ARGS__)); \
486       } \
487     } while (0)
488 
489   #define MatSetOp_CUPM(use_host, mat, op_name, op_host, ...) \
490     do { \
491       if (use_host) { \
492         (mat)->ops->op_name = op_host; \
493       } else { \
494         (mat)->ops->op_name = __VA_ARGS__; \
495       } \
496     } while (0)
497 
498   #define MATDENSECUPM_HEADER(T, ...) \
499     MATDENSECUPM_BASE_HEADER(T); \
500     friend class ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>; \
501     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::MatIMPLCast; \
502     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::MATIMPLCUPM; \
503     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::CreateIMPLDenseCUPM; \
504     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::SetPreallocation; \
505     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DeviceArrayRead; \
506     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DeviceArrayWrite; \
507     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DeviceArrayReadWrite; \
508     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::HostArrayRead; \
509     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::HostArrayWrite; \
510     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::HostArrayReadWrite; \
511     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DiagonalUnaryTransform; \
512     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::Shift; \
513     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::GetDiagonal
514 
515 } // namespace impl
516 
517 // ==========================================================================================
518 // MatDense_CUPM -- Implementations
519 // ==========================================================================================
520 
521 template <device::cupm::DeviceType T, PetscMemoryAccessMode access>
MatDenseCUPMGetArray_Private(Mat A,PetscScalar ** array)522 inline PetscErrorCode MatDenseCUPMGetArray_Private(Mat A, PetscScalar **array) noexcept
523 {
524   PetscFunctionBegin;
525   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
526   PetscAssertPointer(array, 2);
527   switch (access) {
528   case PETSC_MEMORY_ACCESS_READ:
529     PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayRead_C(), (Mat, PetscScalar **), (A, array));
530     break;
531   case PETSC_MEMORY_ACCESS_WRITE:
532     PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayWrite_C(), (Mat, PetscScalar **), (A, array));
533     break;
534   case PETSC_MEMORY_ACCESS_READ_WRITE:
535     PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArray_C(), (Mat, PetscScalar **), (A, array));
536     break;
537   }
538   if (PetscMemoryAccessWrite(access)) PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
539   PetscFunctionReturn(PETSC_SUCCESS);
540 }
541 
542 template <device::cupm::DeviceType T, PetscMemoryAccessMode access>
MatDenseCUPMRestoreArray_Private(Mat A,PetscScalar ** array)543 inline PetscErrorCode MatDenseCUPMRestoreArray_Private(Mat A, PetscScalar **array) noexcept
544 {
545   PetscFunctionBegin;
546   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
547   if (array) PetscAssertPointer(array, 2);
548   switch (access) {
549   case PETSC_MEMORY_ACCESS_READ:
550     PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayRead_C(), (Mat, PetscScalar **), (A, array));
551     break;
552   case PETSC_MEMORY_ACCESS_WRITE:
553     PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayWrite_C(), (Mat, PetscScalar **), (A, array));
554     break;
555   case PETSC_MEMORY_ACCESS_READ_WRITE:
556     PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArray_C(), (Mat, PetscScalar **), (A, array));
557     break;
558   }
559   if (PetscMemoryAccessWrite(access)) {
560     PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
561     A->offloadmask = PETSC_OFFLOAD_GPU;
562   }
563   if (array) *array = nullptr;
564   PetscFunctionReturn(PETSC_SUCCESS);
565 }
566 
567 template <device::cupm::DeviceType T>
MatDenseCUPMGetArray(Mat A,PetscScalar ** array)568 inline PetscErrorCode MatDenseCUPMGetArray(Mat A, PetscScalar **array) noexcept
569 {
570   PetscFunctionBegin;
571   PetscCall(MatDenseCUPMGetArray_Private<T, PETSC_MEMORY_ACCESS_READ_WRITE>(A, array));
572   PetscFunctionReturn(PETSC_SUCCESS);
573 }
574 
575 template <device::cupm::DeviceType T>
MatDenseCUPMGetArrayRead(Mat A,const PetscScalar ** array)576 inline PetscErrorCode MatDenseCUPMGetArrayRead(Mat A, const PetscScalar **array) noexcept
577 {
578   PetscFunctionBegin;
579   PetscCall(MatDenseCUPMGetArray_Private<T, PETSC_MEMORY_ACCESS_READ>(A, const_cast<PetscScalar **>(array)));
580   PetscFunctionReturn(PETSC_SUCCESS);
581 }
582 
583 template <device::cupm::DeviceType T>
MatDenseCUPMGetArrayWrite(Mat A,PetscScalar ** array)584 inline PetscErrorCode MatDenseCUPMGetArrayWrite(Mat A, PetscScalar **array) noexcept
585 {
586   PetscFunctionBegin;
587   PetscCall(MatDenseCUPMGetArray_Private<T, PETSC_MEMORY_ACCESS_WRITE>(A, array));
588   PetscFunctionReturn(PETSC_SUCCESS);
589 }
590 
591 template <device::cupm::DeviceType T>
MatDenseCUPMRestoreArray(Mat A,PetscScalar ** array)592 inline PetscErrorCode MatDenseCUPMRestoreArray(Mat A, PetscScalar **array) noexcept
593 {
594   PetscFunctionBegin;
595   PetscCall(MatDenseCUPMRestoreArray_Private<T, PETSC_MEMORY_ACCESS_READ_WRITE>(A, array));
596   PetscFunctionReturn(PETSC_SUCCESS);
597 }
598 
599 template <device::cupm::DeviceType T>
MatDenseCUPMRestoreArrayRead(Mat A,const PetscScalar ** array)600 inline PetscErrorCode MatDenseCUPMRestoreArrayRead(Mat A, const PetscScalar **array) noexcept
601 {
602   PetscFunctionBegin;
603   PetscCall(MatDenseCUPMRestoreArray_Private<T, PETSC_MEMORY_ACCESS_READ>(A, const_cast<PetscScalar **>(array)));
604   PetscFunctionReturn(PETSC_SUCCESS);
605 }
606 
607 template <device::cupm::DeviceType T>
MatDenseCUPMRestoreArrayWrite(Mat A,PetscScalar ** array)608 inline PetscErrorCode MatDenseCUPMRestoreArrayWrite(Mat A, PetscScalar **array) noexcept
609 {
610   PetscFunctionBegin;
611   PetscCall(MatDenseCUPMRestoreArray_Private<T, PETSC_MEMORY_ACCESS_WRITE>(A, array));
612   PetscFunctionReturn(PETSC_SUCCESS);
613 }
614 
615 template <device::cupm::DeviceType T>
MatDenseCUPMPlaceArray(Mat A,const PetscScalar * array)616 inline PetscErrorCode MatDenseCUPMPlaceArray(Mat A, const PetscScalar *array) noexcept
617 {
618   PetscFunctionBegin;
619   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
620   PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMPlaceArray_C(), (Mat, const PetscScalar *), (A, array));
621   PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
622   A->offloadmask = PETSC_OFFLOAD_GPU;
623   PetscFunctionReturn(PETSC_SUCCESS);
624 }
625 
626 template <device::cupm::DeviceType T>
MatDenseCUPMReplaceArray(Mat A,const PetscScalar * array)627 inline PetscErrorCode MatDenseCUPMReplaceArray(Mat A, const PetscScalar *array) noexcept
628 {
629   PetscFunctionBegin;
630   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
631   PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMReplaceArray_C(), (Mat, const PetscScalar *), (A, array));
632   PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
633   A->offloadmask = PETSC_OFFLOAD_GPU;
634   PetscFunctionReturn(PETSC_SUCCESS);
635 }
636 
637 template <device::cupm::DeviceType T>
MatDenseCUPMResetArray(Mat A)638 inline PetscErrorCode MatDenseCUPMResetArray(Mat A) noexcept
639 {
640   PetscFunctionBegin;
641   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
642   PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMResetArray_C(), (Mat), (A));
643   PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
644   PetscFunctionReturn(PETSC_SUCCESS);
645 }
646 
647 template <device::cupm::DeviceType T>
648 inline PetscErrorCode MatDenseCUPMSetPreallocation(Mat A, PetscScalar *device_data, PetscDeviceContext dctx = nullptr) noexcept
649 {
650   PetscFunctionBegin;
651   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
652   PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMSetPreallocation_C(), (Mat, PetscDeviceContext, PetscScalar *), (A, dctx, device_data));
653   PetscFunctionReturn(PETSC_SUCCESS);
654 }
655 
656 } // namespace cupm
657 
658 } // namespace mat
659 
660 } // namespace Petsc
661 
662 #endif // __cplusplus
663