1 #pragma once
2
3 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4 #include <petsc/private/matimpl.h> /*I <petscmat.h> I*/
5
6 #ifdef __cplusplus
7 #include <petsc/private/deviceimpl.h>
8 #include <petsc/private/cupmsolverinterface.hpp>
9 #include <petsc/private/cupmobject.hpp>
10
11 #include "../src/sys/objects/device/impls/cupm/cupmthrustutility.hpp"
12 #include "../src/sys/objects/device/impls/cupm/kernels.hpp"
13
14 #include <thrust/device_vector.h>
15 #include <thrust/device_ptr.h>
16 #include <thrust/iterator/counting_iterator.h>
17 #include <thrust/iterator/transform_iterator.h>
18 #include <thrust/iterator/permutation_iterator.h>
19 #include <thrust/transform.h>
20 #include <thrust/copy.h>
21
22 namespace Petsc
23 {
24
25 namespace vec
26 {
27
28 namespace cupm
29 {
30
31 namespace impl
32 {
33
// forward declarations -- only the names are needed here; the full definitions live with
// the CUPM vector implementations
template <device::cupm::DeviceType>
class VecSeq_CUPM;
template <device::cupm::DeviceType>
class VecMPI_CUPM;
38
39 } // namespace impl
40
41 } // namespace cupm
42
43 } // namespace vec
44
45 namespace mat
46 {
47
48 namespace cupm
49 {
50
51 namespace impl
52 {
53
// ==========================================================================================
// MatDense_CUPM_Base
//
// A base class to separate out the CRTP code from the common CUPM stuff (like the composed
// function names).
// ==========================================================================================

template <device::cupm::DeviceType T>
class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_CUPM_Base : protected device::cupm::impl::CUPMObject<T> {
public:
  PETSC_CUPMOBJECT_HEADER(T);

  // Declares a static constexpr member function MatDenseCUPM<OP_NAME>() which returns the
  // composed-function name (see PetscObjectComposeFunction()) for the active backend, e.g.
  // MatDenseCUPMGetArray_C() returns "MatDenseCUDAGetArray_C" for CUDA and
  // "MatDenseHIPGetArray_C" for HIP
#define MatDenseCUPMComposedOpDecl(OP_NAME) \
  PETSC_NODISCARD static constexpr const char *PetscConcat(MatDenseCUPM, OP_NAME)() noexcept \
  { \
    return T == device::cupm::DeviceType::CUDA ? PetscStringize(PetscConcat(MatDenseCUDA, OP_NAME)) : PetscStringize(PetscConcat(MatDenseHIP, OP_NAME)); \
  }

  // clang-format off
  MatDenseCUPMComposedOpDecl(GetArray_C)
  MatDenseCUPMComposedOpDecl(GetArrayRead_C)
  MatDenseCUPMComposedOpDecl(GetArrayWrite_C)
  MatDenseCUPMComposedOpDecl(RestoreArray_C)
  MatDenseCUPMComposedOpDecl(RestoreArrayRead_C)
  MatDenseCUPMComposedOpDecl(RestoreArrayWrite_C)
  MatDenseCUPMComposedOpDecl(PlaceArray_C)
  MatDenseCUPMComposedOpDecl(ReplaceArray_C)
  MatDenseCUPMComposedOpDecl(ResetArray_C)
  MatDenseCUPMComposedOpDecl(SetPreallocation_C)
  // clang-format on

#undef MatDenseCUPMComposedOpDecl

  // backend-specific type names (MATSEQDENSECUDA vs MATSEQDENSEHIP etc.), selected at
  // compile time from T; definitions below
  PETSC_NODISCARD static constexpr MatType MATSEQDENSECUPM() noexcept;
  PETSC_NODISCARD static constexpr MatType MATMPIDENSECUPM() noexcept;
  PETSC_NODISCARD static constexpr MatType MATDENSECUPM() noexcept;
  PETSC_NODISCARD static constexpr MatSolverType MATSOLVERCUPM() noexcept;
};
92
93 // ==========================================================================================
94 // MatDense_CUPM_Base -- Public API
95 // ==========================================================================================
96
97 template <device::cupm::DeviceType T>
MATSEQDENSECUPM()98 inline constexpr MatType MatDense_CUPM_Base<T>::MATSEQDENSECUPM() noexcept
99 {
100 return T == device::cupm::DeviceType::CUDA ? MATSEQDENSECUDA : MATSEQDENSEHIP;
101 }
102
103 template <device::cupm::DeviceType T>
MATMPIDENSECUPM()104 inline constexpr MatType MatDense_CUPM_Base<T>::MATMPIDENSECUPM() noexcept
105 {
106 return T == device::cupm::DeviceType::CUDA ? MATMPIDENSECUDA : MATMPIDENSEHIP;
107 }
108
109 template <device::cupm::DeviceType T>
MATDENSECUPM()110 inline constexpr MatType MatDense_CUPM_Base<T>::MATDENSECUPM() noexcept
111 {
112 return T == device::cupm::DeviceType::CUDA ? MATDENSECUDA : MATDENSEHIP;
113 }
114
115 template <device::cupm::DeviceType T>
MATSOLVERCUPM()116 inline constexpr MatSolverType MatDense_CUPM_Base<T>::MATSOLVERCUPM() noexcept
117 {
118 return T == device::cupm::DeviceType::CUDA ? MATSOLVERCUDA : MATSOLVERHIP;
119 }
120
// Pulls the composed-op names and type-name helpers of MatDense_CUPM_Base (plus the
// matching CUPM vector types) into the scope of a deriving class. Needed because these are
// members of a dependent base class and would otherwise have to be qualified at every use
#define MATDENSECUPM_BASE_HEADER(T) \
  PETSC_CUPMOBJECT_HEADER(T); \
  using VecSeq_CUPM = ::Petsc::vec::cupm::impl::VecSeq_CUPM<T>; \
  using VecMPI_CUPM = ::Petsc::vec::cupm::impl::VecMPI_CUPM<T>; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATSEQDENSECUPM; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATMPIDENSECUPM; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATDENSECUPM; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATSOLVERCUPM; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArray_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayRead_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayWrite_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArray_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayRead_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayWrite_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMPlaceArray_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMReplaceArray_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMResetArray_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMSetPreallocation_C
139
// forward declare the concrete sequential and MPI implementation classes (presumably the
// CRTP "Derived" argument of MatDense_CUPM below -- their definitions are elsewhere)
template <device::cupm::DeviceType>
class MatDense_Seq_CUPM;
template <device::cupm::DeviceType>
class MatDense_MPI_CUPM;
145
// ==========================================================================================
// MatDense_CUPM
//
// The true "base" class for MatDenseCUPM. The reason MatDense_CUPM and MatDense_CUPM_Base
// exist is to separate out the CRTP code from the non-crtp code so that the generic functions
// can be called via templates below.
// ==========================================================================================

template <device::cupm::DeviceType T, typename Derived>
class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_CUPM : protected MatDense_CUPM_Base<T> {
private:
  // debug-only sanity check of a sequential matrix's layout (defined below)
  static PetscErrorCode CheckSaneSequentialMatSizes_(Mat) noexcept;

protected:
  MATDENSECUPM_BASE_HEADER(T);

  // RAII accessor over the matrix array for a given memory type and access mode; acquires
  // the array on construction and restores it on destruction (defined below)
  template <PetscMemType, PetscMemoryAccessMode>
  class MatrixArray;

  // Cast the Mat to its host struct, i.e. return the result of (Mat_SeqDense *)m->data
  template <typename U = Derived>
  PETSC_NODISCARD static constexpr auto MatIMPLCast(Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(U::MatIMPLCast_(m))
  // MatType of the concrete implementation (sequential or MPI), dispatched via CRTP
  PETSC_NODISCARD static constexpr MatType MATIMPLCUPM() noexcept;

  // common constructor core shared by MatCreateSeqDenseCUPM() and MatCreateMPIDenseCUPM()
  static PetscErrorCode CreateIMPLDenseCUPM(MPI_Comm, PetscInt, PetscInt, PetscInt, PetscInt, PetscScalar *, Mat *, PetscDeviceContext, bool) noexcept;
  static PetscErrorCode SetPreallocation(Mat, PetscDeviceContext, PetscScalar *) noexcept;

  // apply a unary functor in place to the locally owned diagonal entries of the matrix
  template <typename F>
  static PetscErrorCode DiagonalUnaryTransform(Mat, PetscDeviceContext, F &&) noexcept;

  static PetscErrorCode Shift(Mat, PetscScalar) noexcept;
  static PetscErrorCode GetDiagonal(Mat, Vec) noexcept;

  // scoped array accessors; each returns a MatrixArray with the corresponding memory type
  // and access mode
  PETSC_NODISCARD static auto DeviceArrayRead(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>{dctx, m})
  PETSC_NODISCARD static auto DeviceArrayWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>{dctx, m})
  PETSC_NODISCARD static auto DeviceArrayReadWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, m})
  PETSC_NODISCARD static auto HostArrayRead(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>{dctx, m})
  PETSC_NODISCARD static auto HostArrayWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>{dctx, m})
  PETSC_NODISCARD static auto HostArrayReadWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, m})
};
186
// ==========================================================================================
// MatDense_CUPM::MatrixArray
//
// RAII wrapper: the constructor fetches the matrix array (for the given memory type and
// access mode) and the destructor restores it.
// ==========================================================================================

template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_CUPM<T, D>::MatrixArray : public device::cupm::impl::RestoreableArray<T, MT, MA> {
  using base_type = device::cupm::impl::RestoreableArray<T, MT, MA>;

public:
  MatrixArray(PetscDeviceContext, Mat) noexcept;
  ~MatrixArray() noexcept;

  // must declare move constructor since we declare a destructor
  constexpr MatrixArray(MatrixArray &&) noexcept;

private:
  // the matrix whose array is held; set to nullptr when this object is moved-from
  Mat m_ = nullptr;
};
206
// ==========================================================================================
// MatDense_CUPM::MatrixArray -- Public API
// ==========================================================================================

// Acquire the array via the derived class's GetArray<MT, MA>(); constructors cannot return
// a PetscErrorCode so failure aborts
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
inline MatDense_CUPM<T, D>::MatrixArray<MT, MA>::MatrixArray(PetscDeviceContext dctx, Mat m) noexcept : base_type{dctx}, m_{m}
{
  PetscFunctionBegin;
  PetscCallAbort(PETSC_COMM_SELF, D::template GetArray<MT, MA>(m, &this->ptr_, dctx));
  PetscFunctionReturnVoid();
}
219
220 template <device::cupm::DeviceType T, typename D>
221 template <PetscMemType MT, PetscMemoryAccessMode MA>
~MatrixArray()222 inline MatDense_CUPM<T, D>::MatrixArray<MT, MA>::~MatrixArray() noexcept
223 {
224 PetscFunctionBegin;
225 PetscCallAbort(PETSC_COMM_SELF, D::template RestoreArray<MT, MA>(m_, &this->ptr_, this->dctx_));
226 PetscFunctionReturnVoid();
227 }
228
// Transfer ownership from other; other.m_ is exchanged to nullptr to mark it moved-from
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
inline constexpr MatDense_CUPM<T, D>::MatrixArray<MT, MA>::MatrixArray(MatrixArray &&other) noexcept : base_type{std::move(other)}, m_{util::exchange(other.m_, nullptr)}
{
}
234
235 // ==========================================================================================
236 // MatDense_CUPM -- Private API
237 // ==========================================================================================
238
239 template <device::cupm::DeviceType T, typename D>
CheckSaneSequentialMatSizes_(Mat A)240 inline PetscErrorCode MatDense_CUPM<T, D>::CheckSaneSequentialMatSizes_(Mat A) noexcept
241 {
242 PetscFunctionBegin;
243 if (PetscDefined(USE_DEBUG)) {
244 PetscBool isseq;
245
246 PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), D::MATSEQDENSECUPM(), &isseq));
247 if (isseq) {
248 // doing this check allows both sequential and parallel implementations to just pass in
249 // A, otherwise they would need to specify rstart, rend, and cols separately!
250 PetscCheck(A->rmap->rstart == 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Sequential matrix row start %" PetscInt_FMT " != 0?", A->rmap->rstart);
251 PetscCheck(A->rmap->rend == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Sequential matrix row end %" PetscInt_FMT " != number of rows %" PetscInt_FMT, A->rmap->rend, A->rmap->n);
252 PetscCheck(A->cmap->n == A->cmap->N, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Sequential matrix number of local columns %" PetscInt_FMT " != number of global columns %" PetscInt_FMT, A->cmap->n, A->cmap->n);
253 }
254 }
255 PetscFunctionReturn(PETSC_SUCCESS);
256 }
257
// ==========================================================================================
// MatDense_CUPM -- Protected API
// ==========================================================================================

// MatType of the concrete implementation -- the sequential class returns MATSEQDENSECUPM()
// and the MPI class MATMPIDENSECUPM() -- dispatched via CRTP to D::MATIMPLCUPM_()
template <device::cupm::DeviceType T, typename D>
inline constexpr MatType MatDense_CUPM<T, D>::MATIMPLCUPM() noexcept
{
  return D::MATIMPLCUPM_();
}
267
268 // Common core for MatCreateSeqDenseCUPM() and MatCreateMPIDenseCUPM()
269 template <device::cupm::DeviceType T, typename D>
CreateIMPLDenseCUPM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscScalar * data,Mat * A,PetscDeviceContext dctx,bool preallocate)270 inline PetscErrorCode MatDense_CUPM<T, D>::CreateIMPLDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscScalar *data, Mat *A, PetscDeviceContext dctx, bool preallocate) noexcept
271 {
272 Mat mat;
273
274 PetscFunctionBegin;
275 PetscAssertPointer(A, 7);
276 PetscCall(MatCreate(comm, &mat));
277 PetscCall(MatSetSizes(mat, m, n, M, N));
278 PetscCall(MatSetType(mat, D::MATIMPLCUPM()));
279 if (preallocate) {
280 PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
281 PetscCall(D::SetPreallocation(mat, dctx, data));
282 }
283 *A = mat;
284 PetscFunctionReturn(PETSC_SUCCESS);
285 }
286
// Set up the layouts of A and dispatch to the concrete (seq/MPI) preallocation routine.
// device_array, if non-NULL, is presumably adopted as the backing store (see the concrete
// SetPreallocation_()); dctx may be NULL, in which case the default context is substituted
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode MatDense_CUPM<T, D>::SetPreallocation(Mat A, PetscDeviceContext dctx, PetscScalar *device_array) noexcept
{
  PetscFunctionBegin;
  // cannot use PetscValidHeaderSpecificType(..., MATIMPLCUPM()) since the incoming matrix
  // might be the local (sequential) matrix of a MatMPIDense_CUPM. Since this would be called
  // from the MPI matrix's impl MATIMPLCUPM() would return MATMPIDENSECUPM().
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeNames(A, D::MATSEQDENSECUPM(), D::MATMPIDENSECUPM());
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
  PetscCall(D::SetPreallocation_(A, dctx, device_array));
  A->preallocated = PETSC_TRUE;
  A->assembled = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
304
namespace detail
{

// difference type of an iterator: newer CCCL (>= 3.0) provides the C++20-style
// cuda::std::iter_difference_t, older versions only have thrust::iterator_difference
#if CCCL_VERSION >= 3000000
template <class T>
using iter_difference_t = cuda::std::iter_difference_t<T>;
#else
template <class T>
using iter_difference_t = typename thrust::iterator_difference<T>::type;
#endif
315
// ==========================================================================================
// MatrixIteratorBase
//
// A base class for creating thrust iterators over the local sub-matrix. This will set up the
// proper iterator definitions so thrust knows how to handle things properly. Template
// parameters are as follows:
//
// - Iterator:
//   The type of the primary array iterator. Usually this is
//   thrust::device_pointer<PetscScalar>::iterator.
//
// - IndexFunctor:
//   This should be a functor which contains an operator() that when called with an index `i`,
//   returns the i'th permuted index into the array. For example, it could return the i'th
//   diagonal entry.
// ==========================================================================================
template <typename Iterator, typename IndexFunctor>
class MatrixIteratorBase {
public:
  using array_iterator_type = Iterator;
  using index_functor_type = IndexFunctor;

  using difference_type = iter_difference_t<array_iterator_type>;
  using CountingIterator = thrust::counting_iterator<difference_type>;
  using TransformIterator = thrust::transform_iterator<index_functor_type, CountingIterator>;
  using PermutationIterator = thrust::permutation_iterator<array_iterator_type, TransformIterator>;
  using iterator = PermutationIterator; // type of the begin/end iterator

  constexpr MatrixIteratorBase(array_iterator_type first, array_iterator_type last, index_functor_type idx_func) noexcept : first{std::move(first)}, last{std::move(last)}, func{std::move(idx_func)} { }

  // iterator over the permuted entries: element i dereferences to first[func(i)]
  PETSC_NODISCARD iterator begin() const noexcept
  {
    return PermutationIterator{
      first, TransformIterator{CountingIterator{0}, func}
    };
  }

protected:
  array_iterator_type first; // start of the underlying array span
  array_iterator_type last;  // end of the span; not used by begin() but kept so derived
                             // classes can compute their end()
  index_functor_type func;   // maps logical element index -> permuted array index
};
358
// ==========================================================================================
// StridedIndexFunctor
//
// Functor (not itself an iterator) which permutes a linear index range into strided matrix
// indices. Usually used to get the diagonal.
// ==========================================================================================
template <typename T>
struct StridedIndexFunctor {
  // map logical element index i to the array index of the i'th strided entry
  PETSC_NODISCARD PETSC_HOSTDEVICE_INLINE_DECL constexpr T operator()(const T &i) const noexcept { return stride * i; }

  T stride;
};
371
// Iterable view over the diagonal entries of a (sub-)matrix: a MatrixIteratorBase whose
// index functor strides through the underlying array
template <typename Iterator>
class DiagonalIterator : public MatrixIteratorBase<Iterator, StridedIndexFunctor<iter_difference_t<Iterator>>> {
public:
  using base_type = MatrixIteratorBase<Iterator, StridedIndexFunctor<iter_difference_t<Iterator>>>;

  using difference_type = typename base_type::difference_type;
  using iterator = typename base_type::iterator;

  // first/last delimit the underlying array span; stride is the distance between
  // consecutive diagonal entries
  constexpr DiagonalIterator(Iterator first, Iterator last, difference_type stride) noexcept : base_type{std::move(first), std::move(last), {stride}} { }

  // number of diagonal entries is ceil((last - first) / stride), computed with the usual
  // round-up integer division
  PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->last - this->first + this->func.stride - 1) / this->func.stride; }
};
384
// Build a DiagonalIterator over the locally owned diagonal entries. data is the local
// (column-major, leading dimension lda) array, rows [rstart, rend) are owned locally, and
// cols is the global column count. The i'th diagonal entry (rstart <= i < min(rend, cols))
// lives at local offset (i - rstart) + i * lda, so consecutive entries are lda + 1 apart
template <typename T>
inline DiagonalIterator<typename thrust::device_vector<T>::iterator> MakeDiagonalIterator(T *data, PetscInt rstart, PetscInt rend, PetscInt cols, PetscInt lda) noexcept
{
  // the diagonal ends at the last owned row or the last global column, whichever comes first
  const auto rend2 = std::min(rend, cols);
  // offset of the first local diagonal entry: (rstart - rstart) + rstart * lda
  const std::size_t begin = rstart * lda;
  // one past the last diagonal entry: (rend2 - 1 - rstart) + (rend2 - 1) * lda + (lda + 1)
  const std::size_t end = rend2 - rstart + rend2 * lda;
  const auto dptr = thrust::device_pointer_cast(data);

  return {dptr + begin, dptr + end, lda + 1};
}
395
396 } // namespace detail
397
// Apply functor in place to every locally owned diagonal entry of A, executing on the
// stream belonging to dctx. No-op when this rank owns no diagonal entries
template <device::cupm::DeviceType T, typename D>
template <typename F>
inline PetscErrorCode MatDense_CUPM<T, D>::DiagonalUnaryTransform(Mat A, PetscDeviceContext dctx, F &&functor) noexcept
{
  const auto rstart = A->rmap->rstart;
  const auto rend = A->rmap->rend;
  const auto gcols = A->cmap->N;
  // the local chunk of the diagonal ends at the last owned row or the last global column,
  // whichever comes first
  const auto rend2 = std::min(rend, gcols);

  PetscFunctionBegin;
  PetscCall(CheckSaneSequentialMatSizes_(A));
  if (rend2 > rstart) {
    const auto da = D::DeviceArrayReadWrite(dctx, A);
    cupmStream_t stream;
    PetscInt lda;

    PetscCall(MatDenseGetLDA(A, &lda));
    PetscCall(D::GetHandlesFrom_(dctx, &stream));
    {
      auto diagonal = detail::MakeDiagonalIterator(da.data(), rstart, rend, gcols, lda);

      // in-place: the diagonal range is both input and output of the transform
      // clang-format off
      PetscCallThrust(
        THRUST_CALL(
          thrust::transform,
          stream,
          diagonal.begin(), diagonal.end(), diagonal.begin(),
          std::forward<F>(functor)
        )
      );
      // clang-format on
    }
    // log one flop per transformed entry (the actual count depends on the functor)
    PetscCall(PetscLogGpuFlops(rend2 - rstart));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
434
// Implements MatShift() for dense CUPM matrices: adds alpha to each locally owned diagonal
// entry via DiagonalUnaryTransform()
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode MatDense_CUPM<T, D>::Shift(Mat A, PetscScalar alpha) noexcept
{
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(GetHandles_(&dctx));
  // make_plus_equals(alpha) yields a functor applying x -> x + alpha (see
  // cupmthrustutility.hpp)
  PetscCall(DiagonalUnaryTransform(A, dctx, device::cupm::functors::make_plus_equals(alpha)));
  PetscFunctionReturn(PETSC_SUCCESS);
}
445
// Implements MatGetDiagonal() for dense CUPM matrices: copies the locally owned diagonal
// entries of A into the (local) vector v on the device
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode MatDense_CUPM<T, D>::GetDiagonal(Mat A, Vec v) noexcept
{
  const auto rstart = A->rmap->rstart;
  const auto rend = A->rmap->rend;
  const auto gcols = A->cmap->N;
  PetscInt lda;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(CheckSaneSequentialMatSizes_(A));
  PetscCall(GetHandles_(&dctx));
  PetscCall(MatDenseGetLDA(A, &lda));
  {
    auto dv = VecSeq_CUPM::DeviceArrayWrite(dctx, v);
    auto da = D::DeviceArrayRead(dctx, A);
    auto diagonal = detail::MakeDiagonalIterator(da.data(), rstart, rend, gcols, lda);
    // We must have this cast outside of THRUST_CALL(). Without it, GCC 6.4 - 7.5, and 11.3.0
    // throw spurious warnings:
    //
    // warning: 'MatDense_CUPM<...>::GetDiagonal(Mat, Vec)::<lambda()>' declared with greater
    // visibility than the type of its field 'MatDense_CUPM<...>::GetDiagonal(Mat,
    // Vec)::<lambda()>::<dv capture>' [-Wattributes]
    //   460 |     PetscCallThrust(
    //       |     ^~~~~~~~~~~~~~~~
    auto dvp = thrust::device_pointer_cast(dv.data());
    cupmStream_t stream;

    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCallThrust(THRUST_CALL(thrust::copy, stream, diagonal.begin(), diagonal.end(), dvp));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
479
// Compose op_str (via PetscObjectComposeFunction()) on pobj, choosing op_host when use_host
// is true and the device implementation given in the variadic arguments otherwise
#define MatComposeOp_CUPM(use_host, pobj, op_str, op_host, ...) \
  do { \
    if (use_host) { \
      PetscCall(PetscObjectComposeFunction(pobj, op_str, op_host)); \
    } else { \
      PetscCall(PetscObjectComposeFunction(pobj, op_str, __VA_ARGS__)); \
    } \
  } while (0)

// Set the op_name entry of the matrix function table to op_host or the device
// implementation, depending on use_host
#define MatSetOp_CUPM(use_host, mat, op_name, op_host, ...) \
  do { \
    if (use_host) { \
      (mat)->ops->op_name = op_host; \
    } else { \
      (mat)->ops->op_name = __VA_ARGS__; \
    } \
  } while (0)

// Imports the full MatDense_CUPM API (on top of the base header) into a deriving class;
// __VA_ARGS__ is the deriving class itself (the CRTP "Derived" parameter)
#define MATDENSECUPM_HEADER(T, ...) \
  MATDENSECUPM_BASE_HEADER(T); \
  friend class ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::MatIMPLCast; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::MATIMPLCUPM; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::CreateIMPLDenseCUPM; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::SetPreallocation; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DeviceArrayRead; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DeviceArrayWrite; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DeviceArrayReadWrite; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::HostArrayRead; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::HostArrayWrite; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::HostArrayReadWrite; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DiagonalUnaryTransform; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::Shift; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::GetDiagonal
514
515 } // namespace impl
516
517 // ==========================================================================================
518 // MatDense_CUPM -- Implementations
519 // ==========================================================================================
520
521 template <device::cupm::DeviceType T, PetscMemoryAccessMode access>
MatDenseCUPMGetArray_Private(Mat A,PetscScalar ** array)522 inline PetscErrorCode MatDenseCUPMGetArray_Private(Mat A, PetscScalar **array) noexcept
523 {
524 PetscFunctionBegin;
525 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
526 PetscAssertPointer(array, 2);
527 switch (access) {
528 case PETSC_MEMORY_ACCESS_READ:
529 PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayRead_C(), (Mat, PetscScalar **), (A, array));
530 break;
531 case PETSC_MEMORY_ACCESS_WRITE:
532 PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayWrite_C(), (Mat, PetscScalar **), (A, array));
533 break;
534 case PETSC_MEMORY_ACCESS_READ_WRITE:
535 PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArray_C(), (Mat, PetscScalar **), (A, array));
536 break;
537 }
538 if (PetscMemoryAccessWrite(access)) PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
539 PetscFunctionReturn(PETSC_SUCCESS);
540 }
541
542 template <device::cupm::DeviceType T, PetscMemoryAccessMode access>
MatDenseCUPMRestoreArray_Private(Mat A,PetscScalar ** array)543 inline PetscErrorCode MatDenseCUPMRestoreArray_Private(Mat A, PetscScalar **array) noexcept
544 {
545 PetscFunctionBegin;
546 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
547 if (array) PetscAssertPointer(array, 2);
548 switch (access) {
549 case PETSC_MEMORY_ACCESS_READ:
550 PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayRead_C(), (Mat, PetscScalar **), (A, array));
551 break;
552 case PETSC_MEMORY_ACCESS_WRITE:
553 PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayWrite_C(), (Mat, PetscScalar **), (A, array));
554 break;
555 case PETSC_MEMORY_ACCESS_READ_WRITE:
556 PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArray_C(), (Mat, PetscScalar **), (A, array));
557 break;
558 }
559 if (PetscMemoryAccessWrite(access)) {
560 PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
561 A->offloadmask = PETSC_OFFLOAD_GPU;
562 }
563 if (array) *array = nullptr;
564 PetscFunctionReturn(PETSC_SUCCESS);
565 }
566
// MatDenseCUPMGetArray() -- read-write access to the backend array of A; the object state
// is bumped (in the private helper) since the caller may modify the data
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMGetArray(Mat A, PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUPMGetArray_Private<T, PETSC_MEMORY_ACCESS_READ_WRITE>(A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
574
// MatDenseCUPMGetArrayRead() -- read-only access to the backend array of A; the const_cast
// merely funnels through the common helper, the READ path does not bump the object state
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMGetArrayRead(Mat A, const PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUPMGetArray_Private<T, PETSC_MEMORY_ACCESS_READ>(A, const_cast<PetscScalar **>(array)));
  PetscFunctionReturn(PETSC_SUCCESS);
}
582
// MatDenseCUPMGetArrayWrite() -- write-only access to the backend array of A (previous
// contents need not be valid); bumps the object state
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMGetArrayWrite(Mat A, PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUPMGetArray_Private<T, PETSC_MEMORY_ACCESS_WRITE>(A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
590
// MatDenseCUPMRestoreArray() -- return access obtained with MatDenseCUPMGetArray(); NULLs
// *array and marks the device copy current
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMRestoreArray(Mat A, PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUPMRestoreArray_Private<T, PETSC_MEMORY_ACCESS_READ_WRITE>(A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
598
// MatDenseCUPMRestoreArrayRead() -- return access obtained with MatDenseCUPMGetArrayRead();
// NULLs *array but does not bump the object state (the data was not modified)
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMRestoreArrayRead(Mat A, const PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUPMRestoreArray_Private<T, PETSC_MEMORY_ACCESS_READ>(A, const_cast<PetscScalar **>(array)));
  PetscFunctionReturn(PETSC_SUCCESS);
}
606
// MatDenseCUPMRestoreArrayWrite() -- return access obtained with
// MatDenseCUPMGetArrayWrite(); NULLs *array and marks the device copy current
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMRestoreArrayWrite(Mat A, PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUPMRestoreArray_Private<T, PETSC_MEMORY_ACCESS_WRITE>(A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
614
// MatDenseCUPMPlaceArray() -- temporarily substitute user-provided memory as the matrix's
// backend array (dispatched to the composed PlaceArray op); bumps the object state and
// marks the device copy authoritative
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMPlaceArray(Mat A, const PetscScalar *array) noexcept
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMPlaceArray_C(), (Mat, const PetscScalar *), (A, array));
  PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
625
// MatDenseCUPMReplaceArray() -- permanently replace the matrix's backend array with
// user-provided memory (dispatched to the composed ReplaceArray op); bumps the object state
// and marks the device copy authoritative
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMReplaceArray(Mat A, const PetscScalar *array) noexcept
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMReplaceArray_C(), (Mat, const PetscScalar *), (A, array));
  PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
636
// MatDenseCUPMResetArray() -- restore the matrix's own backend array after a previous
// MatDenseCUPMPlaceArray() (dispatched to the composed ResetArray op); bumps the object
// state
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMResetArray(Mat A) noexcept
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMResetArray_C(), (Mat), (A));
  PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
  PetscFunctionReturn(PETSC_SUCCESS);
}
646
// MatDenseCUPMSetPreallocation() -- preallocate the matrix's backend storage, optionally
// adopting device_data as the backing array (dispatched to the composed SetPreallocation
// op). dctx may be NULL (the default); the implementation substitutes a default context
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMSetPreallocation(Mat A, PetscScalar *device_data, PetscDeviceContext dctx = nullptr) noexcept
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMSetPreallocation_C(), (Mat, PetscDeviceContext, PetscScalar *), (A, dctx, device_data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
655
656 } // namespace cupm
657
658 } // namespace mat
659
660 } // namespace Petsc
661
662 #endif // __cplusplus
663