1054def41SToby Isaac #pragma once
2054def41SToby Isaac
3054def41SToby Isaac #include "sfcupm.hpp"
4054def41SToby Isaac #include <../src/sys/objects/device/impls/cupm/kernels.hpp>
5054def41SToby Isaac #include <petsc/private/cupmatomics.hpp>
6054def41SToby Isaac
7054def41SToby Isaac namespace Petsc
8054def41SToby Isaac {
9054def41SToby Isaac
10054def41SToby Isaac namespace sf
11054def41SToby Isaac {
12054def41SToby Isaac
13054def41SToby Isaac namespace cupm
14054def41SToby Isaac {
15054def41SToby Isaac
16054def41SToby Isaac namespace kernels
17054def41SToby Isaac {
18054def41SToby Isaac
19054def41SToby Isaac /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
MapTidToIndex(const PetscInt * opt,PetscInt tid)20054def41SToby Isaac PETSC_NODISCARD static PETSC_DEVICE_INLINE_DECL PetscInt MapTidToIndex(const PetscInt *opt, PetscInt tid) noexcept
21054def41SToby Isaac {
22054def41SToby Isaac PetscInt i, j, k, m, n, r;
23054def41SToby Isaac const PetscInt *offset, *start, *dx, *dy, *X, *Y;
24054def41SToby Isaac
25054def41SToby Isaac n = opt[0];
26054def41SToby Isaac offset = opt + 1;
27054def41SToby Isaac start = opt + n + 2;
28054def41SToby Isaac dx = opt + 2 * n + 2;
29054def41SToby Isaac dy = opt + 3 * n + 2;
30054def41SToby Isaac X = opt + 5 * n + 2;
31054def41SToby Isaac Y = opt + 6 * n + 2;
32054def41SToby Isaac for (r = 0; r < n; r++) {
33054def41SToby Isaac if (tid < offset[r + 1]) break;
34054def41SToby Isaac }
35054def41SToby Isaac m = (tid - offset[r]);
36054def41SToby Isaac k = m / (dx[r] * dy[r]);
37054def41SToby Isaac j = (m - k * dx[r] * dy[r]) / dx[r];
38054def41SToby Isaac i = m - k * dx[r] * dy[r] - j * dx[r];
39054def41SToby Isaac
404ad8454bSPierre Jolivet return start[r] + k * X[r] * Y[r] + j * X[r] + i;
41054def41SToby Isaac }
42054def41SToby Isaac
43054def41SToby Isaac /*====================================================================================*/
44054def41SToby Isaac /* Templated CUPM kernels for pack/unpack. The Op can be regular or atomic */
45054def41SToby Isaac /*====================================================================================*/
46054def41SToby Isaac
47054def41SToby Isaac /* Suppose user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals, then
48054def41SToby Isaac <Type> is PetscReal, which is the primitive type we operate on.
49054def41SToby Isaac <bs> is 16, which says <unit> contains 16 primitive types.
50054def41SToby Isaac <BS> is 8, which is the maximal SIMD width we will try to vectorize operations on <unit>.
51054def41SToby Isaac <EQ> is 0, which is (bs == BS ? 1 : 0)
52054def41SToby Isaac
53054def41SToby Isaac If instead, <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below to a compile time constant.
54054def41SToby Isaac For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, the inner for-loops below will be totally unrolled.
55054def41SToby Isaac */
56054def41SToby Isaac template <class Type, PetscInt BS, PetscInt EQ>
d_Pack(PetscInt bs,PetscInt count,PetscInt start,const PetscInt * opt,const PetscInt * idx,const Type * data,Type * buf)57054def41SToby Isaac PETSC_KERNEL_DECL static void d_Pack(PetscInt bs, PetscInt count, PetscInt start, const PetscInt *opt, const PetscInt *idx, const Type *data, Type *buf)
58054def41SToby Isaac {
59054def41SToby Isaac const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */
60054def41SToby Isaac const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */
61054def41SToby Isaac
62054def41SToby Isaac ::Petsc::device::cupm::kernels::util::grid_stride_1D(count, [&](PetscInt tid) {
63054def41SToby Isaac PetscInt t = (opt ? MapTidToIndex(opt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
64054def41SToby Isaac PetscInt s = tid * MBS;
65054def41SToby Isaac for (PetscInt i = 0; i < MBS; i++) buf[s + i] = data[t + i];
66054def41SToby Isaac });
67054def41SToby Isaac }
68054def41SToby Isaac
69054def41SToby Isaac template <class Type, class Op, PetscInt BS, PetscInt EQ>
d_UnpackAndOp(PetscInt bs,PetscInt count,PetscInt start,const PetscInt * opt,const PetscInt * idx,Type * data,const Type * buf)70054def41SToby Isaac PETSC_KERNEL_DECL static void d_UnpackAndOp(PetscInt bs, PetscInt count, PetscInt start, const PetscInt *opt, const PetscInt *idx, Type *data, const Type *buf)
71054def41SToby Isaac {
72054def41SToby Isaac const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
73054def41SToby Isaac Op op;
74054def41SToby Isaac
75054def41SToby Isaac ::Petsc::device::cupm::kernels::util::grid_stride_1D(count, [&](PetscInt tid) {
76054def41SToby Isaac PetscInt t = (opt ? MapTidToIndex(opt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
77054def41SToby Isaac PetscInt s = tid * MBS;
78054def41SToby Isaac for (PetscInt i = 0; i < MBS; i++) op(data[t + i], buf[s + i]);
79054def41SToby Isaac });
80054def41SToby Isaac }
81054def41SToby Isaac
82054def41SToby Isaac template <class Type, class Op, PetscInt BS, PetscInt EQ>
d_FetchAndOp(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt * rootopt,const PetscInt * rootidx,Type * rootdata,Type * leafbuf)83054def41SToby Isaac PETSC_KERNEL_DECL static void d_FetchAndOp(PetscInt bs, PetscInt count, PetscInt rootstart, const PetscInt *rootopt, const PetscInt *rootidx, Type *rootdata, Type *leafbuf)
84054def41SToby Isaac {
85054def41SToby Isaac const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
86054def41SToby Isaac Op op;
87054def41SToby Isaac
88054def41SToby Isaac ::Petsc::device::cupm::kernels::util::grid_stride_1D(count, [&](PetscInt tid) {
89054def41SToby Isaac PetscInt r = (rootopt ? MapTidToIndex(rootopt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS;
90054def41SToby Isaac PetscInt l = tid * MBS;
91054def41SToby Isaac for (PetscInt i = 0; i < MBS; i++) leafbuf[l + i] = op(rootdata[r + i], leafbuf[l + i]);
92054def41SToby Isaac });
93054def41SToby Isaac }
94054def41SToby Isaac
95054def41SToby Isaac template <class Type, class Op, PetscInt BS, PetscInt EQ>
d_ScatterAndOp(PetscInt bs,PetscInt count,PetscInt srcx,PetscInt srcy,PetscInt srcX,PetscInt srcY,PetscInt srcStart,const PetscInt * srcIdx,const Type * src,PetscInt dstx,PetscInt dsty,PetscInt dstX,PetscInt dstY,PetscInt dstStart,const PetscInt * dstIdx,Type * dst)96054def41SToby Isaac PETSC_KERNEL_DECL static void d_ScatterAndOp(PetscInt bs, PetscInt count, PetscInt srcx, PetscInt srcy, PetscInt srcX, PetscInt srcY, PetscInt srcStart, const PetscInt *srcIdx, const Type *src, PetscInt dstx, PetscInt dsty, PetscInt dstX, PetscInt dstY, PetscInt dstStart, const PetscInt *dstIdx, Type *dst)
97054def41SToby Isaac {
98054def41SToby Isaac const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
99054def41SToby Isaac Op op;
100054def41SToby Isaac
101054def41SToby Isaac ::Petsc::device::cupm::kernels::util::grid_stride_1D(count, [&](PetscInt tid) {
102054def41SToby Isaac PetscInt s, t;
103054def41SToby Isaac
104054def41SToby Isaac if (!srcIdx) { /* src is either contiguous or 3D */
105054def41SToby Isaac PetscInt k = tid / (srcx * srcy);
106054def41SToby Isaac PetscInt j = (tid - k * srcx * srcy) / srcx;
107054def41SToby Isaac PetscInt i = tid - k * srcx * srcy - j * srcx;
108054def41SToby Isaac
109054def41SToby Isaac s = srcStart + k * srcX * srcY + j * srcX + i;
110054def41SToby Isaac } else {
111054def41SToby Isaac s = srcIdx[tid];
112054def41SToby Isaac }
113054def41SToby Isaac
114054def41SToby Isaac if (!dstIdx) { /* dst is either contiguous or 3D */
115054def41SToby Isaac PetscInt k = tid / (dstx * dsty);
116054def41SToby Isaac PetscInt j = (tid - k * dstx * dsty) / dstx;
117054def41SToby Isaac PetscInt i = tid - k * dstx * dsty - j * dstx;
118054def41SToby Isaac
119054def41SToby Isaac t = dstStart + k * dstX * dstY + j * dstX + i;
120054def41SToby Isaac } else {
121054def41SToby Isaac t = dstIdx[tid];
122054def41SToby Isaac }
123054def41SToby Isaac
124054def41SToby Isaac s *= MBS;
125054def41SToby Isaac t *= MBS;
126054def41SToby Isaac for (PetscInt i = 0; i < MBS; i++) op(dst[t + i], src[s + i]);
127054def41SToby Isaac });
128054def41SToby Isaac }
129054def41SToby Isaac
130054def41SToby Isaac template <class Type, class Op, PetscInt BS, PetscInt EQ>
d_FetchAndOpLocal(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt * rootopt,const PetscInt * rootidx,Type * rootdata,PetscInt leafstart,const PetscInt * leafopt,const PetscInt * leafidx,const Type * leafdata,Type * leafupdate)131054def41SToby Isaac PETSC_KERNEL_DECL static void d_FetchAndOpLocal(PetscInt bs, PetscInt count, PetscInt rootstart, const PetscInt *rootopt, const PetscInt *rootidx, Type *rootdata, PetscInt leafstart, const PetscInt *leafopt, const PetscInt *leafidx, const Type *leafdata, Type *leafupdate)
132054def41SToby Isaac {
133054def41SToby Isaac const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
134054def41SToby Isaac Op op;
135054def41SToby Isaac
136054def41SToby Isaac ::Petsc::device::cupm::kernels::util::grid_stride_1D(count, [&](PetscInt tid) {
137054def41SToby Isaac PetscInt r = (rootopt ? MapTidToIndex(rootopt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS;
138054def41SToby Isaac PetscInt l = (leafopt ? MapTidToIndex(leafopt, tid) : (leafidx ? leafidx[tid] : leafstart + tid)) * MBS;
139054def41SToby Isaac for (PetscInt i = 0; i < MBS; i++) leafupdate[l + i] = op(rootdata[r + i], leafdata[l + i]);
140054def41SToby Isaac });
141054def41SToby Isaac }
142054def41SToby Isaac
143054def41SToby Isaac /*====================================================================================*/
144054def41SToby Isaac /* Regular operations on device */
145054def41SToby Isaac /*====================================================================================*/
146054def41SToby Isaac template <typename Type>
147054def41SToby Isaac struct Insert {
operator ()Petsc::sf::cupm::kernels::Insert148054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
149054def41SToby Isaac {
150054def41SToby Isaac Type old = x;
151054def41SToby Isaac x = y;
152054def41SToby Isaac return old;
153054def41SToby Isaac }
154054def41SToby Isaac };
155054def41SToby Isaac template <typename Type>
156054def41SToby Isaac struct Add {
operator ()Petsc::sf::cupm::kernels::Add157054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
158054def41SToby Isaac {
159054def41SToby Isaac Type old = x;
160054def41SToby Isaac x += y;
161054def41SToby Isaac return old;
162054def41SToby Isaac }
163054def41SToby Isaac };
164054def41SToby Isaac template <typename Type>
165054def41SToby Isaac struct Mult {
operator ()Petsc::sf::cupm::kernels::Mult166054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
167054def41SToby Isaac {
168054def41SToby Isaac Type old = x;
169054def41SToby Isaac x *= y;
170054def41SToby Isaac return old;
171054def41SToby Isaac }
172054def41SToby Isaac };
173054def41SToby Isaac template <typename Type>
174054def41SToby Isaac struct Min {
operator ()Petsc::sf::cupm::kernels::Min175054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
176054def41SToby Isaac {
177054def41SToby Isaac Type old = x;
178054def41SToby Isaac x = PetscMin(x, y);
179054def41SToby Isaac return old;
180054def41SToby Isaac }
181054def41SToby Isaac };
182054def41SToby Isaac template <typename Type>
183054def41SToby Isaac struct Max {
operator ()Petsc::sf::cupm::kernels::Max184054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
185054def41SToby Isaac {
186054def41SToby Isaac Type old = x;
187054def41SToby Isaac x = PetscMax(x, y);
188054def41SToby Isaac return old;
189054def41SToby Isaac }
190054def41SToby Isaac };
191054def41SToby Isaac template <typename Type>
192054def41SToby Isaac struct LAND {
operator ()Petsc::sf::cupm::kernels::LAND193054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
194054def41SToby Isaac {
195054def41SToby Isaac Type old = x;
196054def41SToby Isaac x = x && y;
197054def41SToby Isaac return old;
198054def41SToby Isaac }
199054def41SToby Isaac };
200054def41SToby Isaac template <typename Type>
201054def41SToby Isaac struct LOR {
operator ()Petsc::sf::cupm::kernels::LOR202054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
203054def41SToby Isaac {
204054def41SToby Isaac Type old = x;
205054def41SToby Isaac x = x || y;
206054def41SToby Isaac return old;
207054def41SToby Isaac }
208054def41SToby Isaac };
209054def41SToby Isaac template <typename Type>
210054def41SToby Isaac struct LXOR {
operator ()Petsc::sf::cupm::kernels::LXOR211054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
212054def41SToby Isaac {
213054def41SToby Isaac Type old = x;
214054def41SToby Isaac x = !x != !y;
215054def41SToby Isaac return old;
216054def41SToby Isaac }
217054def41SToby Isaac };
218054def41SToby Isaac template <typename Type>
219054def41SToby Isaac struct BAND {
operator ()Petsc::sf::cupm::kernels::BAND220054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
221054def41SToby Isaac {
222054def41SToby Isaac Type old = x;
223054def41SToby Isaac x = x & y;
224054def41SToby Isaac return old;
225054def41SToby Isaac }
226054def41SToby Isaac };
227054def41SToby Isaac template <typename Type>
228054def41SToby Isaac struct BOR {
operator ()Petsc::sf::cupm::kernels::BOR229054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
230054def41SToby Isaac {
231054def41SToby Isaac Type old = x;
232054def41SToby Isaac x = x | y;
233054def41SToby Isaac return old;
234054def41SToby Isaac }
235054def41SToby Isaac };
236054def41SToby Isaac template <typename Type>
237054def41SToby Isaac struct BXOR {
operator ()Petsc::sf::cupm::kernels::BXOR238054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
239054def41SToby Isaac {
240054def41SToby Isaac Type old = x;
241054def41SToby Isaac x = x ^ y;
242054def41SToby Isaac return old;
243054def41SToby Isaac }
244054def41SToby Isaac };
245054def41SToby Isaac template <typename Type>
246054def41SToby Isaac struct Minloc {
operator ()Petsc::sf::cupm::kernels::Minloc247054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
248054def41SToby Isaac {
249054def41SToby Isaac Type old = x;
250054def41SToby Isaac if (y.a < x.a) x = y;
251054def41SToby Isaac else if (y.a == x.a) x.b = min(x.b, y.b);
252054def41SToby Isaac return old;
253054def41SToby Isaac }
254054def41SToby Isaac };
255054def41SToby Isaac template <typename Type>
256054def41SToby Isaac struct Maxloc {
operator ()Petsc::sf::cupm::kernels::Maxloc257054def41SToby Isaac PETSC_DEVICE_DECL Type operator()(Type &x, Type y) const
258054def41SToby Isaac {
259054def41SToby Isaac Type old = x;
260054def41SToby Isaac if (y.a > x.a) x = y;
261054def41SToby Isaac else if (y.a == x.a) x.b = min(x.b, y.b); /* See MPI MAXLOC */
262054def41SToby Isaac return old;
263054def41SToby Isaac }
264054def41SToby Isaac };
265054def41SToby Isaac
266054def41SToby Isaac } // namespace kernels
267054def41SToby Isaac
268054def41SToby Isaac namespace impl
269054def41SToby Isaac {
270054def41SToby Isaac
271054def41SToby Isaac /*====================================================================================*/
272054def41SToby Isaac /* Wrapper functions of cupm kernels. Function pointers are stored in 'link' */
273054def41SToby Isaac /*====================================================================================*/
274054def41SToby Isaac template <device::cupm::DeviceType T>
275054def41SToby Isaac template <typename Type, PetscInt BS, PetscInt EQ>
Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt * idx,const void * data,void * buf)276054def41SToby Isaac inline PetscErrorCode SfInterface<T>::Pack(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *data, void *buf) noexcept
277054def41SToby Isaac {
278054def41SToby Isaac const PetscInt *iarray = opt ? opt->array : NULL;
279054def41SToby Isaac
280054def41SToby Isaac PetscFunctionBegin;
281054def41SToby Isaac if (!count) PetscFunctionReturn(PETSC_SUCCESS);
282054def41SToby Isaac if (PetscDefined(USING_NVCC) && !opt && !idx) { /* It is a 'CUDA data to nvshmem buf' memory copy */
283054def41SToby Isaac PetscCallCUPM(cupmMemcpyAsync(buf, (char *)data + start * link->unitbytes, count * link->unitbytes, cupmMemcpyDeviceToDevice, link->stream));
284054def41SToby Isaac } else {
285054def41SToby Isaac PetscCall(PetscCUPMLaunchKernel1D(count, 0, link->stream, kernels::d_Pack<Type, BS, EQ>, link->bs, count, start, iarray, idx, (const Type *)data, (Type *)buf));
286054def41SToby Isaac }
287054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
288054def41SToby Isaac }
289054def41SToby Isaac
290054def41SToby Isaac template <device::cupm::DeviceType T>
291054def41SToby Isaac template <typename Type, class Op, PetscInt BS, PetscInt EQ>
UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt * idx,void * data,const void * buf)292054def41SToby Isaac inline PetscErrorCode SfInterface<T>::UnpackAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, const void *buf) noexcept
293054def41SToby Isaac {
294054def41SToby Isaac const PetscInt *iarray = opt ? opt->array : NULL;
295054def41SToby Isaac
296054def41SToby Isaac PetscFunctionBegin;
297054def41SToby Isaac if (!count) PetscFunctionReturn(PETSC_SUCCESS);
298054def41SToby Isaac if (PetscDefined(USING_NVCC) && std::is_same<Op, kernels::Insert<Type>>::value && !opt && !idx) { /* It is a 'nvshmem buf to CUDA data' memory copy */
299054def41SToby Isaac PetscCallCUPM(cupmMemcpyAsync((char *)data + start * link->unitbytes, buf, count * link->unitbytes, cupmMemcpyDeviceToDevice, link->stream));
300054def41SToby Isaac } else {
301054def41SToby Isaac PetscCall(PetscCUPMLaunchKernel1D(count, 0, link->stream, kernels::d_UnpackAndOp<Type, Op, BS, EQ>, link->bs, count, start, iarray, idx, (Type *)data, (const Type *)buf));
302054def41SToby Isaac }
303054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
304054def41SToby Isaac }
305054def41SToby Isaac
306054def41SToby Isaac template <device::cupm::DeviceType T>
307054def41SToby Isaac template <typename Type, class Op, PetscInt BS, PetscInt EQ>
FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt * idx,void * data,void * buf)308054def41SToby Isaac inline PetscErrorCode SfInterface<T>::FetchAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, void *buf) noexcept
309054def41SToby Isaac {
310054def41SToby Isaac const PetscInt *iarray = opt ? opt->array : NULL;
311054def41SToby Isaac
312054def41SToby Isaac PetscFunctionBegin;
313054def41SToby Isaac if (!count) PetscFunctionReturn(PETSC_SUCCESS);
314054def41SToby Isaac PetscCall(PetscCUPMLaunchKernel1D(count, 0, link->stream, kernels::d_FetchAndOp<Type, Op, BS, EQ>, link->bs, count, start, iarray, idx, (Type *)data, (const Type *)buf));
315054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
316054def41SToby Isaac }
317054def41SToby Isaac
318054def41SToby Isaac template <device::cupm::DeviceType T>
319054def41SToby Isaac template <typename Type, class Op, PetscInt BS, PetscInt EQ>
ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt * srcIdx,const void * src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt * dstIdx,void * dst)320054def41SToby Isaac inline PetscErrorCode SfInterface<T>::ScatterAndOp(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) noexcept
321054def41SToby Isaac {
322054def41SToby Isaac PetscInt nthreads = 256;
323054def41SToby Isaac PetscInt nblocks = (count + nthreads - 1) / nthreads;
324054def41SToby Isaac PetscInt srcx = 0, srcy = 0, srcX = 0, srcY = 0, dstx = 0, dsty = 0, dstX = 0, dstY = 0;
325054def41SToby Isaac
326054def41SToby Isaac PetscFunctionBegin;
327054def41SToby Isaac if (!count) PetscFunctionReturn(PETSC_SUCCESS);
328054def41SToby Isaac nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
329054def41SToby Isaac
330054def41SToby Isaac /* The 3D shape of source subdomain may be different than that of the destination, which makes it difficult to use 3D grid and block */
331054def41SToby Isaac if (srcOpt) {
332054def41SToby Isaac srcx = srcOpt->dx[0];
333054def41SToby Isaac srcy = srcOpt->dy[0];
334054def41SToby Isaac srcX = srcOpt->X[0];
335054def41SToby Isaac srcY = srcOpt->Y[0];
336054def41SToby Isaac srcStart = srcOpt->start[0];
337054def41SToby Isaac srcIdx = NULL;
338054def41SToby Isaac } else if (!srcIdx) {
339054def41SToby Isaac srcx = srcX = count;
340054def41SToby Isaac srcy = srcY = 1;
341054def41SToby Isaac }
342054def41SToby Isaac
343054def41SToby Isaac if (dstOpt) {
344054def41SToby Isaac dstx = dstOpt->dx[0];
345054def41SToby Isaac dsty = dstOpt->dy[0];
346054def41SToby Isaac dstX = dstOpt->X[0];
347054def41SToby Isaac dstY = dstOpt->Y[0];
348054def41SToby Isaac dstStart = dstOpt->start[0];
349054def41SToby Isaac dstIdx = NULL;
350054def41SToby Isaac } else if (!dstIdx) {
351054def41SToby Isaac dstx = dstX = count;
352054def41SToby Isaac dsty = dstY = 1;
353054def41SToby Isaac }
354054def41SToby Isaac
355054def41SToby Isaac PetscCall(PetscCUPMLaunchKernel1D(count, 0, link->stream, kernels::d_ScatterAndOp<Type, Op, BS, EQ>, link->bs, count, srcx, srcy, srcX, srcY, srcStart, srcIdx, (const Type *)src, dstx, dsty, dstX, dstY, dstStart, dstIdx, (Type *)dst));
356054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
357054def41SToby Isaac }
358054def41SToby Isaac
359054def41SToby Isaac template <device::cupm::DeviceType T>
360054def41SToby Isaac /* Specialization for Insert since we may use cupmMemcpyAsync */
361054def41SToby Isaac template <typename Type, PetscInt BS, PetscInt EQ>
ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt * srcIdx,const void * src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt * dstIdx,void * dst)362054def41SToby Isaac inline PetscErrorCode SfInterface<T>::ScatterAndInsert(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) noexcept
363054def41SToby Isaac {
364054def41SToby Isaac PetscFunctionBegin;
365054def41SToby Isaac if (!count) PetscFunctionReturn(PETSC_SUCCESS);
366054def41SToby Isaac /*src and dst are contiguous */
367054def41SToby Isaac if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
368054def41SToby Isaac PetscCallCUPM(cupmMemcpyAsync((Type *)dst + dstStart * link->bs, (const Type *)src + srcStart * link->bs, count * link->unitbytes, cupmMemcpyDeviceToDevice, link->stream));
369054def41SToby Isaac } else {
370054def41SToby Isaac PetscCall(ScatterAndOp<Type, kernels::Insert<Type>, BS, EQ>(link, count, srcStart, srcOpt, srcIdx, src, dstStart, dstOpt, dstIdx, dst));
371054def41SToby Isaac }
372054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
373054def41SToby Isaac }
374054def41SToby Isaac
375054def41SToby Isaac template <device::cupm::DeviceType T>
376054def41SToby Isaac template <typename Type, class Op, PetscInt BS, PetscInt EQ>
FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt * rootidx,void * rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt * leafidx,const void * leafdata,void * leafupdate)377054def41SToby Isaac inline PetscErrorCode SfInterface<T>::FetchAndOpLocal(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) noexcept
378054def41SToby Isaac {
379054def41SToby Isaac const PetscInt *rarray = rootopt ? rootopt->array : NULL;
380054def41SToby Isaac const PetscInt *larray = leafopt ? leafopt->array : NULL;
381054def41SToby Isaac
382054def41SToby Isaac PetscFunctionBegin;
383054def41SToby Isaac if (!count) PetscFunctionReturn(PETSC_SUCCESS);
384054def41SToby Isaac PetscCall(PetscCUPMLaunchKernel1D(count, 0, link->stream, kernels::d_FetchAndOpLocal<Type, Op, BS, EQ>, link->bs, count, rootstart, rarray, rootidx, (Type *)rootdata, leafstart, larray, leafidx, (const Type *)leafdata, (Type *)leafupdate));
385054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
386054def41SToby Isaac }
387054def41SToby Isaac
388054def41SToby Isaac /*====================================================================================*/
389054def41SToby Isaac /* Init various types and instantiate pack/unpack function pointers */
390054def41SToby Isaac /*====================================================================================*/
391054def41SToby Isaac template <device::cupm::DeviceType T>
392054def41SToby Isaac template <typename Type, PetscInt BS, PetscInt EQ>
PackInit_RealType(PetscSFLink link)393054def41SToby Isaac inline void SfInterface<T>::PackInit_RealType(PetscSFLink link) noexcept
394054def41SToby Isaac {
395054def41SToby Isaac /* Pack/unpack for remote communication */
396054def41SToby Isaac link->d_Pack = Pack<Type, BS, EQ>;
397054def41SToby Isaac link->d_UnpackAndInsert = UnpackAndOp<Type, kernels::Insert<Type>, BS, EQ>;
398054def41SToby Isaac link->d_UnpackAndAdd = UnpackAndOp<Type, kernels::Add<Type>, BS, EQ>;
399054def41SToby Isaac link->d_UnpackAndMult = UnpackAndOp<Type, kernels::Mult<Type>, BS, EQ>;
400054def41SToby Isaac link->d_UnpackAndMin = UnpackAndOp<Type, kernels::Min<Type>, BS, EQ>;
401054def41SToby Isaac link->d_UnpackAndMax = UnpackAndOp<Type, kernels::Max<Type>, BS, EQ>;
402054def41SToby Isaac link->d_FetchAndAdd = FetchAndOp<Type, kernels::Add<Type>, BS, EQ>;
403054def41SToby Isaac
404054def41SToby Isaac /* Scatter for local communication */
405054def41SToby Isaac link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>; /* Has special optimizations */
406054def41SToby Isaac link->d_ScatterAndAdd = ScatterAndOp<Type, kernels::Add<Type>, BS, EQ>;
407054def41SToby Isaac link->d_ScatterAndMult = ScatterAndOp<Type, kernels::Mult<Type>, BS, EQ>;
408054def41SToby Isaac link->d_ScatterAndMin = ScatterAndOp<Type, kernels::Min<Type>, BS, EQ>;
409054def41SToby Isaac link->d_ScatterAndMax = ScatterAndOp<Type, kernels::Max<Type>, BS, EQ>;
410054def41SToby Isaac link->d_FetchAndAddLocal = FetchAndOpLocal<Type, kernels::Add<Type>, BS, EQ>;
411054def41SToby Isaac
412054def41SToby Isaac /* Atomic versions when there are data-race possibilities */
413054def41SToby Isaac link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
414054def41SToby Isaac link->da_UnpackAndAdd = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
415054def41SToby Isaac link->da_UnpackAndMult = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
416054def41SToby Isaac link->da_UnpackAndMin = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
417054def41SToby Isaac link->da_UnpackAndMax = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
418054def41SToby Isaac link->da_FetchAndAdd = FetchAndOp<Type, AtomicAdd<Type>, BS, EQ>;
419054def41SToby Isaac
420054def41SToby Isaac link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
421054def41SToby Isaac link->da_ScatterAndAdd = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
422054def41SToby Isaac link->da_ScatterAndMult = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
423054def41SToby Isaac link->da_ScatterAndMin = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
424054def41SToby Isaac link->da_ScatterAndMax = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
425054def41SToby Isaac link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicAdd<Type>, BS, EQ>;
426054def41SToby Isaac }
427054def41SToby Isaac
428054def41SToby Isaac /* Have this templated class to specialize for char integers */
429054def41SToby Isaac template <device::cupm::DeviceType T>
430054def41SToby Isaac template <typename Type, PetscInt BS, PetscInt EQ, PetscInt size /*sizeof(Type)*/>
431054def41SToby Isaac struct SfInterface<T>::PackInit_IntegerType_Atomic {
InitPetsc::sf::cupm::impl::SfInterface::PackInit_IntegerType_Atomic432054def41SToby Isaac static inline void Init(PetscSFLink link) noexcept
433054def41SToby Isaac {
434054def41SToby Isaac link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
435054def41SToby Isaac link->da_UnpackAndAdd = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
436054def41SToby Isaac link->da_UnpackAndMult = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
437054def41SToby Isaac link->da_UnpackAndMin = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
438054def41SToby Isaac link->da_UnpackAndMax = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
439054def41SToby Isaac link->da_UnpackAndLAND = UnpackAndOp<Type, AtomicLAND<Type>, BS, EQ>;
440054def41SToby Isaac link->da_UnpackAndLOR = UnpackAndOp<Type, AtomicLOR<Type>, BS, EQ>;
441054def41SToby Isaac link->da_UnpackAndLXOR = UnpackAndOp<Type, AtomicLXOR<Type>, BS, EQ>;
442054def41SToby Isaac link->da_UnpackAndBAND = UnpackAndOp<Type, AtomicBAND<Type>, BS, EQ>;
443054def41SToby Isaac link->da_UnpackAndBOR = UnpackAndOp<Type, AtomicBOR<Type>, BS, EQ>;
444054def41SToby Isaac link->da_UnpackAndBXOR = UnpackAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
445054def41SToby Isaac link->da_FetchAndAdd = FetchAndOp<Type, AtomicAdd<Type>, BS, EQ>;
446054def41SToby Isaac
447054def41SToby Isaac link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
448054def41SToby Isaac link->da_ScatterAndAdd = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
449054def41SToby Isaac link->da_ScatterAndMult = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
450054def41SToby Isaac link->da_ScatterAndMin = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
451054def41SToby Isaac link->da_ScatterAndMax = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
452054def41SToby Isaac link->da_ScatterAndLAND = ScatterAndOp<Type, AtomicLAND<Type>, BS, EQ>;
453054def41SToby Isaac link->da_ScatterAndLOR = ScatterAndOp<Type, AtomicLOR<Type>, BS, EQ>;
454054def41SToby Isaac link->da_ScatterAndLXOR = ScatterAndOp<Type, AtomicLXOR<Type>, BS, EQ>;
455054def41SToby Isaac link->da_ScatterAndBAND = ScatterAndOp<Type, AtomicBAND<Type>, BS, EQ>;
456054def41SToby Isaac link->da_ScatterAndBOR = ScatterAndOp<Type, AtomicBOR<Type>, BS, EQ>;
457054def41SToby Isaac link->da_ScatterAndBXOR = ScatterAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
458054def41SToby Isaac link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicAdd<Type>, BS, EQ>;
459054def41SToby Isaac }
460054def41SToby Isaac };
461054def41SToby Isaac
462054def41SToby Isaac /* CUDA does not support atomics on chars. It is TBD in PETSc. */
463054def41SToby Isaac template <device::cupm::DeviceType T>
464054def41SToby Isaac template <typename Type, PetscInt BS, PetscInt EQ>
465054def41SToby Isaac struct SfInterface<T>::PackInit_IntegerType_Atomic<Type, BS, EQ, 1> {
InitPetsc::sf::cupm::impl::SfInterface::PackInit_IntegerType_Atomic466fbccb6d4SPierre Jolivet static inline void Init(PetscSFLink) { /* Nothing to leave function pointers NULL */ }
467054def41SToby Isaac };
468054def41SToby Isaac
469054def41SToby Isaac template <device::cupm::DeviceType T>
470054def41SToby Isaac template <typename Type, PetscInt BS, PetscInt EQ>
PackInit_IntegerType(PetscSFLink link)471054def41SToby Isaac inline void SfInterface<T>::PackInit_IntegerType(PetscSFLink link) noexcept
472054def41SToby Isaac {
473054def41SToby Isaac link->d_Pack = Pack<Type, BS, EQ>;
474054def41SToby Isaac link->d_UnpackAndInsert = UnpackAndOp<Type, kernels::Insert<Type>, BS, EQ>;
475054def41SToby Isaac link->d_UnpackAndAdd = UnpackAndOp<Type, kernels::Add<Type>, BS, EQ>;
476054def41SToby Isaac link->d_UnpackAndMult = UnpackAndOp<Type, kernels::Mult<Type>, BS, EQ>;
477054def41SToby Isaac link->d_UnpackAndMin = UnpackAndOp<Type, kernels::Min<Type>, BS, EQ>;
478054def41SToby Isaac link->d_UnpackAndMax = UnpackAndOp<Type, kernels::Max<Type>, BS, EQ>;
479054def41SToby Isaac link->d_UnpackAndLAND = UnpackAndOp<Type, kernels::LAND<Type>, BS, EQ>;
480054def41SToby Isaac link->d_UnpackAndLOR = UnpackAndOp<Type, kernels::LOR<Type>, BS, EQ>;
481054def41SToby Isaac link->d_UnpackAndLXOR = UnpackAndOp<Type, kernels::LXOR<Type>, BS, EQ>;
482054def41SToby Isaac link->d_UnpackAndBAND = UnpackAndOp<Type, kernels::BAND<Type>, BS, EQ>;
483054def41SToby Isaac link->d_UnpackAndBOR = UnpackAndOp<Type, kernels::BOR<Type>, BS, EQ>;
484054def41SToby Isaac link->d_UnpackAndBXOR = UnpackAndOp<Type, kernels::BXOR<Type>, BS, EQ>;
485054def41SToby Isaac link->d_FetchAndAdd = FetchAndOp<Type, kernels::Add<Type>, BS, EQ>;
486054def41SToby Isaac
487054def41SToby Isaac link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
488054def41SToby Isaac link->d_ScatterAndAdd = ScatterAndOp<Type, kernels::Add<Type>, BS, EQ>;
489054def41SToby Isaac link->d_ScatterAndMult = ScatterAndOp<Type, kernels::Mult<Type>, BS, EQ>;
490054def41SToby Isaac link->d_ScatterAndMin = ScatterAndOp<Type, kernels::Min<Type>, BS, EQ>;
491054def41SToby Isaac link->d_ScatterAndMax = ScatterAndOp<Type, kernels::Max<Type>, BS, EQ>;
492054def41SToby Isaac link->d_ScatterAndLAND = ScatterAndOp<Type, kernels::LAND<Type>, BS, EQ>;
493054def41SToby Isaac link->d_ScatterAndLOR = ScatterAndOp<Type, kernels::LOR<Type>, BS, EQ>;
494054def41SToby Isaac link->d_ScatterAndLXOR = ScatterAndOp<Type, kernels::LXOR<Type>, BS, EQ>;
495054def41SToby Isaac link->d_ScatterAndBAND = ScatterAndOp<Type, kernels::BAND<Type>, BS, EQ>;
496054def41SToby Isaac link->d_ScatterAndBOR = ScatterAndOp<Type, kernels::BOR<Type>, BS, EQ>;
497054def41SToby Isaac link->d_ScatterAndBXOR = ScatterAndOp<Type, kernels::BXOR<Type>, BS, EQ>;
498054def41SToby Isaac link->d_FetchAndAddLocal = FetchAndOpLocal<Type, kernels::Add<Type>, BS, EQ>;
499054def41SToby Isaac PackInit_IntegerType_Atomic<Type, BS, EQ, sizeof(Type)>::Init(link);
500054def41SToby Isaac }
501054def41SToby Isaac
502054def41SToby Isaac #if defined(PETSC_HAVE_COMPLEX)
503054def41SToby Isaac template <device::cupm::DeviceType T>
504054def41SToby Isaac template <typename Type, PetscInt BS, PetscInt EQ>
PackInit_ComplexType(PetscSFLink link)505054def41SToby Isaac inline void SfInterface<T>::PackInit_ComplexType(PetscSFLink link) noexcept
506054def41SToby Isaac {
507054def41SToby Isaac link->d_Pack = Pack<Type, BS, EQ>;
508054def41SToby Isaac link->d_UnpackAndInsert = UnpackAndOp<Type, kernels::Insert<Type>, BS, EQ>;
509054def41SToby Isaac link->d_UnpackAndAdd = UnpackAndOp<Type, kernels::Add<Type>, BS, EQ>;
510054def41SToby Isaac link->d_UnpackAndMult = UnpackAndOp<Type, kernels::Mult<Type>, BS, EQ>;
511054def41SToby Isaac link->d_FetchAndAdd = FetchAndOp<Type, kernels::Add<Type>, BS, EQ>;
512054def41SToby Isaac
513054def41SToby Isaac link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
514054def41SToby Isaac link->d_ScatterAndAdd = ScatterAndOp<Type, kernels::Add<Type>, BS, EQ>;
515054def41SToby Isaac link->d_ScatterAndMult = ScatterAndOp<Type, kernels::Mult<Type>, BS, EQ>;
516054def41SToby Isaac link->d_FetchAndAddLocal = FetchAndOpLocal<Type, kernels::Add<Type>, BS, EQ>;
517054def41SToby Isaac
518054def41SToby Isaac link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
519054def41SToby Isaac link->da_UnpackAndAdd = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
520054def41SToby Isaac link->da_UnpackAndMult = NULL; /* Not implemented yet */
521054def41SToby Isaac link->da_FetchAndAdd = NULL; /* Return value of atomicAdd on complex is not atomic */
522054def41SToby Isaac
523054def41SToby Isaac link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
524054def41SToby Isaac link->da_ScatterAndAdd = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
525054def41SToby Isaac }
526054def41SToby Isaac #endif
527054def41SToby Isaac
528054def41SToby Isaac typedef signed char SignedChar;
529054def41SToby Isaac typedef unsigned char UnsignedChar;
530054def41SToby Isaac typedef struct {
531054def41SToby Isaac int a;
532054def41SToby Isaac int b;
533054def41SToby Isaac } PairInt;
534054def41SToby Isaac typedef struct {
535054def41SToby Isaac PetscInt a;
536054def41SToby Isaac PetscInt b;
537054def41SToby Isaac } PairPetscInt;
538054def41SToby Isaac
539054def41SToby Isaac template <device::cupm::DeviceType T>
540054def41SToby Isaac template <typename Type>
PackInit_PairType(PetscSFLink link)541054def41SToby Isaac inline void SfInterface<T>::PackInit_PairType(PetscSFLink link) noexcept
542054def41SToby Isaac {
543054def41SToby Isaac link->d_Pack = Pack<Type, 1, 1>;
544054def41SToby Isaac link->d_UnpackAndInsert = UnpackAndOp<Type, kernels::Insert<Type>, 1, 1>;
545054def41SToby Isaac link->d_UnpackAndMaxloc = UnpackAndOp<Type, kernels::Maxloc<Type>, 1, 1>;
546054def41SToby Isaac link->d_UnpackAndMinloc = UnpackAndOp<Type, kernels::Minloc<Type>, 1, 1>;
547054def41SToby Isaac
548054def41SToby Isaac link->d_ScatterAndInsert = ScatterAndOp<Type, kernels::Insert<Type>, 1, 1>;
549054def41SToby Isaac link->d_ScatterAndMaxloc = ScatterAndOp<Type, kernels::Maxloc<Type>, 1, 1>;
550054def41SToby Isaac link->d_ScatterAndMinloc = ScatterAndOp<Type, kernels::Minloc<Type>, 1, 1>;
551054def41SToby Isaac /* Atomics for pair types are not implemented yet */
552054def41SToby Isaac }
553054def41SToby Isaac
554054def41SToby Isaac template <device::cupm::DeviceType T>
555054def41SToby Isaac template <typename Type, PetscInt BS, PetscInt EQ>
PackInit_DumbType(PetscSFLink link)556054def41SToby Isaac inline void SfInterface<T>::PackInit_DumbType(PetscSFLink link) noexcept
557054def41SToby Isaac {
558054def41SToby Isaac link->d_Pack = Pack<Type, BS, EQ>;
559054def41SToby Isaac link->d_UnpackAndInsert = UnpackAndOp<Type, kernels::Insert<Type>, BS, EQ>;
560054def41SToby Isaac link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
561054def41SToby Isaac /* Atomics for dumb types are not implemented yet */
562054def41SToby Isaac }
563054def41SToby Isaac
564054def41SToby Isaac /* Some device-specific utilities */
565054def41SToby Isaac template <device::cupm::DeviceType T>
LinkSyncDevice(PetscSFLink)5663d0abfa8SJose E. Roman inline PetscErrorCode SfInterface<T>::LinkSyncDevice(PetscSFLink) noexcept
567054def41SToby Isaac {
568054def41SToby Isaac PetscFunctionBegin;
569054def41SToby Isaac PetscCallCUPM(cupmDeviceSynchronize());
570054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
571054def41SToby Isaac }
572054def41SToby Isaac
573054def41SToby Isaac template <device::cupm::DeviceType T>
LinkSyncStream(PetscSFLink link)574054def41SToby Isaac inline PetscErrorCode SfInterface<T>::LinkSyncStream(PetscSFLink link) noexcept
575054def41SToby Isaac {
576054def41SToby Isaac PetscFunctionBegin;
577054def41SToby Isaac PetscCallCUPM(cupmStreamSynchronize(link->stream));
578054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
579054def41SToby Isaac }
580054def41SToby Isaac
581054def41SToby Isaac template <device::cupm::DeviceType T>
LinkMemcpy(PetscSFLink link,PetscMemType dstmtype,void * dst,PetscMemType srcmtype,const void * src,size_t n)582054def41SToby Isaac inline PetscErrorCode SfInterface<T>::LinkMemcpy(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n) noexcept
583054def41SToby Isaac {
584054def41SToby Isaac PetscFunctionBegin;
585054def41SToby Isaac cupmMemcpyKind_t kinds[2][2] = {
586054def41SToby Isaac {cupmMemcpyHostToHost, cupmMemcpyHostToDevice },
587054def41SToby Isaac {cupmMemcpyDeviceToHost, cupmMemcpyDeviceToDevice}
588054def41SToby Isaac };
589054def41SToby Isaac
590054def41SToby Isaac if (n) {
591054def41SToby Isaac if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) { /* Separate HostToHost so that pure-cpu code won't call cupm runtime */
592054def41SToby Isaac PetscCall(PetscMemcpy(dst, src, n));
593054def41SToby Isaac } else {
594054def41SToby Isaac int stype = PetscMemTypeDevice(srcmtype) ? 1 : 0;
595054def41SToby Isaac int dtype = PetscMemTypeDevice(dstmtype) ? 1 : 0;
596054def41SToby Isaac PetscCallCUPM(cupmMemcpyAsync(dst, src, n, kinds[stype][dtype], link->stream));
597054def41SToby Isaac }
598054def41SToby Isaac }
599054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
600054def41SToby Isaac }
601054def41SToby Isaac
602054def41SToby Isaac template <device::cupm::DeviceType T>
Malloc(PetscMemType mtype,size_t size,void ** ptr)603054def41SToby Isaac inline PetscErrorCode SfInterface<T>::Malloc(PetscMemType mtype, size_t size, void **ptr) noexcept
604054def41SToby Isaac {
605054def41SToby Isaac PetscFunctionBegin;
606054def41SToby Isaac if (PetscMemTypeHost(mtype)) PetscCall(PetscMalloc(size, ptr));
607054def41SToby Isaac else if (PetscMemTypeDevice(mtype)) {
608054def41SToby Isaac PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
609054def41SToby Isaac PetscCallCUPM(cupmMalloc(ptr, size));
610054def41SToby Isaac } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
611054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
612054def41SToby Isaac }
613054def41SToby Isaac
614054def41SToby Isaac template <device::cupm::DeviceType T>
Free(PetscMemType mtype,void * ptr)615054def41SToby Isaac inline PetscErrorCode SfInterface<T>::Free(PetscMemType mtype, void *ptr) noexcept
616054def41SToby Isaac {
617054def41SToby Isaac PetscFunctionBegin;
618054def41SToby Isaac if (PetscMemTypeHost(mtype)) PetscCall(PetscFree(ptr));
619054def41SToby Isaac else if (PetscMemTypeDevice(mtype)) PetscCallCUPM(cupmFree(ptr));
620054def41SToby Isaac else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
621054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
622054def41SToby Isaac }
623054def41SToby Isaac
624054def41SToby Isaac /* Destructor when the link uses MPI for communication on CUPM device */
625054def41SToby Isaac template <device::cupm::DeviceType T>
LinkDestroy_MPI(PetscSF,PetscSFLink link)626054def41SToby Isaac inline PetscErrorCode SfInterface<T>::LinkDestroy_MPI(PetscSF, PetscSFLink link) noexcept
627054def41SToby Isaac {
628054def41SToby Isaac PetscFunctionBegin;
629054def41SToby Isaac for (int i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
630054def41SToby Isaac PetscCallCUPM(cupmFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]));
631054def41SToby Isaac PetscCallCUPM(cupmFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]));
632054def41SToby Isaac }
633054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
634054def41SToby Isaac }
635054def41SToby Isaac
636054def41SToby Isaac /*====================================================================================*/
637054def41SToby Isaac /* Main driver to init MPI datatype on device */
638054def41SToby Isaac /*====================================================================================*/
639054def41SToby Isaac
640054def41SToby Isaac /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what needed on device */
641054def41SToby Isaac template <device::cupm::DeviceType T>
LinkSetUp(PetscSF sf,PetscSFLink link,MPI_Datatype unit)642054def41SToby Isaac inline PetscErrorCode SfInterface<T>::LinkSetUp(PetscSF sf, PetscSFLink link, MPI_Datatype unit) noexcept
643054def41SToby Isaac {
644054def41SToby Isaac PetscInt nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
645054def41SToby Isaac PetscBool is2Int, is2PetscInt;
646054def41SToby Isaac #if defined(PETSC_HAVE_COMPLEX)
647054def41SToby Isaac PetscInt nPetscComplex = 0;
648054def41SToby Isaac #endif
649054def41SToby Isaac
650054def41SToby Isaac PetscFunctionBegin;
651054def41SToby Isaac if (link->deviceinited) PetscFunctionReturn(PETSC_SUCCESS);
652054def41SToby Isaac PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
653054def41SToby Isaac PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
654054def41SToby Isaac /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
655054def41SToby Isaac PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
656054def41SToby Isaac PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
657054def41SToby Isaac PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
658054def41SToby Isaac #if defined(PETSC_HAVE_COMPLEX)
659054def41SToby Isaac PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
660054def41SToby Isaac #endif
661054def41SToby Isaac PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
662054def41SToby Isaac PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
663054def41SToby Isaac
664054def41SToby Isaac if (is2Int) {
665054def41SToby Isaac PackInit_PairType<PairInt>(link);
666054def41SToby Isaac } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
667054def41SToby Isaac PackInit_PairType<PairPetscInt>(link);
668054def41SToby Isaac } else if (nPetscReal) {
669054def41SToby Isaac #if !defined(PETSC_HAVE_DEVICE)
670054def41SToby Isaac if (nPetscReal == 8) PackInit_RealType<PetscReal, 8, 1>(link);
671054def41SToby Isaac else if (nPetscReal % 8 == 0) PackInit_RealType<PetscReal, 8, 0>(link);
672054def41SToby Isaac else if (nPetscReal == 4) PackInit_RealType<PetscReal, 4, 1>(link);
673054def41SToby Isaac else if (nPetscReal % 4 == 0) PackInit_RealType<PetscReal, 4, 0>(link);
674054def41SToby Isaac else if (nPetscReal == 2) PackInit_RealType<PetscReal, 2, 1>(link);
675054def41SToby Isaac else if (nPetscReal % 2 == 0) PackInit_RealType<PetscReal, 2, 0>(link);
676054def41SToby Isaac else if (nPetscReal == 1) PackInit_RealType<PetscReal, 1, 1>(link);
677054def41SToby Isaac else if (nPetscReal % 1 == 0)
678054def41SToby Isaac #endif
679054def41SToby Isaac PackInit_RealType<PetscReal, 1, 0>(link);
680054def41SToby Isaac } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
681054def41SToby Isaac #if !defined(PETSC_HAVE_DEVICE)
682054def41SToby Isaac if (nPetscInt == 8) PackInit_IntegerType<llint, 8, 1>(link);
683054def41SToby Isaac else if (nPetscInt % 8 == 0) PackInit_IntegerType<llint, 8, 0>(link);
684054def41SToby Isaac else if (nPetscInt == 4) PackInit_IntegerType<llint, 4, 1>(link);
685054def41SToby Isaac else if (nPetscInt % 4 == 0) PackInit_IntegerType<llint, 4, 0>(link);
686054def41SToby Isaac else if (nPetscInt == 2) PackInit_IntegerType<llint, 2, 1>(link);
687054def41SToby Isaac else if (nPetscInt % 2 == 0) PackInit_IntegerType<llint, 2, 0>(link);
688054def41SToby Isaac else if (nPetscInt == 1) PackInit_IntegerType<llint, 1, 1>(link);
689054def41SToby Isaac else if (nPetscInt % 1 == 0)
690054def41SToby Isaac #endif
691054def41SToby Isaac PackInit_IntegerType<llint, 1, 0>(link);
692054def41SToby Isaac } else if (nInt) {
693054def41SToby Isaac #if !defined(PETSC_HAVE_DEVICE)
694054def41SToby Isaac if (nInt == 8) PackInit_IntegerType<int, 8, 1>(link);
695054def41SToby Isaac else if (nInt % 8 == 0) PackInit_IntegerType<int, 8, 0>(link);
696054def41SToby Isaac else if (nInt == 4) PackInit_IntegerType<int, 4, 1>(link);
697054def41SToby Isaac else if (nInt % 4 == 0) PackInit_IntegerType<int, 4, 0>(link);
698054def41SToby Isaac else if (nInt == 2) PackInit_IntegerType<int, 2, 1>(link);
699054def41SToby Isaac else if (nInt % 2 == 0) PackInit_IntegerType<int, 2, 0>(link);
700054def41SToby Isaac else if (nInt == 1) PackInit_IntegerType<int, 1, 1>(link);
701054def41SToby Isaac else if (nInt % 1 == 0)
702054def41SToby Isaac #endif
703054def41SToby Isaac PackInit_IntegerType<int, 1, 0>(link);
704054def41SToby Isaac } else if (nSignedChar) {
705054def41SToby Isaac #if !defined(PETSC_HAVE_DEVICE)
706054def41SToby Isaac if (nSignedChar == 8) PackInit_IntegerType<SignedChar, 8, 1>(link);
707054def41SToby Isaac else if (nSignedChar % 8 == 0) PackInit_IntegerType<SignedChar, 8, 0>(link);
708054def41SToby Isaac else if (nSignedChar == 4) PackInit_IntegerType<SignedChar, 4, 1>(link);
709054def41SToby Isaac else if (nSignedChar % 4 == 0) PackInit_IntegerType<SignedChar, 4, 0>(link);
710054def41SToby Isaac else if (nSignedChar == 2) PackInit_IntegerType<SignedChar, 2, 1>(link);
711054def41SToby Isaac else if (nSignedChar % 2 == 0) PackInit_IntegerType<SignedChar, 2, 0>(link);
712054def41SToby Isaac else if (nSignedChar == 1) PackInit_IntegerType<SignedChar, 1, 1>(link);
713054def41SToby Isaac else if (nSignedChar % 1 == 0)
714054def41SToby Isaac #endif
715054def41SToby Isaac PackInit_IntegerType<SignedChar, 1, 0>(link);
716054def41SToby Isaac } else if (nUnsignedChar) {
717054def41SToby Isaac #if !defined(PETSC_HAVE_DEVICE)
718054def41SToby Isaac if (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar, 8, 1>(link);
719054def41SToby Isaac else if (nUnsignedChar % 8 == 0) PackInit_IntegerType<UnsignedChar, 8, 0>(link);
720054def41SToby Isaac else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar, 4, 1>(link);
721054def41SToby Isaac else if (nUnsignedChar % 4 == 0) PackInit_IntegerType<UnsignedChar, 4, 0>(link);
722054def41SToby Isaac else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar, 2, 1>(link);
723054def41SToby Isaac else if (nUnsignedChar % 2 == 0) PackInit_IntegerType<UnsignedChar, 2, 0>(link);
724054def41SToby Isaac else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar, 1, 1>(link);
725054def41SToby Isaac else if (nUnsignedChar % 1 == 0)
726054def41SToby Isaac #endif
727054def41SToby Isaac PackInit_IntegerType<UnsignedChar, 1, 0>(link);
728054def41SToby Isaac #if defined(PETSC_HAVE_COMPLEX)
729054def41SToby Isaac } else if (nPetscComplex) {
730054def41SToby Isaac #if !defined(PETSC_HAVE_DEVICE)
731054def41SToby Isaac if (nPetscComplex == 8) PackInit_ComplexType<PetscComplex, 8, 1>(link);
732054def41SToby Isaac else if (nPetscComplex % 8 == 0) PackInit_ComplexType<PetscComplex, 8, 0>(link);
733054def41SToby Isaac else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex, 4, 1>(link);
734054def41SToby Isaac else if (nPetscComplex % 4 == 0) PackInit_ComplexType<PetscComplex, 4, 0>(link);
735054def41SToby Isaac else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex, 2, 1>(link);
736054def41SToby Isaac else if (nPetscComplex % 2 == 0) PackInit_ComplexType<PetscComplex, 2, 0>(link);
737054def41SToby Isaac else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex, 1, 1>(link);
738054def41SToby Isaac else if (nPetscComplex % 1 == 0)
739054def41SToby Isaac #endif
740054def41SToby Isaac PackInit_ComplexType<PetscComplex, 1, 0>(link);
741054def41SToby Isaac #endif
742054def41SToby Isaac } else {
7436497c311SBarry Smith MPI_Aint lb, nbyte;
744e1187f0dSToby Isaac
7456497c311SBarry Smith PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte));
7466497c311SBarry Smith PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
747054def41SToby Isaac if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
748054def41SToby Isaac #if !defined(PETSC_HAVE_DEVICE)
749054def41SToby Isaac if (nbyte == 4) PackInit_DumbType<char, 4, 1>(link);
750054def41SToby Isaac else if (nbyte % 4 == 0) PackInit_DumbType<char, 4, 0>(link);
751054def41SToby Isaac else if (nbyte == 2) PackInit_DumbType<char, 2, 1>(link);
752054def41SToby Isaac else if (nbyte % 2 == 0) PackInit_DumbType<char, 2, 0>(link);
753054def41SToby Isaac else if (nbyte == 1) PackInit_DumbType<char, 1, 1>(link);
754054def41SToby Isaac else if (nbyte % 1 == 0)
755054def41SToby Isaac #endif
756054def41SToby Isaac PackInit_DumbType<char, 1, 0>(link);
757054def41SToby Isaac } else {
758*d279a5e3SJunchao Zhang PetscCall(PetscIntCast(nbyte / sizeof(int), &nInt));
759054def41SToby Isaac #if !defined(PETSC_HAVE_DEVICE)
760054def41SToby Isaac if (nInt == 8) PackInit_DumbType<int, 8, 1>(link);
761054def41SToby Isaac else if (nInt % 8 == 0) PackInit_DumbType<int, 8, 0>(link);
762054def41SToby Isaac else if (nInt == 4) PackInit_DumbType<int, 4, 1>(link);
763054def41SToby Isaac else if (nInt % 4 == 0) PackInit_DumbType<int, 4, 0>(link);
764054def41SToby Isaac else if (nInt == 2) PackInit_DumbType<int, 2, 1>(link);
765054def41SToby Isaac else if (nInt % 2 == 0) PackInit_DumbType<int, 2, 0>(link);
766054def41SToby Isaac else if (nInt == 1) PackInit_DumbType<int, 1, 1>(link);
767054def41SToby Isaac else if (nInt % 1 == 0)
768054def41SToby Isaac #endif
769054def41SToby Isaac PackInit_DumbType<int, 1, 0>(link);
770054def41SToby Isaac }
771054def41SToby Isaac }
772054def41SToby Isaac
773054def41SToby Isaac if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
774054def41SToby Isaac int device;
775054def41SToby Isaac cupmDeviceProp_t props;
7766497c311SBarry Smith
777054def41SToby Isaac PetscCallCUPM(cupmGetDevice(&device));
778054def41SToby Isaac PetscCallCUPM(cupmGetDeviceProperties(&props, device));
779054def41SToby Isaac sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor * props.multiProcessorCount;
780054def41SToby Isaac }
781054def41SToby Isaac link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;
782054def41SToby Isaac
783054def41SToby Isaac {
784054def41SToby Isaac cupmStream_t *stream;
785054def41SToby Isaac PetscDeviceContext dctx;
786054def41SToby Isaac
787054def41SToby Isaac PetscCall(PetscDeviceContextGetCurrentContextAssertType_Internal(&dctx, PETSC_DEVICE_CUPM()));
788054def41SToby Isaac PetscCall(PetscDeviceContextGetStreamHandle(dctx, (void **)&stream));
789054def41SToby Isaac link->stream = *stream;
790054def41SToby Isaac }
791054def41SToby Isaac link->Destroy = LinkDestroy_MPI;
792054def41SToby Isaac link->SyncDevice = LinkSyncDevice;
793054def41SToby Isaac link->SyncStream = LinkSyncStream;
794054def41SToby Isaac link->Memcpy = LinkMemcpy;
795054def41SToby Isaac link->deviceinited = PETSC_TRUE;
796054def41SToby Isaac PetscFunctionReturn(PETSC_SUCCESS);
797054def41SToby Isaac }
798054def41SToby Isaac
799054def41SToby Isaac } // namespace impl
800054def41SToby Isaac
801054def41SToby Isaac } // namespace cupm
802054def41SToby Isaac
803054def41SToby Isaac } // namespace sf
804054def41SToby Isaac
805054def41SToby Isaac } // namespace Petsc
806