xref: /petsc/src/vec/is/sf/impls/basic/kokkos/sfkok.kokkos.cxx (revision 97fff7b26c35bf162ccb3cca40740faa3421ab44)
1914b7a73SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
2914b7a73SJunchao Zhang 
3524fe776SJunchao Zhang #include <petsc_kokkos.hpp>
445402d8aSJunchao Zhang #include <petsc/private/kokkosimpl.hpp>
5914b7a73SJunchao Zhang 
6914b7a73SJunchao Zhang using DeviceExecutionSpace = Kokkos::DefaultExecutionSpace;
7914b7a73SJunchao Zhang 
845402d8aSJunchao Zhang typedef Kokkos::View<char *, DefaultMemorySpace>    deviceBuffer_t;
945402d8aSJunchao Zhang typedef Kokkos::View<char *, HostMirrorMemorySpace> HostBuffer_t;
10914b7a73SJunchao Zhang 
1145402d8aSJunchao Zhang typedef Kokkos::View<const char *, DefaultMemorySpace>    deviceConstBuffer_t;
1245402d8aSJunchao Zhang typedef Kokkos::View<const char *, HostMirrorMemorySpace> HostConstBuffer_t;
13914b7a73SJunchao Zhang 
/*====================================================================================*/
/*                             Regular operations                                     */
/*====================================================================================*/
179371c9d4SSatish Balay template <typename Type>
189371c9d4SSatish Balay struct Insert {
operator ()Insert19d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
20d71ae5a4SJacob Faibussowitsch   {
219371c9d4SSatish Balay     Type old = x;
229371c9d4SSatish Balay     x        = y;
239371c9d4SSatish Balay     return old;
249371c9d4SSatish Balay   }
259371c9d4SSatish Balay };
269371c9d4SSatish Balay template <typename Type>
279371c9d4SSatish Balay struct Add {
operator ()Add28d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
29d71ae5a4SJacob Faibussowitsch   {
309371c9d4SSatish Balay     Type old = x;
319371c9d4SSatish Balay     x += y;
329371c9d4SSatish Balay     return old;
339371c9d4SSatish Balay   }
349371c9d4SSatish Balay };
359371c9d4SSatish Balay template <typename Type>
369371c9d4SSatish Balay struct Mult {
operator ()Mult37d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
38d71ae5a4SJacob Faibussowitsch   {
399371c9d4SSatish Balay     Type old = x;
409371c9d4SSatish Balay     x *= y;
419371c9d4SSatish Balay     return old;
429371c9d4SSatish Balay   }
439371c9d4SSatish Balay };
449371c9d4SSatish Balay template <typename Type>
459371c9d4SSatish Balay struct Min {
operator ()Min46d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
47d71ae5a4SJacob Faibussowitsch   {
489371c9d4SSatish Balay     Type old = x;
499371c9d4SSatish Balay     x        = PetscMin(x, y);
509371c9d4SSatish Balay     return old;
519371c9d4SSatish Balay   }
529371c9d4SSatish Balay };
539371c9d4SSatish Balay template <typename Type>
549371c9d4SSatish Balay struct Max {
operator ()Max55d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
56d71ae5a4SJacob Faibussowitsch   {
579371c9d4SSatish Balay     Type old = x;
589371c9d4SSatish Balay     x        = PetscMax(x, y);
599371c9d4SSatish Balay     return old;
609371c9d4SSatish Balay   }
619371c9d4SSatish Balay };
629371c9d4SSatish Balay template <typename Type>
639371c9d4SSatish Balay struct LAND {
operator ()LAND64d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
65d71ae5a4SJacob Faibussowitsch   {
669371c9d4SSatish Balay     Type old = x;
679371c9d4SSatish Balay     x        = x && y;
689371c9d4SSatish Balay     return old;
699371c9d4SSatish Balay   }
709371c9d4SSatish Balay };
719371c9d4SSatish Balay template <typename Type>
729371c9d4SSatish Balay struct LOR {
operator ()LOR73d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
74d71ae5a4SJacob Faibussowitsch   {
759371c9d4SSatish Balay     Type old = x;
769371c9d4SSatish Balay     x        = x || y;
779371c9d4SSatish Balay     return old;
789371c9d4SSatish Balay   }
799371c9d4SSatish Balay };
809371c9d4SSatish Balay template <typename Type>
819371c9d4SSatish Balay struct LXOR {
operator ()LXOR82d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
83d71ae5a4SJacob Faibussowitsch   {
849371c9d4SSatish Balay     Type old = x;
859371c9d4SSatish Balay     x        = !x != !y;
869371c9d4SSatish Balay     return old;
879371c9d4SSatish Balay   }
889371c9d4SSatish Balay };
899371c9d4SSatish Balay template <typename Type>
909371c9d4SSatish Balay struct BAND {
operator ()BAND91d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
92d71ae5a4SJacob Faibussowitsch   {
939371c9d4SSatish Balay     Type old = x;
949371c9d4SSatish Balay     x        = x & y;
959371c9d4SSatish Balay     return old;
969371c9d4SSatish Balay   }
979371c9d4SSatish Balay };
989371c9d4SSatish Balay template <typename Type>
999371c9d4SSatish Balay struct BOR {
operator ()BOR100d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
101d71ae5a4SJacob Faibussowitsch   {
1029371c9d4SSatish Balay     Type old = x;
1039371c9d4SSatish Balay     x        = x | y;
1049371c9d4SSatish Balay     return old;
1059371c9d4SSatish Balay   }
1069371c9d4SSatish Balay };
1079371c9d4SSatish Balay template <typename Type>
1089371c9d4SSatish Balay struct BXOR {
operator ()BXOR109d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
110d71ae5a4SJacob Faibussowitsch   {
1119371c9d4SSatish Balay     Type old = x;
1129371c9d4SSatish Balay     x        = x ^ y;
1139371c9d4SSatish Balay     return old;
1149371c9d4SSatish Balay   }
1159371c9d4SSatish Balay };
1169371c9d4SSatish Balay template <typename PairType>
1179371c9d4SSatish Balay struct Minloc {
operator ()Minloc118d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION PairType operator()(PairType &x, PairType y) const
119d71ae5a4SJacob Faibussowitsch   {
120914b7a73SJunchao Zhang     PairType old = x;
121914b7a73SJunchao Zhang     if (y.first < x.first) x = y;
122914b7a73SJunchao Zhang     else if (y.first == x.first) x.second = PetscMin(x.second, y.second);
123914b7a73SJunchao Zhang     return old;
124914b7a73SJunchao Zhang   }
125914b7a73SJunchao Zhang };
1269371c9d4SSatish Balay template <typename PairType>
1279371c9d4SSatish Balay struct Maxloc {
operator ()Maxloc128d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION PairType operator()(PairType &x, PairType y) const
129d71ae5a4SJacob Faibussowitsch   {
130914b7a73SJunchao Zhang     PairType old = x;
131914b7a73SJunchao Zhang     if (y.first > x.first) x = y;
132914b7a73SJunchao Zhang     else if (y.first == x.first) x.second = PetscMin(x.second, y.second); /* See MPI MAXLOC */
133914b7a73SJunchao Zhang     return old;
134914b7a73SJunchao Zhang   }
135914b7a73SJunchao Zhang };
136914b7a73SJunchao Zhang 
/*====================================================================================*/
/*                             Atomic operations                                      */
/*====================================================================================*/
1409371c9d4SSatish Balay template <typename Type>
1419371c9d4SSatish Balay struct AtomicInsert {
operator ()AtomicInsert142ffc29c3aSJunchao Zhang   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_store(&x, y); }
1439371c9d4SSatish Balay };
1449371c9d4SSatish Balay template <typename Type>
1459371c9d4SSatish Balay struct AtomicAdd {
operator ()AtomicAdd1469371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_add(&x, y); }
1479371c9d4SSatish Balay };
1489371c9d4SSatish Balay template <typename Type>
1499371c9d4SSatish Balay struct AtomicBAND {
operator ()AtomicBAND1509371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_and(&x, y); }
1519371c9d4SSatish Balay };
1529371c9d4SSatish Balay template <typename Type>
1539371c9d4SSatish Balay struct AtomicBOR {
operator ()AtomicBOR1549371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_or(&x, y); }
1559371c9d4SSatish Balay };
1569371c9d4SSatish Balay template <typename Type>
1579371c9d4SSatish Balay struct AtomicBXOR {
operator ()AtomicBXOR1589371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_fetch_xor(&x, y); }
1599371c9d4SSatish Balay };
1609371c9d4SSatish Balay template <typename Type>
1619371c9d4SSatish Balay struct AtomicLAND {
operator ()AtomicLAND162d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const
163d71ae5a4SJacob Faibussowitsch   {
1649371c9d4SSatish Balay     const Type zero = 0, one = ~0;
1659371c9d4SSatish Balay     Kokkos::atomic_and(&x, y ? one : zero);
1669371c9d4SSatish Balay   }
1679371c9d4SSatish Balay };
1689371c9d4SSatish Balay template <typename Type>
1699371c9d4SSatish Balay struct AtomicLOR {
operator ()AtomicLOR170d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const
171d71ae5a4SJacob Faibussowitsch   {
1729371c9d4SSatish Balay     const Type zero = 0, one = 1;
1739371c9d4SSatish Balay     Kokkos::atomic_or(&x, y ? one : zero);
1749371c9d4SSatish Balay   }
1759371c9d4SSatish Balay };
1769371c9d4SSatish Balay template <typename Type>
1779371c9d4SSatish Balay struct AtomicMult {
operator ()AtomicMult1789371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_fetch_mul(&x, y); }
1799371c9d4SSatish Balay };
1809371c9d4SSatish Balay template <typename Type>
1819371c9d4SSatish Balay struct AtomicMin {
operator ()AtomicMin1829371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_fetch_min(&x, y); }
1839371c9d4SSatish Balay };
1849371c9d4SSatish Balay template <typename Type>
1859371c9d4SSatish Balay struct AtomicMax {
operator ()AtomicMax1869371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_fetch_max(&x, y); }
1879371c9d4SSatish Balay };
/* TODO: struct AtomicLXOR */
1899371c9d4SSatish Balay template <typename Type>
1909371c9d4SSatish Balay struct AtomicFetchAdd {
operator ()AtomicFetchAdd1919371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const { return Kokkos::atomic_fetch_add(&x, y); }
1929371c9d4SSatish Balay };
193914b7a73SJunchao Zhang 
194914b7a73SJunchao Zhang /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
MapTidToIndex(const PetscInt * opt,PetscInt tid)195d71ae5a4SJacob Faibussowitsch static KOKKOS_INLINE_FUNCTION PetscInt MapTidToIndex(const PetscInt *opt, PetscInt tid)
196d71ae5a4SJacob Faibussowitsch {
197914b7a73SJunchao Zhang   PetscInt        i, j, k, m, n, r;
198914b7a73SJunchao Zhang   const PetscInt *offset, *start, *dx, *dy, *X, *Y;
199914b7a73SJunchao Zhang 
200914b7a73SJunchao Zhang   n      = opt[0];
201914b7a73SJunchao Zhang   offset = opt + 1;
202914b7a73SJunchao Zhang   start  = opt + n + 2;
203914b7a73SJunchao Zhang   dx     = opt + 2 * n + 2;
204914b7a73SJunchao Zhang   dy     = opt + 3 * n + 2;
205914b7a73SJunchao Zhang   X      = opt + 5 * n + 2;
206914b7a73SJunchao Zhang   Y      = opt + 6 * n + 2;
2079371c9d4SSatish Balay   for (r = 0; r < n; r++) {
2089371c9d4SSatish Balay     if (tid < offset[r + 1]) break;
2099371c9d4SSatish Balay   }
210914b7a73SJunchao Zhang   m = (tid - offset[r]);
211914b7a73SJunchao Zhang   k = m / (dx[r] * dy[r]);
212914b7a73SJunchao Zhang   j = (m - k * dx[r] * dy[r]) / dx[r];
213914b7a73SJunchao Zhang   i = m - k * dx[r] * dy[r] - j * dx[r];
214914b7a73SJunchao Zhang 
2154ad8454bSPierre Jolivet   return start[r] + k * X[r] * Y[r] + j * X[r] + i;
216914b7a73SJunchao Zhang }
217914b7a73SJunchao Zhang 
/*====================================================================================*/
/*  Wrappers for Pack/Unpack/Scatter kernels. Function pointers are stored in 'link'  */
/*====================================================================================*/
221914b7a73SJunchao Zhang 
/* Suppose the user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals, then
   <Type> is PetscReal, which is the primitive type we operate on.
   <bs>   is 16, which says <unit> contains 16 primitive types.
   <BS>   is 8, which is the maximal SIMD width we will try to vectorize operations on <unit>.
   <EQ>   is 0, which is (bs == BS ? 1 : 0)

  If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, making MBS below a compile-time constant.
  For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below will be totally unrolled.
*/
231914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ>
Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt * idx,const void * data_,void * buf_)232d71ae5a4SJacob Faibussowitsch static PetscErrorCode Pack(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *data_, void *buf_)
233d71ae5a4SJacob Faibussowitsch {
234914b7a73SJunchao Zhang   const PetscInt      *iopt = opt ? opt->array : NULL;
235914b7a73SJunchao Zhang   const PetscInt       M = EQ ? 1 : link->bs / BS, MBS = M * BS; /* If EQ, then MBS will be a compile-time const */
236914b7a73SJunchao Zhang   const Type          *data = static_cast<const Type *>(data_);
237914b7a73SJunchao Zhang   Type                *buf  = static_cast<Type *>(buf_);
238*4df4a32cSJunchao Zhang   DeviceExecutionSpace exec = PetscGetKokkosExecutionSpace();
239914b7a73SJunchao Zhang 
240914b7a73SJunchao Zhang   PetscFunctionBegin;
2419371c9d4SSatish Balay   Kokkos::parallel_for(
2429371c9d4SSatish Balay     Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) {
243914b7a73SJunchao Zhang       /* iopt != NULL ==> idx == NULL, i.e., the indices have patterns but not contiguous;
244914b7a73SJunchao Zhang        iopt == NULL && idx == NULL ==> the indices are contiguous;
245914b7a73SJunchao Zhang      */
246914b7a73SJunchao Zhang       PetscInt t = (iopt ? MapTidToIndex(iopt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
247914b7a73SJunchao Zhang       PetscInt s = tid * MBS;
248914b7a73SJunchao Zhang       for (int i = 0; i < MBS; i++) buf[s + i] = data[t + i];
249914b7a73SJunchao Zhang     });
2503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
251914b7a73SJunchao Zhang }
252914b7a73SJunchao Zhang 
253914b7a73SJunchao Zhang template <typename Type, class Op, PetscInt BS, PetscInt EQ>
UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt * idx,void * data_,const void * buf_)254d71ae5a4SJacob Faibussowitsch static PetscErrorCode UnpackAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data_, const void *buf_)
255d71ae5a4SJacob Faibussowitsch {
256914b7a73SJunchao Zhang   Op                   op;
257914b7a73SJunchao Zhang   const PetscInt      *iopt = opt ? opt->array : NULL;
258914b7a73SJunchao Zhang   const PetscInt       M = EQ ? 1 : link->bs / BS, MBS = M * BS;
259914b7a73SJunchao Zhang   Type                *data = static_cast<Type *>(data_);
260914b7a73SJunchao Zhang   const Type          *buf  = static_cast<const Type *>(buf_);
261*4df4a32cSJunchao Zhang   DeviceExecutionSpace exec = PetscGetKokkosExecutionSpace();
262914b7a73SJunchao Zhang 
263914b7a73SJunchao Zhang   PetscFunctionBegin;
2649371c9d4SSatish Balay   Kokkos::parallel_for(
2659371c9d4SSatish Balay     Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) {
266914b7a73SJunchao Zhang       PetscInt t = (iopt ? MapTidToIndex(iopt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
267914b7a73SJunchao Zhang       PetscInt s = tid * MBS;
268914b7a73SJunchao Zhang       for (int i = 0; i < MBS; i++) op(data[t + i], buf[s + i]);
269914b7a73SJunchao Zhang     });
2703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
271914b7a73SJunchao Zhang }
272914b7a73SJunchao Zhang 
273914b7a73SJunchao Zhang template <typename Type, class Op, PetscInt BS, PetscInt EQ>
FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt * idx,void * data,void * buf)274d71ae5a4SJacob Faibussowitsch static PetscErrorCode FetchAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, void *buf)
275d71ae5a4SJacob Faibussowitsch {
276914b7a73SJunchao Zhang   Op                   op;
277914b7a73SJunchao Zhang   const PetscInt      *ropt = opt ? opt->array : NULL;
278914b7a73SJunchao Zhang   const PetscInt       M = EQ ? 1 : link->bs / BS, MBS = M * BS;
279914b7a73SJunchao Zhang   Type                *rootdata = static_cast<Type *>(data), *leafbuf = static_cast<Type *>(buf);
280*4df4a32cSJunchao Zhang   DeviceExecutionSpace exec = PetscGetKokkosExecutionSpace();
281914b7a73SJunchao Zhang 
282914b7a73SJunchao Zhang   PetscFunctionBegin;
2839371c9d4SSatish Balay   Kokkos::parallel_for(
2849371c9d4SSatish Balay     Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) {
285914b7a73SJunchao Zhang       PetscInt r = (ropt ? MapTidToIndex(ropt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
286914b7a73SJunchao Zhang       PetscInt l = tid * MBS;
287914b7a73SJunchao Zhang       for (int i = 0; i < MBS; i++) leafbuf[l + i] = op(rootdata[r + i], leafbuf[l + i]);
288914b7a73SJunchao Zhang     });
2893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
290914b7a73SJunchao Zhang }
291914b7a73SJunchao Zhang 
292914b7a73SJunchao Zhang template <typename Type, class Op, PetscInt BS, PetscInt EQ>
ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt * srcIdx,const void * src_,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt * dstIdx,void * dst_)293d71ae5a4SJacob Faibussowitsch static PetscErrorCode ScatterAndOp(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src_, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst_)
294d71ae5a4SJacob Faibussowitsch {
295914b7a73SJunchao Zhang   PetscInt             srcx = 0, srcy = 0, srcX = 0, srcY = 0, dstx = 0, dsty = 0, dstX = 0, dstY = 0;
296914b7a73SJunchao Zhang   const PetscInt       M = (EQ) ? 1 : link->bs / BS, MBS = M * BS;
297914b7a73SJunchao Zhang   const Type          *src  = static_cast<const Type *>(src_);
298914b7a73SJunchao Zhang   Type                *dst  = static_cast<Type *>(dst_);
299*4df4a32cSJunchao Zhang   DeviceExecutionSpace exec = PetscGetKokkosExecutionSpace();
300914b7a73SJunchao Zhang 
301914b7a73SJunchao Zhang   PetscFunctionBegin;
302914b7a73SJunchao Zhang   /* The 3D shape of source subdomain may be different than that of the destination, which makes it difficult to use CUDA 3D grid and block */
3039371c9d4SSatish Balay   if (srcOpt) {
3049371c9d4SSatish Balay     srcx     = srcOpt->dx[0];
3059371c9d4SSatish Balay     srcy     = srcOpt->dy[0];
3069371c9d4SSatish Balay     srcX     = srcOpt->X[0];
3079371c9d4SSatish Balay     srcY     = srcOpt->Y[0];
3089371c9d4SSatish Balay     srcStart = srcOpt->start[0];
3099371c9d4SSatish Balay     srcIdx   = NULL;
3109371c9d4SSatish Balay   } else if (!srcIdx) {
3119371c9d4SSatish Balay     srcx = srcX = count;
3129371c9d4SSatish Balay     srcy = srcY = 1;
3139371c9d4SSatish Balay   }
314914b7a73SJunchao Zhang 
3159371c9d4SSatish Balay   if (dstOpt) {
3169371c9d4SSatish Balay     dstx     = dstOpt->dx[0];
3179371c9d4SSatish Balay     dsty     = dstOpt->dy[0];
3189371c9d4SSatish Balay     dstX     = dstOpt->X[0];
3199371c9d4SSatish Balay     dstY     = dstOpt->Y[0];
3209371c9d4SSatish Balay     dstStart = dstOpt->start[0];
3219371c9d4SSatish Balay     dstIdx   = NULL;
3229371c9d4SSatish Balay   } else if (!dstIdx) {
3239371c9d4SSatish Balay     dstx = dstX = count;
3249371c9d4SSatish Balay     dsty = dstY = 1;
3259371c9d4SSatish Balay   }
326914b7a73SJunchao Zhang 
3279371c9d4SSatish Balay   Kokkos::parallel_for(
3289371c9d4SSatish Balay     Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) {
329914b7a73SJunchao Zhang       PetscInt i, j, k, s, t;
330914b7a73SJunchao Zhang       Op       op;
331914b7a73SJunchao Zhang       if (!srcIdx) { /* src is in 3D */
332914b7a73SJunchao Zhang         k = tid / (srcx * srcy);
333914b7a73SJunchao Zhang         j = (tid - k * srcx * srcy) / srcx;
334914b7a73SJunchao Zhang         i = tid - k * srcx * srcy - j * srcx;
335914b7a73SJunchao Zhang         s = srcStart + k * srcX * srcY + j * srcX + i;
336914b7a73SJunchao Zhang       } else { /* src is contiguous */
337914b7a73SJunchao Zhang         s = srcIdx[tid];
338914b7a73SJunchao Zhang       }
339914b7a73SJunchao Zhang 
340914b7a73SJunchao Zhang       if (!dstIdx) { /* 3D */
341914b7a73SJunchao Zhang         k = tid / (dstx * dsty);
342914b7a73SJunchao Zhang         j = (tid - k * dstx * dsty) / dstx;
343914b7a73SJunchao Zhang         i = tid - k * dstx * dsty - j * dstx;
344914b7a73SJunchao Zhang         t = dstStart + k * dstX * dstY + j * dstX + i;
345914b7a73SJunchao Zhang       } else { /* contiguous */
346914b7a73SJunchao Zhang         t = dstIdx[tid];
347914b7a73SJunchao Zhang       }
348914b7a73SJunchao Zhang 
349914b7a73SJunchao Zhang       s *= MBS;
350914b7a73SJunchao Zhang       t *= MBS;
351914b7a73SJunchao Zhang       for (i = 0; i < MBS; i++) op(dst[t + i], src[s + i]);
352914b7a73SJunchao Zhang     });
3533ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
354914b7a73SJunchao Zhang }
355914b7a73SJunchao Zhang 
356914b7a73SJunchao Zhang /* Specialization for Insert since we may use memcpy */
357914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ>
ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt * srcIdx,const void * src_,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt * dstIdx,void * dst_)358d71ae5a4SJacob Faibussowitsch static PetscErrorCode ScatterAndInsert(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src_, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst_)
359d71ae5a4SJacob Faibussowitsch {
360914b7a73SJunchao Zhang   const Type          *src  = static_cast<const Type *>(src_);
361914b7a73SJunchao Zhang   Type                *dst  = static_cast<Type *>(dst_);
362*4df4a32cSJunchao Zhang   DeviceExecutionSpace exec = PetscGetKokkosExecutionSpace();
363914b7a73SJunchao Zhang 
364914b7a73SJunchao Zhang   PetscFunctionBegin;
3653ba16761SJacob Faibussowitsch   if (!count) PetscFunctionReturn(PETSC_SUCCESS);
366914b7a73SJunchao Zhang   /*src and dst are contiguous */
367914b7a73SJunchao Zhang   if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
368914b7a73SJunchao Zhang     size_t              sz = count * link->unitbytes;
369914b7a73SJunchao Zhang     deviceBuffer_t      dbuf(reinterpret_cast<char *>(dst + dstStart * link->bs), sz);
370914b7a73SJunchao Zhang     deviceConstBuffer_t sbuf(reinterpret_cast<const char *>(src + srcStart * link->bs), sz);
371914b7a73SJunchao Zhang     Kokkos::deep_copy(exec, dbuf, sbuf);
372914b7a73SJunchao Zhang   } else {
3739566063dSJacob Faibussowitsch     PetscCall(ScatterAndOp<Type, Insert<Type>, BS, EQ>(link, count, srcStart, srcOpt, srcIdx, src, dstStart, dstOpt, dstIdx, dst));
374914b7a73SJunchao Zhang   }
3753ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
376914b7a73SJunchao Zhang }
377914b7a73SJunchao Zhang 
378914b7a73SJunchao Zhang template <typename Type, class Op, PetscInt BS, PetscInt EQ>
FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt * rootidx,void * rootdata_,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt * leafidx,const void * leafdata_,void * leafupdate_)379d71ae5a4SJacob Faibussowitsch static PetscErrorCode FetchAndOpLocal(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata_, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata_, void *leafupdate_)
380d71ae5a4SJacob Faibussowitsch {
381914b7a73SJunchao Zhang   Op                   op;
382914b7a73SJunchao Zhang   const PetscInt       M = (EQ) ? 1 : link->bs / BS, MBS = M * BS;
383914b7a73SJunchao Zhang   const PetscInt      *ropt     = rootopt ? rootopt->array : NULL;
384914b7a73SJunchao Zhang   const PetscInt      *lopt     = leafopt ? leafopt->array : NULL;
385914b7a73SJunchao Zhang   Type                *rootdata = static_cast<Type *>(rootdata_), *leafupdate = static_cast<Type *>(leafupdate_);
386914b7a73SJunchao Zhang   const Type          *leafdata = static_cast<const Type *>(leafdata_);
387*4df4a32cSJunchao Zhang   DeviceExecutionSpace exec     = PetscGetKokkosExecutionSpace();
388914b7a73SJunchao Zhang 
389914b7a73SJunchao Zhang   PetscFunctionBegin;
3909371c9d4SSatish Balay   Kokkos::parallel_for(
3919371c9d4SSatish Balay     Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) {
392914b7a73SJunchao Zhang       PetscInt r = (ropt ? MapTidToIndex(ropt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS;
393914b7a73SJunchao Zhang       PetscInt l = (lopt ? MapTidToIndex(lopt, tid) : (leafidx ? leafidx[tid] : leafstart + tid)) * MBS;
394914b7a73SJunchao Zhang       for (int i = 0; i < MBS; i++) leafupdate[l + i] = op(rootdata[r + i], leafdata[l + i]);
395914b7a73SJunchao Zhang     });
3963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
397914b7a73SJunchao Zhang }
398914b7a73SJunchao Zhang 
/*====================================================================================*/
/*  Init various types and instantiate pack/unpack function pointers                  */
/*====================================================================================*/
402914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ>
PackInit_RealType(PetscSFLink link)403d71ae5a4SJacob Faibussowitsch static void PackInit_RealType(PetscSFLink link)
404d71ae5a4SJacob Faibussowitsch {
405914b7a73SJunchao Zhang   /* Pack/unpack for remote communication */
406914b7a73SJunchao Zhang   link->d_Pack            = Pack<Type, BS, EQ>;
407914b7a73SJunchao Zhang   link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>;
408914b7a73SJunchao Zhang   link->d_UnpackAndAdd    = UnpackAndOp<Type, Add<Type>, BS, EQ>;
409914b7a73SJunchao Zhang   link->d_UnpackAndMult   = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
410914b7a73SJunchao Zhang   link->d_UnpackAndMin    = UnpackAndOp<Type, Min<Type>, BS, EQ>;
411914b7a73SJunchao Zhang   link->d_UnpackAndMax    = UnpackAndOp<Type, Max<Type>, BS, EQ>;
412914b7a73SJunchao Zhang   link->d_FetchAndAdd     = FetchAndOp<Type, Add<Type>, BS, EQ>;
413914b7a73SJunchao Zhang   /* Scatter for local communication */
414914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>; /* Has special optimizations */
415914b7a73SJunchao Zhang   link->d_ScatterAndAdd    = ScatterAndOp<Type, Add<Type>, BS, EQ>;
416914b7a73SJunchao Zhang   link->d_ScatterAndMult   = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
417914b7a73SJunchao Zhang   link->d_ScatterAndMin    = ScatterAndOp<Type, Min<Type>, BS, EQ>;
418914b7a73SJunchao Zhang   link->d_ScatterAndMax    = ScatterAndOp<Type, Max<Type>, BS, EQ>;
419914b7a73SJunchao Zhang   link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
420914b7a73SJunchao Zhang   /* Atomic versions when there are data-race possibilities */
421914b7a73SJunchao Zhang   link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
422914b7a73SJunchao Zhang   link->da_UnpackAndAdd    = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
423914b7a73SJunchao Zhang   link->da_UnpackAndMult   = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
424914b7a73SJunchao Zhang   link->da_UnpackAndMin    = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
425914b7a73SJunchao Zhang   link->da_UnpackAndMax    = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
426914b7a73SJunchao Zhang   link->da_FetchAndAdd     = FetchAndOp<Type, AtomicFetchAdd<Type>, BS, EQ>;
427914b7a73SJunchao Zhang 
428914b7a73SJunchao Zhang   link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
429914b7a73SJunchao Zhang   link->da_ScatterAndAdd    = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
430914b7a73SJunchao Zhang   link->da_ScatterAndMult   = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
431914b7a73SJunchao Zhang   link->da_ScatterAndMin    = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
432914b7a73SJunchao Zhang   link->da_ScatterAndMax    = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
433914b7a73SJunchao Zhang   link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicFetchAdd<Type>, BS, EQ>;
434914b7a73SJunchao Zhang }
435914b7a73SJunchao Zhang 
436914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ>
PackInit_IntegerType(PetscSFLink link)437d71ae5a4SJacob Faibussowitsch static void PackInit_IntegerType(PetscSFLink link)
438d71ae5a4SJacob Faibussowitsch {
439914b7a73SJunchao Zhang   link->d_Pack            = Pack<Type, BS, EQ>;
440914b7a73SJunchao Zhang   link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>;
441914b7a73SJunchao Zhang   link->d_UnpackAndAdd    = UnpackAndOp<Type, Add<Type>, BS, EQ>;
442914b7a73SJunchao Zhang   link->d_UnpackAndMult   = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
443914b7a73SJunchao Zhang   link->d_UnpackAndMin    = UnpackAndOp<Type, Min<Type>, BS, EQ>;
444914b7a73SJunchao Zhang   link->d_UnpackAndMax    = UnpackAndOp<Type, Max<Type>, BS, EQ>;
445914b7a73SJunchao Zhang   link->d_UnpackAndLAND   = UnpackAndOp<Type, LAND<Type>, BS, EQ>;
446914b7a73SJunchao Zhang   link->d_UnpackAndLOR    = UnpackAndOp<Type, LOR<Type>, BS, EQ>;
447914b7a73SJunchao Zhang   link->d_UnpackAndLXOR   = UnpackAndOp<Type, LXOR<Type>, BS, EQ>;
448914b7a73SJunchao Zhang   link->d_UnpackAndBAND   = UnpackAndOp<Type, BAND<Type>, BS, EQ>;
449914b7a73SJunchao Zhang   link->d_UnpackAndBOR    = UnpackAndOp<Type, BOR<Type>, BS, EQ>;
450914b7a73SJunchao Zhang   link->d_UnpackAndBXOR   = UnpackAndOp<Type, BXOR<Type>, BS, EQ>;
451914b7a73SJunchao Zhang   link->d_FetchAndAdd     = FetchAndOp<Type, Add<Type>, BS, EQ>;
452914b7a73SJunchao Zhang 
453914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
454914b7a73SJunchao Zhang   link->d_ScatterAndAdd    = ScatterAndOp<Type, Add<Type>, BS, EQ>;
455914b7a73SJunchao Zhang   link->d_ScatterAndMult   = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
456914b7a73SJunchao Zhang   link->d_ScatterAndMin    = ScatterAndOp<Type, Min<Type>, BS, EQ>;
457914b7a73SJunchao Zhang   link->d_ScatterAndMax    = ScatterAndOp<Type, Max<Type>, BS, EQ>;
458914b7a73SJunchao Zhang   link->d_ScatterAndLAND   = ScatterAndOp<Type, LAND<Type>, BS, EQ>;
459914b7a73SJunchao Zhang   link->d_ScatterAndLOR    = ScatterAndOp<Type, LOR<Type>, BS, EQ>;
460914b7a73SJunchao Zhang   link->d_ScatterAndLXOR   = ScatterAndOp<Type, LXOR<Type>, BS, EQ>;
461914b7a73SJunchao Zhang   link->d_ScatterAndBAND   = ScatterAndOp<Type, BAND<Type>, BS, EQ>;
462914b7a73SJunchao Zhang   link->d_ScatterAndBOR    = ScatterAndOp<Type, BOR<Type>, BS, EQ>;
463914b7a73SJunchao Zhang   link->d_ScatterAndBXOR   = ScatterAndOp<Type, BXOR<Type>, BS, EQ>;
464914b7a73SJunchao Zhang   link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
465914b7a73SJunchao Zhang 
466914b7a73SJunchao Zhang   link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
467914b7a73SJunchao Zhang   link->da_UnpackAndAdd    = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
468914b7a73SJunchao Zhang   link->da_UnpackAndMult   = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
469914b7a73SJunchao Zhang   link->da_UnpackAndMin    = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
470914b7a73SJunchao Zhang   link->da_UnpackAndMax    = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
471914b7a73SJunchao Zhang   link->da_UnpackAndLAND   = UnpackAndOp<Type, AtomicLAND<Type>, BS, EQ>;
472914b7a73SJunchao Zhang   link->da_UnpackAndLOR    = UnpackAndOp<Type, AtomicLOR<Type>, BS, EQ>;
473914b7a73SJunchao Zhang   link->da_UnpackAndBAND   = UnpackAndOp<Type, AtomicBAND<Type>, BS, EQ>;
474914b7a73SJunchao Zhang   link->da_UnpackAndBOR    = UnpackAndOp<Type, AtomicBOR<Type>, BS, EQ>;
475914b7a73SJunchao Zhang   link->da_UnpackAndBXOR   = UnpackAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
476914b7a73SJunchao Zhang   link->da_FetchAndAdd     = FetchAndOp<Type, AtomicFetchAdd<Type>, BS, EQ>;
477914b7a73SJunchao Zhang 
478914b7a73SJunchao Zhang   link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
479914b7a73SJunchao Zhang   link->da_ScatterAndAdd    = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
480914b7a73SJunchao Zhang   link->da_ScatterAndMult   = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
481914b7a73SJunchao Zhang   link->da_ScatterAndMin    = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
482914b7a73SJunchao Zhang   link->da_ScatterAndMax    = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
483914b7a73SJunchao Zhang   link->da_ScatterAndLAND   = ScatterAndOp<Type, AtomicLAND<Type>, BS, EQ>;
484914b7a73SJunchao Zhang   link->da_ScatterAndLOR    = ScatterAndOp<Type, AtomicLOR<Type>, BS, EQ>;
485914b7a73SJunchao Zhang   link->da_ScatterAndBAND   = ScatterAndOp<Type, AtomicBAND<Type>, BS, EQ>;
486914b7a73SJunchao Zhang   link->da_ScatterAndBOR    = ScatterAndOp<Type, AtomicBOR<Type>, BS, EQ>;
487914b7a73SJunchao Zhang   link->da_ScatterAndBXOR   = ScatterAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
488914b7a73SJunchao Zhang   link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicFetchAdd<Type>, BS, EQ>;
489914b7a73SJunchao Zhang }
490914b7a73SJunchao Zhang 
491914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
492914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ>
PackInit_ComplexType(PetscSFLink link)493d71ae5a4SJacob Faibussowitsch static void PackInit_ComplexType(PetscSFLink link)
494d71ae5a4SJacob Faibussowitsch {
495914b7a73SJunchao Zhang   link->d_Pack            = Pack<Type, BS, EQ>;
496914b7a73SJunchao Zhang   link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>;
497914b7a73SJunchao Zhang   link->d_UnpackAndAdd    = UnpackAndOp<Type, Add<Type>, BS, EQ>;
498914b7a73SJunchao Zhang   link->d_UnpackAndMult   = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
499914b7a73SJunchao Zhang   link->d_FetchAndAdd     = FetchAndOp<Type, Add<Type>, BS, EQ>;
500914b7a73SJunchao Zhang 
501914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
502914b7a73SJunchao Zhang   link->d_ScatterAndAdd    = ScatterAndOp<Type, Add<Type>, BS, EQ>;
503914b7a73SJunchao Zhang   link->d_ScatterAndMult   = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
504914b7a73SJunchao Zhang   link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
505914b7a73SJunchao Zhang 
506914b7a73SJunchao Zhang   link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
507914b7a73SJunchao Zhang   link->da_UnpackAndAdd    = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
508914b7a73SJunchao Zhang   link->da_UnpackAndMult   = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
509914b7a73SJunchao Zhang   link->da_FetchAndAdd     = FetchAndOp<Type, AtomicFetchAdd<Type>, BS, EQ>;
510914b7a73SJunchao Zhang 
511914b7a73SJunchao Zhang   link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
512914b7a73SJunchao Zhang   link->da_ScatterAndAdd    = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
513914b7a73SJunchao Zhang   link->da_ScatterAndMult   = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
514914b7a73SJunchao Zhang   link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicFetchAdd<Type>, BS, EQ>;
515914b7a73SJunchao Zhang }
516914b7a73SJunchao Zhang #endif
517914b7a73SJunchao Zhang 
518914b7a73SJunchao Zhang template <typename Type>
PackInit_PairType(PetscSFLink link)519d71ae5a4SJacob Faibussowitsch static void PackInit_PairType(PetscSFLink link)
520d71ae5a4SJacob Faibussowitsch {
521914b7a73SJunchao Zhang   link->d_Pack            = Pack<Type, 1, 1>;
522914b7a73SJunchao Zhang   link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, 1, 1>;
523914b7a73SJunchao Zhang   link->d_UnpackAndMaxloc = UnpackAndOp<Type, Maxloc<Type>, 1, 1>;
524914b7a73SJunchao Zhang   link->d_UnpackAndMinloc = UnpackAndOp<Type, Minloc<Type>, 1, 1>;
525914b7a73SJunchao Zhang 
526914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndOp<Type, Insert<Type>, 1, 1>;
527914b7a73SJunchao Zhang   link->d_ScatterAndMaxloc = ScatterAndOp<Type, Maxloc<Type>, 1, 1>;
528914b7a73SJunchao Zhang   link->d_ScatterAndMinloc = ScatterAndOp<Type, Minloc<Type>, 1, 1>;
529914b7a73SJunchao Zhang   /* Atomics for pair types are not implemented yet */
530914b7a73SJunchao Zhang }
531914b7a73SJunchao Zhang 
532914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ>
PackInit_DumbType(PetscSFLink link)533d71ae5a4SJacob Faibussowitsch static void PackInit_DumbType(PetscSFLink link)
534d71ae5a4SJacob Faibussowitsch {
535914b7a73SJunchao Zhang   link->d_Pack             = Pack<Type, BS, EQ>;
536914b7a73SJunchao Zhang   link->d_UnpackAndInsert  = UnpackAndOp<Type, Insert<Type>, BS, EQ>;
537914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
538914b7a73SJunchao Zhang   /* Atomics for dumb types are not implemented yet */
539914b7a73SJunchao Zhang }
540914b7a73SJunchao Zhang 
/*
  Kokkos::DefaultExecutionSpace(stream) is a reference-counted pointer object. It has a bug:
  one is not able to repeatedly create and destroy such an object. SF's original design was that each
  SFLink has a stream (NULL or not) and hence an execution space object. The bug prevents us from
  destroying multiple SFLinks that have a NULL stream and the default execution space object. To avoid
  memory leaks, SF_Kokkos only supports the NULL stream, which is also PETSc's default scheme. SF_Kokkos
  does not do its own new/delete. It just uses Kokkos::DefaultExecutionSpace(), which is a singleton
  object in Kokkos.
*/
550f4af43b4SJunchao Zhang /*
551914b7a73SJunchao Zhang static PetscErrorCode PetscSFLinkDestroy_Kokkos(PetscSFLink link)
552914b7a73SJunchao Zhang {
553914b7a73SJunchao Zhang   PetscFunctionBegin;
5543ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
555914b7a73SJunchao Zhang }
556f4af43b4SJunchao Zhang */
557914b7a73SJunchao Zhang 
558914b7a73SJunchao Zhang /* Some device-specific utilities */
PetscSFLinkSyncDevice_Kokkos(PetscSFLink PETSC_UNUSED link)559d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkSyncDevice_Kokkos(PetscSFLink PETSC_UNUSED link)
560d71ae5a4SJacob Faibussowitsch {
561914b7a73SJunchao Zhang   PetscFunctionBegin;
562914b7a73SJunchao Zhang   Kokkos::fence();
5633ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
564914b7a73SJunchao Zhang }
565914b7a73SJunchao Zhang 
PetscSFLinkSyncStream_Kokkos(PetscSFLink PETSC_UNUSED link)566d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkSyncStream_Kokkos(PetscSFLink PETSC_UNUSED link)
567d71ae5a4SJacob Faibussowitsch {
568*4df4a32cSJunchao Zhang   DeviceExecutionSpace exec = PetscGetKokkosExecutionSpace();
5694d86920dSPierre Jolivet 
570914b7a73SJunchao Zhang   PetscFunctionBegin;
571914b7a73SJunchao Zhang   exec.fence();
5723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
573914b7a73SJunchao Zhang }
574914b7a73SJunchao Zhang 
PetscSFLinkMemcpy_Kokkos(PetscSFLink PETSC_UNUSED link,PetscMemType dstmtype,void * dst,PetscMemType srcmtype,const void * src,size_t n)575d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkMemcpy_Kokkos(PetscSFLink PETSC_UNUSED link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
576d71ae5a4SJacob Faibussowitsch {
577*4df4a32cSJunchao Zhang   DeviceExecutionSpace exec = PetscGetKokkosExecutionSpace();
578914b7a73SJunchao Zhang 
579914b7a73SJunchao Zhang   PetscFunctionBegin;
5803ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
58145402d8aSJunchao Zhang   if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) { // H2H
58245402d8aSJunchao Zhang     PetscCallCXX(exec.fence());                                   // make sure async kernels on src are finished, in case of unified memory as on AMD MI300A.
5839566063dSJacob Faibussowitsch     PetscCall(PetscMemcpy(dst, src, n));
584914b7a73SJunchao Zhang   } else {
585e36ced11SJunchao Zhang     if (PetscMemTypeDevice(dstmtype) && PetscMemTypeHost(srcmtype)) { // H2D
586914b7a73SJunchao Zhang       deviceBuffer_t    dbuf(static_cast<char *>(dst), n);
587914b7a73SJunchao Zhang       HostConstBuffer_t sbuf(static_cast<const char *>(src), n);
588e36ced11SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(exec, dbuf, sbuf));
5899566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu(n));
590e36ced11SJunchao Zhang     } else if (PetscMemTypeHost(dstmtype) && PetscMemTypeDevice(srcmtype)) { // D2H
591914b7a73SJunchao Zhang       HostBuffer_t        dbuf(static_cast<char *>(dst), n);
592914b7a73SJunchao Zhang       deviceConstBuffer_t sbuf(static_cast<const char *>(src), n);
593e36ced11SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(exec, dbuf, sbuf));
594e36ced11SJunchao Zhang       PetscCallCXX(exec.fence()); // make sure dbuf is ready for use immediately on host
5959566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuToCpu(n));
59645402d8aSJunchao Zhang     } else if (PetscMemTypeDevice(dstmtype) && PetscMemTypeDevice(srcmtype)) { // D2D
597914b7a73SJunchao Zhang       deviceBuffer_t      dbuf(static_cast<char *>(dst), n);
598914b7a73SJunchao Zhang       deviceConstBuffer_t sbuf(static_cast<const char *>(src), n);
599e36ced11SJunchao Zhang       PetscCallCXX(Kokkos::deep_copy(exec, dbuf, sbuf));
600914b7a73SJunchao Zhang     }
601914b7a73SJunchao Zhang   }
6023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
603914b7a73SJunchao Zhang }
604914b7a73SJunchao Zhang 
PetscSFMalloc_Kokkos(PetscMemType mtype,size_t size,void ** ptr)605d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFMalloc_Kokkos(PetscMemType mtype, size_t size, void **ptr)
606d71ae5a4SJacob Faibussowitsch {
607914b7a73SJunchao Zhang   PetscFunctionBegin;
6089566063dSJacob Faibussowitsch   if (PetscMemTypeHost(mtype)) PetscCall(PetscMalloc(size, ptr));
60971438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype)) {
6109566063dSJacob Faibussowitsch     if (!PetscKokkosInitialized) PetscCall(PetscKokkosInitializeCheck());
61145402d8aSJunchao Zhang     PetscCallCXX(*ptr = Kokkos::kokkos_malloc<DefaultMemorySpace>(size));
61298921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
6133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
614914b7a73SJunchao Zhang }
615914b7a73SJunchao Zhang 
PetscSFFree_Kokkos(PetscMemType mtype,void * ptr)616d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFFree_Kokkos(PetscMemType mtype, void *ptr)
617d71ae5a4SJacob Faibussowitsch {
618914b7a73SJunchao Zhang   PetscFunctionBegin;
6199566063dSJacob Faibussowitsch   if (PetscMemTypeHost(mtype)) PetscCall(PetscFree(ptr));
6209371c9d4SSatish Balay   else if (PetscMemTypeDevice(mtype)) {
62145402d8aSJunchao Zhang     PetscCallCXX(Kokkos::kokkos_free<DefaultMemorySpace>(ptr));
6229371c9d4SSatish Balay   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
6233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
624914b7a73SJunchao Zhang }
625914b7a73SJunchao Zhang 
62671438e86SJunchao Zhang /* Destructor when the link uses MPI for communication */
PetscSFLinkDestroy_Kokkos(PetscSF sf,PetscSFLink link)627d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkDestroy_Kokkos(PetscSF sf, PetscSFLink link)
628d71ae5a4SJacob Faibussowitsch {
62971438e86SJunchao Zhang   PetscFunctionBegin;
63071438e86SJunchao Zhang   for (int i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
6319566063dSJacob Faibussowitsch     PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]));
6329566063dSJacob Faibussowitsch     PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]));
63371438e86SJunchao Zhang   }
6343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
63571438e86SJunchao Zhang }
636914b7a73SJunchao Zhang 
637914b7a73SJunchao Zhang /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what needed on device */
PetscSFLinkSetUp_Kokkos(PetscSF PETSC_UNUSED sf,PetscSFLink link,MPI_Datatype unit)638d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF PETSC_UNUSED sf, PetscSFLink link, MPI_Datatype unit)
639d71ae5a4SJacob Faibussowitsch {
640914b7a73SJunchao Zhang   PetscInt  nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
641914b7a73SJunchao Zhang   PetscBool is2Int, is2PetscInt;
642914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
643914b7a73SJunchao Zhang   PetscInt nPetscComplex = 0;
644914b7a73SJunchao Zhang #endif
645914b7a73SJunchao Zhang 
646914b7a73SJunchao Zhang   PetscFunctionBegin;
6473ba16761SJacob Faibussowitsch   if (link->deviceinited) PetscFunctionReturn(PETSC_SUCCESS);
6489566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
6499566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
6509566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
651914b7a73SJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
6529566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
6539566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
6549566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
655914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
6569566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
657914b7a73SJunchao Zhang #endif
6589566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
6599566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
660914b7a73SJunchao Zhang 
661914b7a73SJunchao Zhang   if (is2Int) {
662914b7a73SJunchao Zhang     PackInit_PairType<Kokkos::pair<int, int>>(link);
663914b7a73SJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
664914b7a73SJunchao Zhang     PackInit_PairType<Kokkos::pair<PetscInt, PetscInt>>(link);
665914b7a73SJunchao Zhang   } else if (nPetscReal) {
666d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE) /* Skip the unimportant stuff to speed up SF device compilation time */
6679371c9d4SSatish Balay     if (nPetscReal == 8) PackInit_RealType<PetscReal, 8, 1>(link);
6689371c9d4SSatish Balay     else if (nPetscReal % 8 == 0) PackInit_RealType<PetscReal, 8, 0>(link);
6699371c9d4SSatish Balay     else if (nPetscReal == 4) PackInit_RealType<PetscReal, 4, 1>(link);
6709371c9d4SSatish Balay     else if (nPetscReal % 4 == 0) PackInit_RealType<PetscReal, 4, 0>(link);
6719371c9d4SSatish Balay     else if (nPetscReal == 2) PackInit_RealType<PetscReal, 2, 1>(link);
6729371c9d4SSatish Balay     else if (nPetscReal % 2 == 0) PackInit_RealType<PetscReal, 2, 0>(link);
6739371c9d4SSatish Balay     else if (nPetscReal == 1) PackInit_RealType<PetscReal, 1, 1>(link);
6749371c9d4SSatish Balay     else if (nPetscReal % 1 == 0)
675eee4e20aSJunchao Zhang #endif
676d941a2f0SJunchao Zhang       PackInit_RealType<PetscReal, 1, 0>(link);
677874d28e3SJunchao Zhang   } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
678d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE)
6799371c9d4SSatish Balay     if (nPetscInt == 8) PackInit_IntegerType<llint, 8, 1>(link);
6809371c9d4SSatish Balay     else if (nPetscInt % 8 == 0) PackInit_IntegerType<llint, 8, 0>(link);
6819371c9d4SSatish Balay     else if (nPetscInt == 4) PackInit_IntegerType<llint, 4, 1>(link);
6829371c9d4SSatish Balay     else if (nPetscInt % 4 == 0) PackInit_IntegerType<llint, 4, 0>(link);
6839371c9d4SSatish Balay     else if (nPetscInt == 2) PackInit_IntegerType<llint, 2, 1>(link);
6849371c9d4SSatish Balay     else if (nPetscInt % 2 == 0) PackInit_IntegerType<llint, 2, 0>(link);
6859371c9d4SSatish Balay     else if (nPetscInt == 1) PackInit_IntegerType<llint, 1, 1>(link);
6869371c9d4SSatish Balay     else if (nPetscInt % 1 == 0)
687eee4e20aSJunchao Zhang #endif
688d941a2f0SJunchao Zhang       PackInit_IntegerType<llint, 1, 0>(link);
689914b7a73SJunchao Zhang   } else if (nInt) {
690d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE)
6919371c9d4SSatish Balay     if (nInt == 8) PackInit_IntegerType<int, 8, 1>(link);
6929371c9d4SSatish Balay     else if (nInt % 8 == 0) PackInit_IntegerType<int, 8, 0>(link);
6939371c9d4SSatish Balay     else if (nInt == 4) PackInit_IntegerType<int, 4, 1>(link);
6949371c9d4SSatish Balay     else if (nInt % 4 == 0) PackInit_IntegerType<int, 4, 0>(link);
6959371c9d4SSatish Balay     else if (nInt == 2) PackInit_IntegerType<int, 2, 1>(link);
6969371c9d4SSatish Balay     else if (nInt % 2 == 0) PackInit_IntegerType<int, 2, 0>(link);
6979371c9d4SSatish Balay     else if (nInt == 1) PackInit_IntegerType<int, 1, 1>(link);
6989371c9d4SSatish Balay     else if (nInt % 1 == 0)
699eee4e20aSJunchao Zhang #endif
700d941a2f0SJunchao Zhang       PackInit_IntegerType<int, 1, 0>(link);
701914b7a73SJunchao Zhang   } else if (nSignedChar) {
702d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE)
7039371c9d4SSatish Balay     if (nSignedChar == 8) PackInit_IntegerType<char, 8, 1>(link);
7049371c9d4SSatish Balay     else if (nSignedChar % 8 == 0) PackInit_IntegerType<char, 8, 0>(link);
7059371c9d4SSatish Balay     else if (nSignedChar == 4) PackInit_IntegerType<char, 4, 1>(link);
7069371c9d4SSatish Balay     else if (nSignedChar % 4 == 0) PackInit_IntegerType<char, 4, 0>(link);
7079371c9d4SSatish Balay     else if (nSignedChar == 2) PackInit_IntegerType<char, 2, 1>(link);
7089371c9d4SSatish Balay     else if (nSignedChar % 2 == 0) PackInit_IntegerType<char, 2, 0>(link);
7099371c9d4SSatish Balay     else if (nSignedChar == 1) PackInit_IntegerType<char, 1, 1>(link);
7109371c9d4SSatish Balay     else if (nSignedChar % 1 == 0)
711eee4e20aSJunchao Zhang #endif
712d941a2f0SJunchao Zhang       PackInit_IntegerType<char, 1, 0>(link);
713914b7a73SJunchao Zhang   } else if (nUnsignedChar) {
714d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE)
7159371c9d4SSatish Balay     if (nUnsignedChar == 8) PackInit_IntegerType<unsigned char, 8, 1>(link);
7169371c9d4SSatish Balay     else if (nUnsignedChar % 8 == 0) PackInit_IntegerType<unsigned char, 8, 0>(link);
7179371c9d4SSatish Balay     else if (nUnsignedChar == 4) PackInit_IntegerType<unsigned char, 4, 1>(link);
7189371c9d4SSatish Balay     else if (nUnsignedChar % 4 == 0) PackInit_IntegerType<unsigned char, 4, 0>(link);
7199371c9d4SSatish Balay     else if (nUnsignedChar == 2) PackInit_IntegerType<unsigned char, 2, 1>(link);
7209371c9d4SSatish Balay     else if (nUnsignedChar % 2 == 0) PackInit_IntegerType<unsigned char, 2, 0>(link);
7219371c9d4SSatish Balay     else if (nUnsignedChar == 1) PackInit_IntegerType<unsigned char, 1, 1>(link);
7229371c9d4SSatish Balay     else if (nUnsignedChar % 1 == 0)
723eee4e20aSJunchao Zhang #endif
724d941a2f0SJunchao Zhang       PackInit_IntegerType<unsigned char, 1, 0>(link);
725914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
726914b7a73SJunchao Zhang   } else if (nPetscComplex) {
727d941a2f0SJunchao Zhang   #if !defined(PETSC_HAVE_DEVICE)
7289371c9d4SSatish Balay     if (nPetscComplex == 8) PackInit_ComplexType<Kokkos::complex<PetscReal>, 8, 1>(link);
7299371c9d4SSatish Balay     else if (nPetscComplex % 8 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>, 8, 0>(link);
7309371c9d4SSatish Balay     else if (nPetscComplex == 4) PackInit_ComplexType<Kokkos::complex<PetscReal>, 4, 1>(link);
7319371c9d4SSatish Balay     else if (nPetscComplex % 4 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>, 4, 0>(link);
7329371c9d4SSatish Balay     else if (nPetscComplex == 2) PackInit_ComplexType<Kokkos::complex<PetscReal>, 2, 1>(link);
7339371c9d4SSatish Balay     else if (nPetscComplex % 2 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>, 2, 0>(link);
7349371c9d4SSatish Balay     else if (nPetscComplex == 1) PackInit_ComplexType<Kokkos::complex<PetscReal>, 1, 1>(link);
7359371c9d4SSatish Balay     else if (nPetscComplex % 1 == 0)
736eee4e20aSJunchao Zhang   #endif
737d941a2f0SJunchao Zhang       PackInit_ComplexType<Kokkos::complex<PetscReal>, 1, 0>(link);
738914b7a73SJunchao Zhang #endif
739914b7a73SJunchao Zhang   } else {
740e1187f0dSToby Isaac     MPI_Aint nbyte;
741e1187f0dSToby Isaac 
742e1187f0dSToby Isaac     PetscCall(PetscSFGetDatatypeSize_Internal(PETSC_COMM_SELF, unit, &nbyte));
743914b7a73SJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
744d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE)
7459371c9d4SSatish Balay       if (nbyte == 4) PackInit_DumbType<char, 4, 1>(link);
7469371c9d4SSatish Balay       else if (nbyte % 4 == 0) PackInit_DumbType<char, 4, 0>(link);
7479371c9d4SSatish Balay       else if (nbyte == 2) PackInit_DumbType<char, 2, 1>(link);
7489371c9d4SSatish Balay       else if (nbyte % 2 == 0) PackInit_DumbType<char, 2, 0>(link);
7499371c9d4SSatish Balay       else if (nbyte == 1) PackInit_DumbType<char, 1, 1>(link);
7509371c9d4SSatish Balay       else if (nbyte % 1 == 0)
751eee4e20aSJunchao Zhang #endif
752d941a2f0SJunchao Zhang         PackInit_DumbType<char, 1, 0>(link);
753914b7a73SJunchao Zhang     } else {
754d279a5e3SJunchao Zhang       PetscCall(PetscIntCast(nbyte / sizeof(int), &nInt));
755d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE)
7569371c9d4SSatish Balay       if (nInt == 8) PackInit_DumbType<int, 8, 1>(link);
7579371c9d4SSatish Balay       else if (nInt % 8 == 0) PackInit_DumbType<int, 8, 0>(link);
7589371c9d4SSatish Balay       else if (nInt == 4) PackInit_DumbType<int, 4, 1>(link);
7599371c9d4SSatish Balay       else if (nInt % 4 == 0) PackInit_DumbType<int, 4, 0>(link);
7609371c9d4SSatish Balay       else if (nInt == 2) PackInit_DumbType<int, 2, 1>(link);
7619371c9d4SSatish Balay       else if (nInt % 2 == 0) PackInit_DumbType<int, 2, 0>(link);
7629371c9d4SSatish Balay       else if (nInt == 1) PackInit_DumbType<int, 1, 1>(link);
7639371c9d4SSatish Balay       else if (nInt % 1 == 0)
764eee4e20aSJunchao Zhang #endif
765d941a2f0SJunchao Zhang         PackInit_DumbType<int, 1, 0>(link);
766914b7a73SJunchao Zhang     }
767914b7a73SJunchao Zhang   }
768914b7a73SJunchao Zhang 
76971438e86SJunchao Zhang   link->SyncDevice   = PetscSFLinkSyncDevice_Kokkos;
77071438e86SJunchao Zhang   link->SyncStream   = PetscSFLinkSyncStream_Kokkos;
77120c24465SJunchao Zhang   link->Memcpy       = PetscSFLinkMemcpy_Kokkos;
77271438e86SJunchao Zhang   link->Destroy      = PetscSFLinkDestroy_Kokkos;
773914b7a73SJunchao Zhang   link->deviceinited = PETSC_TRUE;
7743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
775914b7a73SJunchao Zhang }
776