xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 21e3ffae2f3b73c0bd738cf6d0a809700fc04bb0)
1 #include "petsc/private/sfimpl.h"
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
6 
7 /*
8  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
9  * therefore we pack data types manually. This file defines packing routines for the standard data types.
10  */
11 
/* CPPJoin4(a,b,c,d) pastes its four arguments into the single identifier a_b_c_d; used to synthesize unique routine names */
#define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d

/* Operations working like s += t */
#define OP_BINARY(op, s, t) \
  do { \
    (s) = (s)op(t); \
  } while (0) /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op, s, t) \
  do { \
    (s) = op((s), (t)); \
  } while (0) /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op, s, t) \
  do { \
    (s) = (!(s)) != (!(t)); \
  } while (0) /* logical exclusive OR; the op argument is unused */
#define OP_ASSIGN(op, s, t) \
  do { \
    (s) = (t); \
  } while (0) /* plain assignment s = t; the op argument is unused */
/* Ref MPI MAXLOC */
#define OP_XLOC(op, s, t) \
  do { \
    if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \
    else if (!((s).u op(t).u)) s = t; \
  } while (0) /* s,t are {u,i} pairs; on a tie in u keep the smaller index i, matching MPI_MAXLOC/MPI_MINLOC semantics */
37 
38 /* DEF_PackFunc - macro defining a Pack routine
39 
40    Arguments of the macro:
41    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
42    .BS        Block size for vectorization. It is a factor of bsz.
43    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
44 
45    Arguments of the Pack routine:
46    +count     Number of indices in idx[].
47    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
48    .opt       Per-pack optimization plan. NULL means no such plan.
.idx       Indices of entries to be packed.
50    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
.unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
52    -packed    Address of the packed data.
53  */
#define DEF_PackFunc(Type, BS, EQ) \
  static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) \
  { \
    const Type    *u = (const Type *)unpacked, *u2; \
    Type          *p = (Type *)packed, *p2; \
    PetscInt       i, j, k, X, Y, r, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \
    else if (opt) { /* has optimizations available */ p2 = p; \
      for (r = 0; r < opt->n; r++) { /* loop over the r-th sub-block: dx[r] x dy[r] x dz[r] entries embedded in an X x Y x Z array */ \
        u2 = u + opt->start[r] * MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k = 0; k < opt->dz[r]; k++) \
          for (j = 0; j < opt->dy[r]; j++) { \
            PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); /* rows in the x-direction are contiguous, so memcpy each row */ \
            p2 += opt->dx[r] * MBS; \
          } \
      } \
    } else { \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \
    } \
    PetscFunctionReturn(PETSC_SUCCESS); \
  }
83 
/* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
85                 and inserts into a sparse array.
86 
87    Arguments:
88   .Type       Type of the data
89   .BS         Block size for vectorization
90   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
91 
92   Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
94  */
#define DEF_UnpackFunc(Type, BS, EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
  { \
    Type          *u = (Type *)unpacked, *u2; \
    const Type    *p = (const Type *)packed; \
    PetscInt       i, j, k, X, Y, r, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { /* destination entries are contiguous, beginning at 'start' */ \
      u += start * MBS; \
      if (u != p) PetscCall(PetscArraycpy(u, p, count *MBS)); /* skip the copy when unpacking in place */ \
    } else if (opt) { /* has optimizations available */ \
      for (r = 0; r < opt->n; r++) { /* each r describes a dx[r] x dy[r] x dz[r] sub-block of an X x Y x Z array */ \
        u2 = u + opt->start[r] * MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k = 0; k < opt->dz[r]; k++) \
          for (j = 0; j < opt->dy[r]; j++) { \
            PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \
            p += opt->dx[r] * MBS; \
          } \
      } \
    } else { \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \
    } \
    PetscFunctionReturn(PETSC_SUCCESS); \
  }
125 
126 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
127 
128    Arguments:
  +Type       Type of the data
  .BS         Block size for vectorization
  .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
  .Opname     Name of the Op, such as Add, Mult, LAND, etc.
  .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
  -OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
135  */
#define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
  { \
    Type          *u = (Type *)unpacked, *u2; \
    const Type    *p = (const Type *)packed; \
    PetscInt       i, j, k, X, Y, r, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { /* destination entries are contiguous, beginning at 'start' */ \
      u += start * MBS; \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
    } else if (opt) { /* idx[] has patterns */ \
      for (r = 0; r < opt->n; r++) { /* each r describes a dx[r] x dy[r] x dz[r] sub-block of an X x Y x Z array */ \
        u2 = u + opt->start[r] * MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k = 0; k < opt->dz[r]; k++) \
          for (j = 0; j < opt->dy[r]; j++) { \
            for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \
            p += opt->dx[r] * MBS; \
          } \
      } \
    } else { \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
    } \
    PetscFunctionReturn(PETSC_SUCCESS); \
  }
168 
/* DEF_FetchAndOp - macro defining a FetchAndOp routine: for each entry, save the old value of the
   unpacked (root) data into packed[], then update the unpacked data with the incoming packed value
   via Op, i.e., tmp = u; u = u Op p; p = tmp. The opt argument is accepted for signature
   uniformity with the other routines but is not used here. */
#define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) \
  { \
    Type          *u = (Type *)unpacked, *p = (Type *)packed, tmp; \
    PetscInt       i, j, k, r, l, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; \
    const PetscInt MBS = M * BS; \
    PetscFunctionBegin; \
    for (i = 0; i < count; i++) { \
      r = (!idx ? start + i : idx[i]) * MBS; /* offset into the sparse unpacked data */ \
      l = i * MBS;                           /* offset into the contiguous packed buffer */ \
      for (j = 0; j < M; j++) \
        for (k = 0; k < BS; k++) { \
          tmp = u[r + j * BS + k]; \
          OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \
          p[l + j * BS + k] = tmp; /* return the old value to the caller's buffer */ \
        } \
    } \
    PetscFunctionReturn(PETSC_SUCCESS); \
  }
189 
/* DEF_ScatterAndOp - macro defining a ScatterAndOp routine that moves data directly from one array
   (src) to another (dst) applying Op at the destination, without an intermediate pack buffer.
   Either side may be contiguous (NULL index array) or indexed; pattern optimizations (srcOpt/dstOpt)
   are exploited when applicable. */
#define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) \
  { \
    const Type    *u = (const Type *)src; \
    Type          *v = (Type *)dst; \
    PetscInt       i, j, k, s, t, X, Y, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; \
    const PetscInt MBS = M * BS; \
    PetscFunctionBegin; \
    if (!srcIdx) { /* src is contiguous */ \
      u += srcStart * MBS; \
      PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); /* reuse the unpack kernel: src acts as the packed buffer */ \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
      u += srcOpt->start[0] * MBS; \
      v += dstStart * MBS; \
      X = srcOpt->X[0]; \
      Y = srcOpt->Y[0]; \
      for (k = 0; k < srcOpt->dz[0]; k++) \
        for (j = 0; j < srcOpt->dy[0]; j++) { \
          for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \
          v += srcOpt->dx[0] * MBS; \
        } \
    } else { /* all other cases */ \
      for (i = 0; i < count; i++) { \
        s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \
        t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \
      } \
    } \
    PetscFunctionReturn(PETSC_SUCCESS); \
  }
222 
/* DEF_FetchAndOpLocal - macro defining a local (same-process) fetch-and-op routine:
   for each i, leafupdate receives the current rootdata value, then rootdata is updated
   with leafdata via Op. Both sides may be contiguous (NULL index array) or indexed;
   rootopt/leafopt are accepted for signature uniformity but not used here. */
#define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) \
  { \
    Type          *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \
    const Type    *ldata = (const Type *)leafdata; \
    PetscInt       i, j, k, r, l, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; \
    const PetscInt MBS = M * BS; \
    PetscFunctionBegin; \
    for (i = 0; i < count; i++) { \
      r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \
      l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \
      for (j = 0; j < M; j++) \
        for (k = 0; k < BS; k++) { \
          lupdate[l + j * BS + k] = rdata[r + j * BS + k]; /* fetch old root value first */ \
          OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \
        } \
    } \
    PetscFunctionReturn(PETSC_SUCCESS); \
  }
243 
244 /* Pack, Unpack/Fetch ops */
/* DEF_Pack - instantiate Pack, UnpackAndInsert and ScatterAndInsert for (Type,BS,EQ)
   and define a PackInit_Pack routine installing them as the link's host function pointers */
#define DEF_Pack(Type, BS, EQ) \
  DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_Pack             = CPPJoin4(Pack, Type, BS, EQ); \
    link->h_UnpackAndInsert  = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \
    link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \
  }
252 
253 /* Add, Mult ops */
/* DEF_Add - instantiate the Add/Mult unpack, scatter, and fetch-and-add routines for (Type,BS,EQ)
   and define a PackInit_Add routine installing them on a link */
#define DEF_Add(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult, Type, BS, EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd, Type, BS, EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult, Type, BS, EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \
  }
264 
265 /* Max, Min ops */
/* DEF_Cmp - instantiate the Max/Min unpack and scatter routines for (Type,BS,EQ)
   and define a PackInit_Compare routine installing them on a link */
#define DEF_Cmp(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
  }
274 
275 /* Logical ops.
276   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
277   the compilation warning "empty macro arguments are undefined in ISO C90"
278  */
/* DEF_Log - instantiate the logical AND/OR/XOR unpack and scatter routines for (Type,BS,EQ)
   and define a PackInit_Logical routine installing them on a link */
#define DEF_Log(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \
  }
289 
290 /* Bitwise ops */
/* DEF_Bit - instantiate the bitwise AND/OR/XOR unpack and scatter routines for (Type,BS,EQ)
   and define a PackInit_Bitwise routine installing them on a link */
#define DEF_Bit(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \
  }
301 
302 /* Maxloc, Minloc ops */
/* DEF_Xloc - instantiate Maxloc/Minloc routines for pair types via OP_XLOC. Note the generated
   routines are named UnpackAndMax etc. (from the Opname argument), but they are installed in the
   *loc slots; pair types use only DEF_Pack + DEF_Xloc so there is no name clash with DEF_Cmp. */
#define DEF_Xloc(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) \
  { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
  }
311 
/* DEF_IntegerType - integer types support insert, add/mult, max/min, logical and bitwise ops */
#define DEF_IntegerType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \
  }
321 
/* DEF_RealType - real types support insert, add/mult and max/min ops (no logical/bitwise) */
#define DEF_RealType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
  }
329 
#if defined(PETSC_HAVE_COMPLEX)
  /* DEF_ComplexType - complex types support insert and add/mult, but not max/min comparisons */
  #define DEF_ComplexType(Type, BS, EQ) \
    DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) \
    { \
      CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
      CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
    }
#endif
338 
/* DEF_DumbType - opaque byte streams support only pack/unpack-insert, no reductions */
#define DEF_DumbType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
  }
344 
345 /* Maxloc, Minloc */
/* DEF_PairType - {value,index} pair types support insert and Maxloc/Minloc */
#define DEF_PairType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) \
  { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \
  }
352 
/* Instantiate the pack/unpack routines for each supported basic type, for block sizes 1/2/4/8,
   and for both the EQ=1 (bs == BS, compile-time const) and EQ=0 (bs a multiple of BS) cases */
DEF_IntegerType(PetscInt, 1, 1)   /* unit = 1 MPIU_INT  */
  DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */
  DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */
  DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */
  DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */
  DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */
  DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */
  DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
  DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0)
#endif

  /* The typedefs are used to get a typename without space that CPPJoin can handle */
  typedef signed char SignedChar;
DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0)

  typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0)

  DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0)
#if defined(PETSC_HAVE_COMPLEX)
    DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0)
#endif

/* PairType(Type1,Type2) names a {u,i} struct used for MPI_MAXLOC/MPI_MINLOC-style reductions */
#define PairType(Type1, Type2) Type1##_##Type2
      typedef struct {
  int u;
  int i;
} PairType(int, int);
typedef struct {
  PetscInt u;
  PetscInt i;
} PairType(PetscInt, PetscInt);
DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1)

  /* If we don't know the basic type, we treat it as a stream of chars or ints */
  DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0)

    typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0)
394 
  /* Destroy a link: release device resources, the (derived) MPI datatype, persistent MPI requests,
     host pack buffers, and finally the link itself */
  PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscInt       i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8; /* 8 request sets per rank: 2 directions x 2 memtypes x 2 direct modes (see PetscSFLinkGetMPIBuffersAndRequests) */

  PetscFunctionBegin;
  /* Destroy device-specific fields */
  if (link->deviceinited) PetscCall((*link->Destroy)(sf, link));

  /* Destroy host related fields */
  if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit)); /* only derived (non-builtin) datatypes may be freed */
  if (!link->use_nvshmem) {
    for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i]));
    }
    PetscCall(PetscFree(link->reqs));
    for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
      PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]));
      PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]));
    }
  }
  PetscCall(PetscFree(link));
  PetscFunctionReturn(PETSC_SUCCESS);
}
419 
/* Create a communication link for the given SF, data type, data locations and operation.
   Dispatches to the NVSHMEM implementation when it is built in and applicable to this
   root/leaf data, otherwise to the plain MPI implementation. */
PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscFunctionBegin;
  /* In debug mode, error out early on unsupported overlapped communication on the same data */
  PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata));
#if defined(PETSC_HAVE_NVSHMEM)
  {
    PetscBool use_nvshmem;
    PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem));
    if (use_nvshmem) {
      PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
#endif
  PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
  PetscFunctionReturn(PETSC_SUCCESS);
}
437 
438 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
439    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
440 */
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs)
{
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt    *rootoffset, *leafoffset;
  MPI_Aint           disp;
  MPI_Comm           comm          = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit          = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    /* Root side: one request per rank i in [ndrootranks, nrootranks), i.e., skipping the
       first ndrootranks distinguished ranks, which do not use MPI (remote traffic only).
       Roots receive in LEAF2ROOT and send in ROOT2LEAF. */
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; /* byte offset of rank i's chunk within the remote buffer */
          cnt  = rootoffset[i + 1] - rootoffset[i];
          PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
          cnt  = rootoffset[i + 1] - rootoffset[i];
          PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    /* Leaf side, mirroring the root side: leaves send in LEAF2ROOT and receive in ROOT2LEAF */
    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
          cnt  = leafoffset[i + 1] - leafoffset[i];
          PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
          cnt  = leafoffset[i + 1] - leafoffset[i];
          PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  /* Hand back whichever of the four outputs the caller asked for */
  if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(PETSC_SUCCESS);
}
497 
498 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink)
499 {
500   PetscSFLink    link, *p;
501   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
502 
503   PetscFunctionBegin;
504   /* Look for types in cache */
505   for (p = &bas->inuse; (link = *p); p = &link->next) {
506     PetscBool match;
507     PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
508     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
509       switch (cmode) {
510       case PETSC_OWN_POINTER:
511         *p = link->next;
512         break; /* Remove from inuse list */
513       case PETSC_USE_POINTER:
514         break;
515       default:
516         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode");
517       }
518       *mylink = link;
519       PetscFunctionReturn(PETSC_SUCCESS);
520     }
521   }
522   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack");
523 }
524 
525 PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink)
526 {
527   PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
528   PetscSFLink    link = *mylink;
529 
530   PetscFunctionBegin;
531   link->rootdata = NULL;
532   link->leafdata = NULL;
533   link->next     = bas->avail;
534   bas->avail     = link;
535   *mylink        = NULL;
536   PetscFunctionReturn(PETSC_SUCCESS);
537 }
538 
539 /* Error out on unsupported overlapped communications */
/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
{
  PetscSFLink    link, *p;
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscBool      match;

  PetscFunctionBegin;
  /* The check is only performed in debug builds; in optimized builds this is a no-op */
  if (PetscDefined(USE_DEBUG)) {
    /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
       the potential overlapping since this process does not participate in communication. Overlapping is harmless.
    */
    if (rootdata || leafdata) {
      for (p = &bas->inuse; (link = *p); p = &link->next) {
        PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
        PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata);
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
560 
561 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
562 {
563   PetscFunctionBegin;
564   if (n) PetscCall(PetscMemcpy(dst, src, n));
565   PetscFunctionReturn(PETSC_SUCCESS);
566 }
567 
568 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
569 {
570   PetscInt    nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
571   PetscBool   is2Int, is2PetscInt;
572   PetscMPIInt ni, na, nd, combiner;
573 #if defined(PETSC_HAVE_COMPLEX)
574   PetscInt nPetscComplex = 0;
575 #endif
576 
577   PetscFunctionBegin;
578   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
579   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
580   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
581   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
582   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
583   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
584 #if defined(PETSC_HAVE_COMPLEX)
585   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
586 #endif
587   PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
588   PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
  /* TODO: shall we also handle Fortran MPI_2REAL? */
590   PetscCallMPI(MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner));
591   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
592   link->bs        = 1;                                                           /* default */
593 
594   if (is2Int) {
595     PackInit_PairType_int_int_1_1(link);
596     link->bs        = 1;
597     link->unitbytes = 2 * sizeof(int);
598     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
599     link->basicunit = MPI_2INT;
600     link->unit      = MPI_2INT;
601   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
602     PackInit_PairType_PetscInt_PetscInt_1_1(link);
603     link->bs        = 1;
604     link->unitbytes = 2 * sizeof(PetscInt);
605     link->basicunit = MPIU_2INT;
606     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
607     link->unit      = MPIU_2INT;
608   } else if (nPetscReal) {
609     if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link);
610     else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link);
611     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link);
612     else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link);
613     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link);
614     else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link);
615     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link);
616     else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link);
617     link->bs        = nPetscReal;
618     link->unitbytes = nPetscReal * sizeof(PetscReal);
619     link->basicunit = MPIU_REAL;
620     if (link->bs == 1) {
621       link->isbuiltin = PETSC_TRUE;
622       link->unit      = MPIU_REAL;
623     }
624   } else if (nPetscInt) {
625     if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link);
626     else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
627     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link);
628     else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
629     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link);
630     else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
631     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link);
632     else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
633     link->bs        = nPetscInt;
634     link->unitbytes = nPetscInt * sizeof(PetscInt);
635     link->basicunit = MPIU_INT;
636     if (link->bs == 1) {
637       link->isbuiltin = PETSC_TRUE;
638       link->unit      = MPIU_INT;
639     }
640 #if defined(PETSC_USE_64BIT_INDICES)
641   } else if (nInt) {
642     if (nInt == 8) PackInit_IntegerType_int_8_1(link);
643     else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link);
644     else if (nInt == 4) PackInit_IntegerType_int_4_1(link);
645     else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link);
646     else if (nInt == 2) PackInit_IntegerType_int_2_1(link);
647     else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link);
648     else if (nInt == 1) PackInit_IntegerType_int_1_1(link);
649     else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link);
650     link->bs        = nInt;
651     link->unitbytes = nInt * sizeof(int);
652     link->basicunit = MPI_INT;
653     if (link->bs == 1) {
654       link->isbuiltin = PETSC_TRUE;
655       link->unit      = MPI_INT;
656     }
657 #endif
658   } else if (nSignedChar) {
659     if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link);
660     else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
661     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link);
662     else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
663     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link);
664     else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
665     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link);
666     else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
667     link->bs        = nSignedChar;
668     link->unitbytes = nSignedChar * sizeof(SignedChar);
669     link->basicunit = MPI_SIGNED_CHAR;
670     if (link->bs == 1) {
671       link->isbuiltin = PETSC_TRUE;
672       link->unit      = MPI_SIGNED_CHAR;
673     }
674   } else if (nUnsignedChar) {
675     if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link);
676     else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
677     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link);
678     else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
679     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link);
680     else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
681     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link);
682     else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
683     link->bs        = nUnsignedChar;
684     link->unitbytes = nUnsignedChar * sizeof(UnsignedChar);
685     link->basicunit = MPI_UNSIGNED_CHAR;
686     if (link->bs == 1) {
687       link->isbuiltin = PETSC_TRUE;
688       link->unit      = MPI_UNSIGNED_CHAR;
689     }
690 #if defined(PETSC_HAVE_COMPLEX)
691   } else if (nPetscComplex) {
692     if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link);
693     else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
694     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link);
695     else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
696     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link);
697     else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
698     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link);
699     else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
700     link->bs        = nPetscComplex;
701     link->unitbytes = nPetscComplex * sizeof(PetscComplex);
702     link->basicunit = MPIU_COMPLEX;
703     if (link->bs == 1) {
704       link->isbuiltin = PETSC_TRUE;
705       link->unit      = MPIU_COMPLEX;
706     }
707 #endif
708   } else {
709     MPI_Aint lb, nbyte;
710     PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte));
711     PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
712     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
713       if (nbyte == 4) PackInit_DumbType_char_4_1(link);
714       else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link);
715       else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
716       else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link);
717       else if (nbyte == 1) PackInit_DumbType_char_1_1(link);
718       else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link);
719       link->bs        = nbyte;
720       link->unitbytes = nbyte;
721       link->basicunit = MPI_BYTE;
722     } else {
723       nInt = nbyte / sizeof(int);
724       if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link);
725       else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link);
726       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link);
727       else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link);
728       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link);
729       else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link);
730       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link);
731       else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link);
732       link->bs        = nInt;
733       link->unitbytes = nbyte;
734       link->basicunit = MPI_INT;
735     }
736     if (link->isbuiltin) link->unit = unit;
737   }
738 
739   if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit));
740 
741   link->Memcpy = PetscSFLinkMemcpy_Host;
742   PetscFunctionReturn(PETSC_SUCCESS);
743 }
744 
745 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *))
746 {
747   PetscFunctionBegin;
748   *UnpackAndOp = NULL;
749   if (PetscMemTypeHost(mtype)) {
750     if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
751     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
752     else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
753     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
754     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
755     else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
756     else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
757     else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
758     else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
759     else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
760     else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
761     else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
762     else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
763   }
764 #if defined(PETSC_HAVE_DEVICE)
765   else if (PetscMemTypeDevice(mtype) && !atomic) {
766     if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
767     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
768     else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
769     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
770     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
771     else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
772     else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
773     else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
774     else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
775     else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
776     else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
777     else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
778     else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
779   } else if (PetscMemTypeDevice(mtype) && atomic) {
780     if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
781     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
782     else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
783     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
784     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
785     else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
786     else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
787     else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
788     else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
789     else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
790     else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
791     else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
792     else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
793   }
794 #endif
795   PetscFunctionReturn(PETSC_SUCCESS);
796 }
797 
798 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *))
799 {
800   PetscFunctionBegin;
801   *ScatterAndOp = NULL;
802   if (PetscMemTypeHost(mtype)) {
803     if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
804     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
805     else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
806     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
807     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
808     else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
809     else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
810     else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
811     else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
812     else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
813     else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
814     else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
815     else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
816   }
817 #if defined(PETSC_HAVE_DEVICE)
818   else if (PetscMemTypeDevice(mtype) && !atomic) {
819     if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
820     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
821     else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
822     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
823     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
824     else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
825     else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
826     else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
827     else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
828     else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
829     else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
830     else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
831     else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
832   } else if (PetscMemTypeDevice(mtype) && atomic) {
833     if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
834     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
835     else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
836     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
837     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
838     else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
839     else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
840     else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
841     else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
842     else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
843     else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
844     else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
845     else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
846   }
847 #endif
848   PetscFunctionReturn(PETSC_SUCCESS);
849 }
850 
851 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *))
852 {
853   PetscFunctionBegin;
854   *FetchAndOp = NULL;
855   PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
856   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
857 #if defined(PETSC_HAVE_DEVICE)
858   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
859   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
860 #endif
861   PetscFunctionReturn(PETSC_SUCCESS);
862 }
863 
864 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *))
865 {
866   PetscFunctionBegin;
867   *FetchAndOpLocal = NULL;
868   PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
869   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
870 #if defined(PETSC_HAVE_DEVICE)
871   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
872   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
873 #endif
874   PetscFunctionReturn(PETSC_SUCCESS);
875 }
876 
877 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
878 {
879   PetscLogDouble flops;
880   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
881 
882   PetscFunctionBegin;
883   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
884     flops = bas->rootbuflen[scope] * link->bs;               /* # of roots in buffer x # of scalars in unit */
885 #if defined(PETSC_HAVE_DEVICE)
886     if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops));
887     else
888 #endif
889       PetscCall(PetscLogFlops(flops));
890   }
891   PetscFunctionReturn(PETSC_SUCCESS);
892 }
893 
894 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
895 {
896   PetscLogDouble flops;
897 
898   PetscFunctionBegin;
899   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
900     flops = sf->leafbuflen[scope] * link->bs;                /* # of roots in buffer x # of scalars in unit */
901 #if defined(PETSC_HAVE_DEVICE)
902     if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops));
903     else
904 #endif
905       PetscCall(PetscLogFlops(flops));
906   }
907   PetscFunctionReturn(PETSC_SUCCESS);
908 }
909 
910 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
911   Input Parameters:
912   +sf      - The StarForest
913   .link    - The link
914   .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
916   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
917   .buf     - A contiguous buffer to unpack from
918   -op      - Operation after unpack
919 
920   Output Parameters:
921   .data    - The data to unpack to
922 */
static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscInt i;
    if (indices) { /* sparse destination: reduce one unit at a time into data[indices[i]] */
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      /* NOTE(review): this path calls MPI_Reduce_local while the contiguous path below uses the MPIU_ wrapper;
         presumably equivalent for a count of 1 — confirm whether MPIU_Reduce_local is intended here too */
      for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op));
    } else { /* contiguous destination: reduce all <count> units starting at data[start] */
      PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
#else
  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
#endif
}
943 
944 static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op)
945 {
946   PetscFunctionBegin;
947 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
948   {
949     PetscInt i, disp;
950     if (!srcIdx) {
951       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op));
952     } else {
953       for (i = 0; i < count; i++) {
954         disp = dstIdx ? dstIdx[i] : dstStart + i;
955         PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op));
956       }
957     }
958   }
959   PetscFunctionReturn(PETSC_SUCCESS);
960 #else
961   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
962 #endif
963 }
964 
965 /*=============================================================================
966               Pack/Unpack/Fetch/Scatter routines
967  ============================================================================*/
968 
969 /* Pack rootdata to rootbuf
970   Input Parameters:
971   + sf       - The SF this packing works on.
972   . link     - It gives the memtype of the roots and also provides root buffer.
973   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
974   - rootdata - Where to read the roots.
975 
976   Notes:
977   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
978   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
979  */
980 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
981 {
982   const PetscInt *rootindices = NULL;
983   PetscInt        count, start;
984   PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
985   PetscMemType   rootmtype                                                                                        = link->rootmtype;
986   PetscSFPackOpt opt                                                                                              = NULL;
987 
988   PetscFunctionBegin;
989   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
990   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
991     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
992     PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack));
993     PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
994   }
995   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
996   PetscFunctionReturn(PETSC_SUCCESS);
997 }
998 
999 /* Pack leafdata to leafbuf */
1000 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
1001 {
1002   const PetscInt *leafindices = NULL;
1003   PetscInt        count, start;
1004   PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1005   PetscMemType   leafmtype                                                                                        = link->leafmtype;
1006   PetscSFPackOpt opt                                                                                              = NULL;
1007 
1008   PetscFunctionBegin;
1009   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
1010   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1011     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
1012     PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack));
1013     PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
1014   }
1015   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
1016   PetscFunctionReturn(PETSC_SUCCESS);
1017 }
1018 
/* Pack rootdata to rootbuf, which are in the same memory space */
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  PetscFunctionBegin;
  if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
    if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
    if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
  }
  /* NOTE(review): PETSCSF_Pack is begun here and again inside PetscSFLinkPackRootData_Private,
     nesting the same event — presumably benign for logging; confirm this is intentional */
  PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
  if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata));
  PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Pack leafdata to leafbuf, which are in the same memory space */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
{
  PetscFunctionBegin;
  if (scope == PETSCSF_REMOTE) { /* Sync the device if leafdata is not on petsc default stream */
    if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
    if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
  }
  /* NOTE(review): PETSCSF_Pack is begun here and again inside PetscSFLinkPackLeafData_Private,
     nesting the same event — presumably benign for logging; confirm this is intentional */
  PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
  if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata));
  PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1047 
/* Unpack the link's root buffer into rootdata with op, using a native kernel when available and
   falling back to MPI_Reduce_local otherwise. A no-op when rootdata serves directly as rootbuf. */
PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
{
  const PetscInt *rootindices = NULL;
  PetscInt        count, start;
  PetscSF_Basic  *bas                                                                                                    = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
  PetscMemType   rootmtype                                                                                               = link->rootmtype;
  PetscSFPackOpt opt                                                                                                     = NULL;

  PetscFunctionBegin;
  if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    /* rootdups[scope] requests atomic kernels when root indices may repeat */
    PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp));
    if (UnpackAndOp) {
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
      PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
    } else {
      /* No native kernel for this op: fall back to MPI_Reduce_local.
         NOTE(review): this path fetches HOST indices yet reads rootbuf[scope][rootmtype];
         presumably the buffer/data are host-accessible whenever no kernel exists — confirm */
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices));
      PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op));
    }
  }
  PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1071 
/* Unpack the link's leaf buffer into leafdata with op, using a native kernel when available and
   falling back to MPI_Reduce_local otherwise. A no-op when leafdata serves directly as leafbuf. */
PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
{
  const PetscInt *leafindices = NULL;
  PetscInt        count, start;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
  PetscMemType   leafmtype                                                                                               = link->leafmtype;
  PetscSFPackOpt opt                                                                                                     = NULL;

  PetscFunctionBegin;
  if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    /* leafdups[scope] requests atomic kernels when leaf indices may repeat */
    PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp));
    if (UnpackAndOp) {
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
      PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
    } else {
      /* No native kernel for this op: fall back to MPI_Reduce_local.
         NOTE(review): this path fetches HOST indices yet reads leafbuf[scope][leafmtype];
         presumably the buffer/data are host-accessible whenever no kernel exists — confirm */
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices));
      PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op));
    }
  }
  PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1094 /* Unpack rootbuf to rootdata, which are in the same memory space */
1095 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
1096 {
1097   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1098 
1099   PetscFunctionBegin;
1100   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
1101   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op));
1102   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
1103   if (scope == PETSCSF_REMOTE) {
1104     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
1105     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1106   }
1107   PetscFunctionReturn(PETSC_SUCCESS);
1108 }
1109 
1110 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1111 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1112 {
1113   PetscFunctionBegin;
1114   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
1115   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op));
1116   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
1117   if (scope == PETSCSF_REMOTE) {
1118     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
1119     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1120   }
1121   PetscFunctionReturn(PETSC_SUCCESS);
1122 }
1123 
/* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op)
{
  const PetscInt *rootindices = NULL;
  PetscInt        count, start;
  PetscSF_Basic  *bas                                                                                             = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL;
  PetscMemType   rootmtype                                                                                        = link->rootmtype;
  PetscSFPackOpt opt                                                                                              = NULL;

  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
  if (bas->rootbuflen[PETSCSF_REMOTE]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    /* rootdups[] requests atomic kernels when root indices may repeat; PetscSFLinkGetFetchAndOp errors unless op is a sum */
    PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp));
    PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices));
    PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype]));
  }
  PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op));
  PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1146 
/* Scatter the local (self-communication) part of the graph between rootdata and leafdata,
   combining values with op. Direction selects root->leaf or leaf->root. When root and leaf
   data live in different memory spaces (host vs device), the scatter is done as
   pack -> memcpy across spaces -> unpack; otherwise a direct scatter kernel is used when
   available, falling back to MPI_Reduce_local on host indices. */
PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op)
{
  const PetscInt *rootindices = NULL, *leafindices = NULL;
  PetscInt        count, rootstart, leafstart;
  PetscSF_Basic  *bas                                                                                                                                                 = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL;
  PetscMemType   rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype;
  PetscSFPackOpt leafopt = NULL, rootopt = NULL;
  PetscInt       buflen = sf->leafbuflen[PETSCSF_LOCAL];
  char          *srcbuf = NULL, *dstbuf = NULL;
  PetscBool      dstdups;

  PetscFunctionBegin;
  if (!buflen) PetscFunctionReturn(PETSC_SUCCESS); /* Nothing to scatter locally */
  if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata));
      srcmtype = rootmtype;
      srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
      dstmtype = leafmtype;
      dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
    } else {
      PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata));
      srcmtype = leafmtype;
      srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
      dstmtype = rootmtype;
      dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
    }
    PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes));
    /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
    if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link));
    /* Unpack on the destination side combines the copied buffer into the destination data with op */
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op));
    } else {
      PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op));
    }
  } else {
    /* Same memory space: try a fused scatter kernel; dstdups tells whether the destination
       indices contain duplicates (kernels may then need atomics to avoid data races) */
    dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
    dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
    PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp));
    if (ScatterAndOp) {
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata));
      } else {
        PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata));
      }
    } else {
      /* No fused kernel for this (mtype, op, dups) combination: fall back to MPI_Reduce_local
         with host-side indices */
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op));
      } else {
        PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1207 
1208 /* Fetch rootdata to leafdata and leafupdate locally */
1209 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1210 {
1211   const PetscInt *rootindices = NULL, *leafindices = NULL;
1212   PetscInt        count, rootstart, leafstart;
1213   PetscSF_Basic  *bas                                                                                                                                                            = (PetscSF_Basic *)sf->data;
1214   PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1215   const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype;
1216   PetscSFPackOpt     leafopt = NULL, rootopt = NULL;
1217 
1218   PetscFunctionBegin;
1219   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(PETSC_SUCCESS);
1220   if (rootmtype != leafmtype) {
1221     /* The local communication has to go through pack and unpack */
1222     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1223   } else {
1224     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
1225     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
1226     PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal));
1227     PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate));
1228   }
1229   PetscFunctionReturn(PETSC_SUCCESS);
1230 }
1231 
1232 /*
1233   Create per-rank pack/unpack optimizations based on indice patterns
1234 
1235    Input Parameters:
1236   +  n       - Number of destination ranks
1237   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1238   -  idx     - [*]   Array storing indices
1239 
1240    Output Parameters:
1241   +  opt     - Pack optimizations. NULL if no optimizations.
1242 */
/* For each rank, try to recognize its index list as a 3D block pattern: m = dx*dy*dz indices
   that form a dx x dy x dz sub-block of a larger X x Y x (unbounded) array starting at 'start',
   i.e., idx covers start + X*Y*k + X*j + i for 0<=i<dx, 0<=j<dy, 0<=k<dz. If every rank matches,
   a plan is returned in *out; otherwise *out is NULL. */
PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out)
{
  PetscInt       r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  PetscCall(PetscMalloc1(1, &opt));
  /* One backing array of 7n+2 entries, segmented as: n, offset[n+1], start[n], dx[n], dy[n], dz[n], X[n], Y[n] */
  PetscCall(PetscMalloc1(7 * n + 2, &opt->array));
  opt->n = opt->array[0] = n;
  opt->offset            = opt->array + 1;
  opt->start             = opt->array + n + 2;
  opt->dx                = opt->array + 2 * n + 2;
  opt->dy                = opt->array + 3 * n + 2;
  opt->dz                = opt->array + 4 * n + 2;
  opt->X                 = opt->array + 5 * n + 2;
  opt->Y                 = opt->array + 6 * n + 2;

  for (r = 0; r < n; r++) {            /* For each destination rank */
    m     = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension: dx = length of the initial run of consecutive indices */
    for (dx = 1; dx < m; dx++, p++) {
      if (start + dx != idx[p]) break;
    }

    dydz = m / dx;
    /* If there are more rows, idx[p] is the first index of the second row, so the row stride X = idx[p] - start */
    X    = dydz > 1 ? (idx[p] - start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m % dx || X <= 0) {
      optimizable = PETSC_FALSE;
      goto finish;
    }
    for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension: count rows of dx consecutive indices with stride X */
      for (i = 0; i < dx; i++, p++) {
        if (start + X * dy + i != idx[p]) {
          if (i) {
            optimizable = PETSC_FALSE;
            goto finish;
          } /* The pattern is violated in the middle of an x-walk */
          else
            goto Z_dimension; /* Row pattern broke at a row boundary: the break may start a new z-layer */
        }
      }
    }

  Z_dimension:
    /* Note dy >= 1 here even when the Y loop did not run (dydz <= 1 leaves dy at its initial value 1) */
    dz = m / (dx * dy);
    /* If there are more layers, idx[p] is the first index of the second layer, so the layer stride is X*Y */
    Y  = dz > 1 ? (idx[p] - start) / X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m % (dx * dy) || Y <= 0) {
      optimizable = PETSC_FALSE;
      goto finish;
    }
    for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j = 0; j < dy; j++) {
        for (i = 0; i < dx; i++, p++) {
          if (start + X * Y * k + X * j + i != idx[p]) {
            optimizable = PETSC_FALSE;
            goto finish;
          }
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    PetscCall(PetscFree(opt->array));
    PetscCall(PetscFree(opt));
    *out = NULL;
  } else {
    /* Rebase offsets to 0 (input offset[0] need not be 0) */
    opt->offset[0] = 0;
    for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1331 
1332 static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out)
1333 {
1334   PetscSFPackOpt opt = *out;
1335 
1336   PetscFunctionBegin;
1337   if (opt) {
1338     PetscCall(PetscSFFree(sf, mtype, opt->array));
1339     PetscCall(PetscFree(opt));
1340     *out = NULL;
1341   }
1342   PetscFunctionReturn(PETSC_SUCCESS);
1343 }
1344 
/* Precompute fields used by packing: leaf/root buffer lengths, contiguity of leaf/root indices,
   per-rank pack-optimization plans for non-contiguous indices, and (when built with device
   support) whether the indices contain duplicates, for both the local ([0], distinguished/self
   ranks) and remote ([1]) scopes. */
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscInt       i, j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i = 0; i < 2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  /* roffset[0..ndranks] covers distinguished (self) ranks; the rest are remote */
  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  /* leafstart is the first leaf index of each scope (used when indices are contiguous) */
  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0] + i) {
      sf->leafcontig[0] = PETSC_FALSE;
      break;
    }
  }
  for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1] + j) {
      sf->leafcontig[1] = PETSC_FALSE;
      break;
    }
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]));
  if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1]));

  /* Are root indices for self and remote contiguous? */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  /* Same contiguity analysis as above, on the root side (irootloc/ioffset) */
  for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) {
    if (bas->irootloc[i] != bas->rootstart[0] + i) {
      bas->rootcontig[0] = PETSC_FALSE;
      break;
    }
  }
  for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) {
    if (bas->irootloc[i] != bas->rootstart[1] + j) {
      bas->rootcontig[1] = PETSC_FALSE;
      break;
    }
  }

  if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]));
  if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]));

  /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  if (PetscDefined(HAVE_DEVICE)) {
    /* The dup analysis is skipped for a multi-SF (sf->multi == sf); contiguous indices trivially have no dups */
    PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
    if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]));
    if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1]));
    if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]));
    if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1]));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1418 
1419 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1420 {
1421   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1422   PetscInt       i;
1423 
1424   PetscFunctionBegin;
1425   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
1426     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i]));
1427     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i]));
1428 #if defined(PETSC_HAVE_DEVICE)
1429     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i]));
1430     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i]));
1431 #endif
1432   }
1433   PetscFunctionReturn(PETSC_SUCCESS);
1434 }
1435