xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision a69119a591a03a9d906b29c0a4e9802e4d7c9795)
1 #include "petsc/private/sfimpl.h"
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
6 
7 /*
8  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
9  * therefore we pack data types manually. This file defines packing routines for the standard data types.
10  */
11 
/* Paste four tokens with underscores, e.g., CPPJoin4(Pack, int, 4, 1) -> Pack_int_4_1; used to mint routine names below */
#define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d
13 
14 /* Operations working like s += t */
#define OP_BINARY(op, s, t) \
  do { (s) = (s)op(t); } while (0) /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op, s, t) \
  do { (s) = op((s), (t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op, s, t) \
  do { (s) = (!(s)) != (!(t)); } while (0) /* logical exclusive OR; the op argument is unused */
#define OP_ASSIGN(op, s, t) \
  do { (s) = (t); } while (0) /* plain assignment (Insert); the op argument is unused */
/* Ref MPI MAXLOC */
#define OP_XLOC(op, s, t) \
  do { \
    if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); /* on equal values keep the smaller index, per MPI_MAXLOC/MINLOC */ \
    else if (!((s).u op(t).u)) s = t;                   /* op is > for MAXLOC, < for MINLOC */ \
  } while (0)
29 
30 /* DEF_PackFunc - macro defining a Pack routine
31 
32    Arguments of the macro:
33    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
34    .BS        Block size for vectorization. It is a factor of bsz.
35    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
36 
37    Arguments of the Pack routine:
38    +count     Number of indices in idx[].
39    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
40    .opt       Per-pack optimization plan. NULL means no such plan.
41    .idx       Indices of entries to packed.
42    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
43    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
44    -packed    Address of the packed data.
45  */
#define DEF_PackFunc(Type, BS, EQ) \
  static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) { \
    const Type    *u = (const Type *)unpacked, *u2; \
    Type          *p = (Type *)packed, *p2; \
    PetscInt       i, j, k, X, Y, r, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \
    else if (opt) { /* has optimizations available */ p2 = p; \
      for (r = 0; r < opt->n; r++) { /* loop over the sub-blocks of the (dx,dy,dz) pattern */ \
        u2 = u + opt->start[r] * MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k = 0; k < opt->dz[r]; k++) \
          for (j = 0; j < opt->dy[r]; j++) { /* copy one x-line (dx entries) at a time */ \
            PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); \
            p2 += opt->dx[r] * MBS; \
          } \
      } \
    } else { /* general indexed case */ \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
          for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
            p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \
    } \
    PetscFunctionReturn(0); \
  }
74 
/* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
                and inserts into a sparse array.

   Arguments:
  +Type       Type of the data
  .BS         Block size for vectorization
  -EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.

  Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
 */
#define DEF_UnpackFunc(Type, BS, EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) { \
    Type          *u = (Type *)unpacked, *u2; \
    const Type    *p = (const Type *)packed; \
    PetscInt       i, j, k, X, Y, r, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { /* destination entries are contiguous, starting at 'start' */ \
      u += start * MBS; \
      if (u != p) PetscCall(PetscArraycpy(u, p, count *MBS)); /* skip the copy when src and dst alias exactly */ \
    } else if (opt) { /* has optimizations available */ \
      for (r = 0; r < opt->n; r++) { /* loop over the sub-blocks of the (dx,dy,dz) pattern */ \
        u2 = u + opt->start[r] * MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k = 0; k < opt->dz[r]; k++) \
          for (j = 0; j < opt->dy[r]; j++) { /* copy one x-line (dx entries) at a time */ \
            PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \
            p += opt->dx[r] * MBS; \
          } \
      } \
    } else { /* general indexed case */ \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \
    } \
    PetscFunctionReturn(0); \
  }
115 
/* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert

   Arguments:
  +Type       Type of the data
  .BS         Block size for vectorization
  .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
  .Opname     Name of the Op, such as Add, Mult, LAND, etc.
  .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
  -OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
 */
#define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) { \
    Type          *u = (Type *)unpacked, *u2; \
    const Type    *p = (const Type *)packed; \
    PetscInt       i, j, k, X, Y, r, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { /* destination entries are contiguous, starting at 'start' */ \
      u += start * MBS; \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
    } else if (opt) { /* idx[] has patterns */ \
      for (r = 0; r < opt->n; r++) { \
        u2 = u + opt->start[r] * MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k = 0; k < opt->dz[r]; k++) \
          for (j = 0; j < opt->dy[r]; j++) { \
            for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \
            p += opt->dx[r] * MBS; /* advance past the consumed x-line */ \
          } \
      } \
    } else { /* general indexed case */ \
      for (i = 0; i < count; i++) \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
    } \
    PetscFunctionReturn(0); \
  }
157 
/* DEF_FetchAndOp - macro defining a FetchAndOp routine: for each selected entry, the old value of the
   unpacked data is returned in packed[], and the incoming packed value is combined into the unpacked
   data with Op (fetch-then-op). The opt argument is accepted for signature uniformity but unused. */
#define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) { \
    Type          *u = (Type *)unpacked, *p = (Type *)packed, tmp; \
    PetscInt       i, j, k, r, l, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; \
    const PetscInt MBS = M * BS; \
    PetscFunctionBegin; \
    for (i = 0; i < count; i++) { \
      r = (!idx ? start + i : idx[i]) * MBS; /* entry offset in the sparse u[] */ \
      l = i * MBS;                           /* entry offset in the contiguous p[] */ \
      for (j = 0; j < M; j++) \
        for (k = 0; k < BS; k++) { \
          tmp = u[r + j * BS + k]; \
          OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \
          p[l + j * BS + k] = tmp; /* hand the pre-op value back to the caller */ \
        } \
    } \
    PetscFunctionReturn(0); \
  }
177 
/* DEF_ScatterAndOp - macro defining a ScatterAndOp routine that combines data from a src array directly
   into a dst array with Op, without an intermediate pack/unpack buffer. Either side may be contiguous
   (NULL idx), follow a 3D pattern (opt), or be arbitrarily indexed. */
#define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) { \
    const Type    *u = (const Type *)src; \
    Type          *v = (Type *)dst; \
    PetscInt       i, j, k, s, t, X, Y, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; \
    const PetscInt MBS = M * BS; \
    PetscFunctionBegin; \
    if (!srcIdx) { /* src is contiguous */ \
      u += srcStart * MBS; \
      PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); /* reuse unpack: contiguous src looks like a packed buffer */ \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
      u += srcOpt->start[0] * MBS; \
      v += dstStart * MBS; \
      X = srcOpt->X[0]; \
      Y = srcOpt->Y[0]; \
      for (k = 0; k < srcOpt->dz[0]; k++) \
        for (j = 0; j < srcOpt->dy[0]; j++) { \
          for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \
          v += srcOpt->dx[0] * MBS; \
        } \
    } else { /* all other cases */ \
      for (i = 0; i < count; i++) { \
        s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \
        t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \
        for (j = 0; j < M; j++) \
          for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \
      } \
    } \
    PetscFunctionReturn(0); \
  }
209 
/* DEF_FetchAndOpLocal - macro defining a routine for local (on-process) root/leaf pairs: the current
   rootdata value is first saved into leafupdate, then leafdata is combined into rootdata with Op.
   rootopt/leafopt are accepted for signature uniformity but unused. */
#define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) { \
    Type          *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \
    const Type    *ldata = (const Type *)leafdata; \
    PetscInt       i, j, k, r, l, bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs / BS; \
    const PetscInt MBS = M * BS; \
    PetscFunctionBegin; \
    for (i = 0; i < count; i++) { \
      r = (rootidx ? rootidx[i] : rootstart + i) * MBS; /* root entry, indexed or contiguous */ \
      l = (leafidx ? leafidx[i] : leafstart + i) * MBS; /* leaf entry, indexed or contiguous */ \
      for (j = 0; j < M; j++) \
        for (k = 0; k < BS; k++) { \
          lupdate[l + j * BS + k] = rdata[r + j * BS + k]; /* fetch old root value first */ \
          OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \
        } \
    } \
    PetscFunctionReturn(0); \
  }
229 
230 /* Pack, Unpack/Fetch ops */
/* Instantiate Pack, UnpackAndInsert and ScatterAndInsert for (Type,BS,EQ), plus the routine that
   installs them as the link's host (h_) function pointers. */
#define DEF_Pack(Type, BS, EQ) \
  DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) { \
    link->h_Pack             = CPPJoin4(Pack, Type, BS, EQ); \
    link->h_UnpackAndInsert  = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \
    link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \
  }
237 
238 /* Add, Mult ops */
/* Instantiate arithmetic routines (Add, Mult, plus FetchAndAdd variants) and their installer. */
#define DEF_Add(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult, Type, BS, EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd, Type, BS, EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult, Type, BS, EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \
  }
248 
249 /* Max, Min ops */
/* Instantiate comparison routines (Max, Min) and their installer. */
#define DEF_Cmp(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) { \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
  }
257 
258 /* Logical ops.
259   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
260   the compilation warning "empty macro arguments are undefined in ISO C90"
261  */
/* Instantiate logical routines (LAND, LOR, LXOR) and their installer. */
#define DEF_Log(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \
  }
271 
272 /* Bitwise ops */
/* Instantiate bitwise routines (BAND, BOR, BXOR) and their installer. */
#define DEF_Bit(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \
  }
282 
283 /* Maxloc, Minloc ops */
/* Instantiate MAXLOC/MINLOC routines via OP_XLOC on a pair type. The generated function names are
   UnpackAndMax/Min etc. (no clash with DEF_Cmp, since Type differs), but they are installed into the
   *loc slots of the link. */
#define DEF_Xloc(Type, BS, EQ) \
  DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
  }
291 
/* Integer types support all op families: pack/insert, arithmetic, comparison, logical and bitwise. */
#define DEF_IntegerType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \
  }
300 
/* Real types support pack/insert, arithmetic and comparison (no logical/bitwise ops). */
#define DEF_RealType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
  }
307 
#if defined(PETSC_HAVE_COMPLEX)
/* Complex types support pack/insert and arithmetic only (no ordering, so no Max/Min). */
#define DEF_ComplexType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
  }
#endif
315 
/* "Dumb" types support only pack/insert: raw byte/int streams with no reduction semantics. */
#define DEF_DumbType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
  }
320 
321 /* Maxloc, Minloc */
/* Maxloc, Minloc: pair types support pack/insert plus the *loc reductions. */
#define DEF_PairType(Type, BS, EQ) \
  DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
    CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \
  }
327 
DEF_IntegerType(PetscInt, 1, 1)   /* unit = 1 MPIU_INT  */
  DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */
  DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */
  DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */
  DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */
  DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */
  DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */
  DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
  DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0)
#endif

  /* The typedefs are used to get a typename without space that CPPJoin can handle */
  typedef signed char SignedChar;
DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0)

  typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0)

  DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0)
#if defined(PETSC_HAVE_COMPLEX)
    DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0)
#endif

/* Pair types for MAXLOC/MINLOC reductions; field u holds the value and i the location index (see OP_XLOC) */
#define PairType(Type1, Type2) Type1##_##Type2
      typedef struct {
  int u; /* value */
  int i; /* location */
} PairType(int, int);
typedef struct {
  PetscInt u; /* value */
  PetscInt i; /* location */
} PairType(PetscInt, PetscInt);
DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1)

  /* If we don't know the basic type, we treat it as a stream of chars or ints */
  DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0)

    typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0)
369 
370   PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link) {
371   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
372   PetscInt       i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8;
373 
374   PetscFunctionBegin;
375   /* Destroy device-specific fields */
376   if (link->deviceinited) PetscCall((*link->Destroy)(sf, link));
377 
378   /* Destroy host related fields */
379   if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit));
380   if (!link->use_nvshmem) {
381     for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */
382       if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i]));
383     }
384     PetscCall(PetscFree(link->reqs));
385     for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
386       PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]));
387       PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]));
388     }
389   }
390   PetscCall(PetscFree(link));
391   PetscFunctionReturn(0);
392 }
393 
394 PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink) {
395   PetscFunctionBegin;
396   PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata));
397 #if defined(PETSC_HAVE_NVSHMEM)
398   {
399     PetscBool use_nvshmem;
400     PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem));
401     if (use_nvshmem) {
402       PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
403       PetscFunctionReturn(0);
404     }
405   }
406 #endif
407   PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
408   PetscFunctionReturn(0);
409 }
410 
411 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
412    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
413 */
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs) {
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt    *rootoffset, *leafoffset;
  MPI_Aint           disp;
  MPI_Comm           comm          = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit          = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
      if (direction == PETSCSF_LEAF2ROOT) { /* roots receive when data flows leaf-to-root */
        /* loops start at ndrootranks: presumably the first ndrootranks ranks are served locally without MPI — see PetscSFGetRootInfo_Basic */
        for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; /* byte offset of rank i's section within the remote buffer */
          cnt  = rootoffset[i + 1] - rootoffset[i];                          /* number of units for rank i */
          PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
        }
      } else { /* PETSCSF_ROOT2LEAF: roots send */
        for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
          cnt  = rootoffset[i + 1] - rootoffset[i];
          PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; /* init once per (direction, memtype, direct) combination */
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
      if (direction == PETSCSF_LEAF2ROOT) { /* leaves send when data flows leaf-to-root */
        for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
          cnt  = leafoffset[i + 1] - leafoffset[i];
          PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
        }
      } else { /* PETSCSF_ROOT2LEAF: leaves receive */
        for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
          cnt  = leafoffset[i + 1] - leafoffset[i];
          PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  /* Hand back only what the caller asked for (any output pointer may be NULL) */
  if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}
469 
470 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink) {
471   PetscSFLink    link, *p;
472   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
473 
474   PetscFunctionBegin;
475   /* Look for types in cache */
476   for (p = &bas->inuse; (link = *p); p = &link->next) {
477     PetscBool match;
478     PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
479     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
480       switch (cmode) {
481       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
482       case PETSC_USE_POINTER: break;
483       default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode");
484       }
485       *mylink = link;
486       PetscFunctionReturn(0);
487     }
488   }
489   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack");
490 }
491 
492 PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink) {
493   PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
494   PetscSFLink    link = *mylink;
495 
496   PetscFunctionBegin;
497   link->rootdata = NULL;
498   link->leafdata = NULL;
499   link->next     = bas->avail;
500   bas->avail     = link;
501   *mylink        = NULL;
502   PetscFunctionReturn(0);
503 }
504 
505 /* Error out on unsupported overlapped communications */
506 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata) {
507   PetscSFLink    link, *p;
508   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
509   PetscBool      match;
510 
511   PetscFunctionBegin;
512   if (PetscDefined(USE_DEBUG)) {
513     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
514        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
515     */
516     if (rootdata || leafdata) {
517       for (p = &bas->inuse; (link = *p); p = &link->next) {
518         PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
519         PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata);
520       }
521     }
522   }
523   PetscFunctionReturn(0);
524 }
525 
526 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n) {
527   PetscFunctionBegin;
528   if (n) PetscCall(PetscMemcpy(dst, src, n));
529   PetscFunctionReturn(0);
530 }
531 
532 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit) {
533   PetscInt    nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
534   PetscBool   is2Int, is2PetscInt;
535   PetscMPIInt ni, na, nd, combiner;
536 #if defined(PETSC_HAVE_COMPLEX)
537   PetscInt nPetscComplex = 0;
538 #endif
539 
540   PetscFunctionBegin;
541   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
542   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
543   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
544   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
545   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
546   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
547 #if defined(PETSC_HAVE_COMPLEX)
548   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
549 #endif
550   PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
551   PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
  /* TODO: shall we also handle Fortran MPI_2REAL? */
553   PetscCallMPI(MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner));
554   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
555   link->bs        = 1;                                                           /* default */
556 
557   if (is2Int) {
558     PackInit_PairType_int_int_1_1(link);
559     link->bs        = 1;
560     link->unitbytes = 2 * sizeof(int);
561     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
562     link->basicunit = MPI_2INT;
563     link->unit      = MPI_2INT;
564   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
565     PackInit_PairType_PetscInt_PetscInt_1_1(link);
566     link->bs        = 1;
567     link->unitbytes = 2 * sizeof(PetscInt);
568     link->basicunit = MPIU_2INT;
569     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
570     link->unit      = MPIU_2INT;
571   } else if (nPetscReal) {
572     if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link);
573     else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link);
574     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link);
575     else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link);
576     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link);
577     else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link);
578     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link);
579     else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link);
580     link->bs        = nPetscReal;
581     link->unitbytes = nPetscReal * sizeof(PetscReal);
582     link->basicunit = MPIU_REAL;
583     if (link->bs == 1) {
584       link->isbuiltin = PETSC_TRUE;
585       link->unit      = MPIU_REAL;
586     }
587   } else if (nPetscInt) {
588     if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link);
589     else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
590     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link);
591     else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
592     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link);
593     else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
594     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link);
595     else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
596     link->bs        = nPetscInt;
597     link->unitbytes = nPetscInt * sizeof(PetscInt);
598     link->basicunit = MPIU_INT;
599     if (link->bs == 1) {
600       link->isbuiltin = PETSC_TRUE;
601       link->unit      = MPIU_INT;
602     }
603 #if defined(PETSC_USE_64BIT_INDICES)
604   } else if (nInt) {
605     if (nInt == 8) PackInit_IntegerType_int_8_1(link);
606     else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link);
607     else if (nInt == 4) PackInit_IntegerType_int_4_1(link);
608     else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link);
609     else if (nInt == 2) PackInit_IntegerType_int_2_1(link);
610     else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link);
611     else if (nInt == 1) PackInit_IntegerType_int_1_1(link);
612     else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link);
613     link->bs        = nInt;
614     link->unitbytes = nInt * sizeof(int);
615     link->basicunit = MPI_INT;
616     if (link->bs == 1) {
617       link->isbuiltin = PETSC_TRUE;
618       link->unit      = MPI_INT;
619     }
620 #endif
621   } else if (nSignedChar) {
622     if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link);
623     else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
624     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link);
625     else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
626     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link);
627     else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
628     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link);
629     else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
630     link->bs        = nSignedChar;
631     link->unitbytes = nSignedChar * sizeof(SignedChar);
632     link->basicunit = MPI_SIGNED_CHAR;
633     if (link->bs == 1) {
634       link->isbuiltin = PETSC_TRUE;
635       link->unit      = MPI_SIGNED_CHAR;
636     }
637   } else if (nUnsignedChar) {
638     if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link);
639     else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
640     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link);
641     else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
642     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link);
643     else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
644     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link);
645     else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
646     link->bs        = nUnsignedChar;
647     link->unitbytes = nUnsignedChar * sizeof(UnsignedChar);
648     link->basicunit = MPI_UNSIGNED_CHAR;
649     if (link->bs == 1) {
650       link->isbuiltin = PETSC_TRUE;
651       link->unit      = MPI_UNSIGNED_CHAR;
652     }
653 #if defined(PETSC_HAVE_COMPLEX)
654   } else if (nPetscComplex) {
655     if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link);
656     else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
657     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link);
658     else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
659     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link);
660     else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
661     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link);
662     else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
663     link->bs        = nPetscComplex;
664     link->unitbytes = nPetscComplex * sizeof(PetscComplex);
665     link->basicunit = MPIU_COMPLEX;
666     if (link->bs == 1) {
667       link->isbuiltin = PETSC_TRUE;
668       link->unit      = MPIU_COMPLEX;
669     }
670 #endif
671   } else {
672     MPI_Aint lb, nbyte;
673     PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte));
674     PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
675     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
676       if (nbyte == 4) PackInit_DumbType_char_4_1(link);
677       else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link);
678       else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
679       else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link);
680       else if (nbyte == 1) PackInit_DumbType_char_1_1(link);
681       else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link);
682       link->bs        = nbyte;
683       link->unitbytes = nbyte;
684       link->basicunit = MPI_BYTE;
685     } else {
686       nInt = nbyte / sizeof(int);
687       if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link);
688       else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link);
689       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link);
690       else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link);
691       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link);
692       else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link);
693       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link);
694       else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link);
695       link->bs        = nInt;
696       link->unitbytes = nbyte;
697       link->basicunit = MPI_INT;
698     }
699     if (link->isbuiltin) link->unit = unit;
700   }
701 
702   if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit));
703 
704   link->Memcpy = PetscSFLinkMemcpy_Host;
705   PetscFunctionReturn(0);
706 }
707 
/* Look up the UnpackAndOp kernel of a link that matches the given memory type and MPI_Op.

   The atomic flag selects the da_ (device-atomic) kernel family; call sites pass the
   root/leaf "dups" flag, so atomics are requested when a destination entry may receive
   multiple contributions. *UnpackAndOp is set to NULL when no kernel matches (e.g. a
   user-defined MPI_Op), in which case callers fall back to MPI_Reduce_local. */
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *)) {
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (PetscMemTypeHost(mtype)) { /* host kernels: atomic is irrelevant on host */
    if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) { /* plain device kernels */
    if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (PetscMemTypeDevice(mtype) && atomic) { /* device kernels using atomics */
    if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}
759 
/* Look up the ScatterAndOp kernel of a link that matches the given memory type and MPI_Op.

   Scatter kernels move data directly from one indexed location to another (no intermediate
   buffer). Selection mirrors PetscSFLinkGetUnpackAndOp: host kernels ignore atomic; on device
   the atomic flag selects the da_ family. *ScatterAndOp is NULL when no kernel matches. */
PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *)) {
  PetscFunctionBegin;
  *ScatterAndOp = NULL;
  if (PetscMemTypeHost(mtype)) { /* host kernels: atomic is irrelevant on host */
    if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (PetscMemTypeDevice(mtype) && !atomic) { /* plain device kernels */
    if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
  } else if (PetscMemTypeDevice(mtype) && atomic) { /* device kernels using atomics */
    if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
    else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
    else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
    else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
    else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
    else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
    else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
    else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
    else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
    else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}
811 
812 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *)) {
813   PetscFunctionBegin;
814   *FetchAndOp = NULL;
815   PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
816   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
817 #if defined(PETSC_HAVE_DEVICE)
818   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
819   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
820 #endif
821   PetscFunctionReturn(0);
822 }
823 
824 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *)) {
825   PetscFunctionBegin;
826   *FetchAndOpLocal = NULL;
827   PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
828   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
829 #if defined(PETSC_HAVE_DEVICE)
830   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
831   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
832 #endif
833   PetscFunctionReturn(0);
834 }
835 
836 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) {
837   PetscLogDouble flops;
838   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
839 
840   PetscFunctionBegin;
841   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
842     flops = bas->rootbuflen[scope] * link->bs;               /* # of roots in buffer x # of scalars in unit */
843 #if defined(PETSC_HAVE_DEVICE)
844     if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops));
845     else
846 #endif
847       PetscCall(PetscLogFlops(flops));
848   }
849   PetscFunctionReturn(0);
850 }
851 
852 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) {
853   PetscLogDouble flops;
854 
855   PetscFunctionBegin;
856   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
857     flops = sf->leafbuflen[scope] * link->bs;                /* # of roots in buffer x # of scalars in unit */
858 #if defined(PETSC_HAVE_DEVICE)
859     if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops));
860     else
861 #endif
862       PetscCall(PetscLogFlops(flops));
863   }
864   PetscFunctionReturn(0);
865 }
866 
867 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
868   Input Parameters:
869   +sf      - The StarForest
870   .link    - The link
871   .count   - Number of entries to unpack
872   .start   - The first index, significent when indices=NULL
873   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
874   .buf     - A contiguous buffer to unpack from
875   -op      - Operation after unpack
876 
877   Output Parameters:
878   .data    - The data to unpack to
879 */
880 static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op) {
881   PetscFunctionBegin;
882 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
883   {
884     PetscInt i;
885     if (indices) {
886       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
887          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
888       */
889       for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op));
890     } else {
891       PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op));
892     }
893   }
894   PetscFunctionReturn(0);
895 #else
896   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
897 #endif
898 }
899 
900 static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op) {
901   PetscFunctionBegin;
902 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
903   {
904     PetscInt i, disp;
905     if (!srcIdx) {
906       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op));
907     } else {
908       for (i = 0; i < count; i++) {
909         disp = dstIdx ? dstIdx[i] : dstStart + i;
910         PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op));
911       }
912     }
913   }
914   PetscFunctionReturn(0);
915 #else
916   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
917 #endif
918 }
919 
920 /*=============================================================================
921               Pack/Unpack/Fetch/Scatter routines
922  ============================================================================*/
923 
924 /* Pack rootdata to rootbuf
925   Input Parameters:
926   + sf       - The SF this packing works on.
927   . link     - It gives the memtype of the roots and also provides root buffer.
928   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
929   - rootdata - Where to read the roots.
930 
931   Notes:
932   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
933   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
934  */
/* Pack rootdata into the link's root buffer for the given scope; a no-op when rootdata
   is used directly as the buffer (link->rootdirect[scope]).
   NOTE(review): PETSCSF_Pack is also begun/ended in the PetscSFLinkPackRootData wrapper
   that calls this, so the event nests — confirm this double-logging is intended. */
PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) {
  const PetscInt *rootindices = NULL;
  PetscInt        count, start;
  PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
  PetscMemType   rootmtype                                                                                        = link->rootmtype;
  PetscSFPackOpt opt                                                                                              = NULL;

  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
  if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
    PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack));
    PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
  }
  PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
952 
/* Pack leafdata into the link's leaf buffer for the given scope; a no-op when leafdata
   is used directly as the buffer (link->leafdirect[scope]).
   NOTE(review): PETSCSF_Pack is also begun/ended in the PetscSFLinkPackLeafData wrapper
   that calls this, so the event nests — confirm this double-logging is intended. */
PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) {
  const PetscInt *leafindices = NULL;
  PetscInt        count, start;
  PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
  PetscMemType   leafmtype                                                                                        = link->leafmtype;
  PetscSFPackOpt opt                                                                                              = NULL;

  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
  if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
    PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack));
    PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
  }
  PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
971 
/* Pack rootdata to rootbuf, which are in the same memory space.
   For the remote scope it first syncs the device (when rootdata may be on a non-default
   stream) and runs the link's PrePack hook, then packs only if the root buffer is nonempty. */
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) {
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  PetscFunctionBegin;
  if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
    if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
    if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
  }
  /* NOTE(review): PETSCSF_Pack is begun again inside PetscSFLinkPackRootData_Private,
     nesting the event — confirm the logging infrastructure tolerates/intends this */
  PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
  if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata));
  PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
/* Pack leafdata to leafbuf, which are in the same memory space.
   For the remote scope it first syncs the device (when leafdata may be on a non-default
   stream) and runs the link's PrePack hook, then packs only if the leaf buffer is nonempty. */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) {
  PetscFunctionBegin;
  if (scope == PETSCSF_REMOTE) { /* Sync the device if leafdata is not on petsc default stream */
    if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
    if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
  }
  /* NOTE(review): PETSCSF_Pack is begun again inside PetscSFLinkPackLeafData_Private,
     nesting the event — confirm the logging infrastructure tolerates/intends this */
  PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
  if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata));
  PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
998 
/* Unpack the link's root buffer into rootdata with op for the given scope; a no-op when
   rootdata is used directly as the buffer. Prefers a specialized UnpackAndOp kernel
   (atomic when bas->rootdups[scope] indicates duplicate destinations); otherwise falls
   back to MPI_Reduce_local with host-side indices.
   NOTE(review): the fallback reads link->rootbuf[scope][rootmtype] on the host — this
   presumably requires rootbuf to be host-accessible in that case; confirm. */
PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) {
  const PetscInt *rootindices = NULL;
  PetscInt        count, start;
  PetscSF_Basic  *bas                                                                                                    = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
  PetscMemType   rootmtype                                                                                               = link->rootmtype;
  PetscSFPackOpt opt                                                                                                     = NULL;

  PetscFunctionBegin;
  if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp));
    if (UnpackAndOp) {
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
      PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
    } else {
      /* No kernel for this op: fall back to MPI_Reduce_local, which runs on the host */
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices));
      PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op));
    }
  }
  PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op));
  PetscFunctionReturn(0);
}
1021 
/* Unpack the link's leaf buffer into leafdata with op for the given scope; a no-op when
   leafdata is used directly as the buffer. Prefers a specialized UnpackAndOp kernel
   (atomic when sf->leafdups[scope] indicates duplicate destinations); otherwise falls
   back to MPI_Reduce_local with host-side indices.
   NOTE(review): the fallback reads link->leafbuf[scope][leafmtype] on the host — this
   presumably requires leafbuf to be host-accessible in that case; confirm. */
PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) {
  const PetscInt *leafindices = NULL;
  PetscInt        count, start;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
  PetscMemType   leafmtype                                                                                               = link->leafmtype;
  PetscSFPackOpt opt                                                                                                     = NULL;

  PetscFunctionBegin;
  if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp));
    if (UnpackAndOp) {
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
      PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
    } else {
      /* No kernel for this op: fall back to MPI_Reduce_local, which runs on the host */
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices));
      PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op));
    }
  }
  PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op));
  PetscFunctionReturn(0);
}
/* Unpack rootbuf to rootdata, which are in the same memory space.
   For the remote scope, after unpacking it runs the link's PostUnpack hook and syncs
   the device (when rootdata may be consumed on a non-default stream). */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) {
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
  if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op));
  PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
  if (scope == PETSCSF_REMOTE) {
    if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
    if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
  }
  PetscFunctionReturn(0);
}
1057 
/* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype).
   For the remote scope, after unpacking it runs the link's PostUnpack hook and syncs
   the device (when leafdata may be consumed on a non-default stream). */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) {
  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
  if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op));
  PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
  if (scope == PETSCSF_REMOTE) {
    if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
    if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
  }
  PetscFunctionReturn(0);
}
1070 
/* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf.
   Only ops supported by PetscSFLinkGetFetchAndOp (MPI_SUM/MPIU_SUM) are accepted; that
   lookup errors otherwise. An atomic kernel is chosen when the remote root buffer has
   duplicate destinations (bas->rootdups[PETSCSF_REMOTE]). */
PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op) {
  const PetscInt *rootindices = NULL;
  PetscInt        count, start;
  PetscSF_Basic  *bas                                                                                             = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL;
  PetscMemType   rootmtype                                                                                        = link->rootmtype;
  PetscSFPackOpt opt                                                                                              = NULL;

  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
  if (bas->rootbuflen[PETSCSF_REMOTE]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp));
    PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices));
    PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype]));
  }
  PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op));
  PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1092 
/* Scatter the local (on-process) portion between rootdata and leafdata, combining with op
   at the destination. direction selects rootdata->leafdata (PETSCSF_ROOT2LEAF) or
   leafdata->rootdata (PETSCSF_LEAF2ROOT).

   Two regimes:
   1) root and leaf data live in different memory types (e.g. host vs device): no single
      kernel can touch both, so the scatter is done as pack(src) -> Memcpy across memory
      spaces -> unpack(dst, op).
   2) same memory type: use a fused ScatterAndOp kernel when one exists for
      (dst memtype, op, dup-status); otherwise fall back to MPI_Reduce_local on host indices. */
PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op) {
  const PetscInt *rootindices = NULL, *leafindices = NULL;
  PetscInt        count, rootstart, leafstart;
  PetscSF_Basic  *bas                                                                                                                                                 = (PetscSF_Basic *)sf->data;
  PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL;
  PetscMemType   rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype;
  PetscSFPackOpt leafopt = NULL, rootopt = NULL;
  PetscInt       buflen = sf->leafbuflen[PETSCSF_LOCAL];
  char          *srcbuf = NULL, *dstbuf = NULL;
  PetscBool      dstdups;

  PetscFunctionBegin;
  if (!buflen) PetscFunctionReturn(0); /* no local edges, nothing to scatter */
  if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
    /* Pack the source side into its local buffer, then note src/dst buffers for the copy below */
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata));
      srcmtype = rootmtype;
      srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
      dstmtype = leafmtype;
      dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
    } else {
      PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata));
      srcmtype = leafmtype;
      srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
      dstmtype = rootmtype;
      dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
    }
    PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes));
    /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
    if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link));
    /* Unpack the destination-side buffer into the destination data, applying op */
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op));
    } else {
      PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op));
    }
  } else {
    /* Same memory space: dup-status of the DESTINATION indices decides whether the kernel
       must use atomics (see PetscSFSetUpPackFields) */
    dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
    dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
    PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp));
    if (ScatterAndOp) {
      /* Fused path: one kernel reads the source and applies op at the destination directly */
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata));
      } else {
        PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata));
      }
    } else {
      /* Fallback: no fused kernel for this op/memtype; use MPI_Reduce_local with host-side indices */
      PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
      PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op));
      } else {
        PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op));
      }
    }
  }
  PetscFunctionReturn(0);
}
1152 
1153 /* Fetch rootdata to leafdata and leafupdate locally */
1154 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op) {
1155   const PetscInt *rootindices = NULL, *leafindices = NULL;
1156   PetscInt        count, rootstart, leafstart;
1157   PetscSF_Basic  *bas                                                                                                                                                            = (PetscSF_Basic *)sf->data;
1158   PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1159   const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype;
1160   PetscSFPackOpt     leafopt = NULL, rootopt = NULL;
1161 
1162   PetscFunctionBegin;
1163   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1164   if (rootmtype != leafmtype) {
1165     /* The local communication has to go through pack and unpack */
1166     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1167   } else {
1168     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
1169     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
1170     PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal));
1171     PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate));
1172   }
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 /*
1177   Create per-rank pack/unpack optimizations based on indice patterns
1178 
1179    Input Parameters:
1180   +  n       - Number of destination ranks
1181   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1182   -  idx     - [*]   Array storing indices
1183 
1184    Output Parameters:
1185   +  opt     - Pack optimizations. NULL if no optimizations.
1186 */
/* For each rank r, test whether its indices form a regular 3D sub-block pattern
     idx[offset[r] + (k*dy + j)*dx + i] == start + X*Y*k + X*j + i   for 0<=i<dx, 0<=j<dy, 0<=k<dz
   i.e. a dx*dy*dz block embedded in an X*Y*(>=dz) array. If every rank matches, record the
   parameters in *out so pack/unpack can run strided loops; otherwise return *out = NULL. */
PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out) {
  PetscInt       r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  PetscCall(PetscMalloc1(1, &opt));
  /* One flat allocation: array[0]=n, then offset[n+1], start[n], dx[n], dy[n], dz[n], X[n], Y[n] */
  PetscCall(PetscMalloc1(7 * n + 2, &opt->array));
  opt->n = opt->array[0] = n;
  opt->offset            = opt->array + 1;
  opt->start             = opt->array + n + 2;
  opt->dx                = opt->array + 2 * n + 2;
  opt->dy                = opt->array + 3 * n + 2;
  opt->dz                = opt->array + 4 * n + 2;
  opt->X                 = opt->array + 5 * n + 2;
  opt->Y                 = opt->array + 6 * n + 2;

  for (r = 0; r < n; r++) {            /* For each destination rank */
    m     = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension: count the leading run of consecutive indices */
    for (dx = 1; dx < m; dx++, p++) {
      if (start + dx != idx[p]) break;
    }

    dydz = m / dx;
    /* X is the row stride of the embedding array; only read idx[p] when another row exists
       (if dydz <= 1 the whole range was consumed and p may be one past this rank's indices) */
    X    = dydz > 1 ? (idx[p] - start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m % dx || X <= 0) {
      optimizable = PETSC_FALSE;
      goto finish;
    }
    for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension: each row must repeat the x-walk shifted by X */
      for (i = 0; i < dx; i++, p++) {
        if (start + X * dy + i != idx[p]) {
          if (i) {
            optimizable = PETSC_FALSE;
            goto finish;
          } /* The pattern is violated in the middle of an x-walk */
          else
            goto Z_dimension; /* Row broke at i==0: dy complete rows found; the rest may be further z-planes */
        }
      }
    }

  Z_dimension:
    /* Note dy==1 when the y-loop never ran (dydz <= 1), set by the for-initializer */
    dz = m / (dx * dy);
    /* Y is the plane stride (in rows); only read idx[p] when another plane exists */
    Y  = dz > 1 ? (idx[p] - start) / X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m % (dx * dy) || Y <= 0) {
      optimizable = PETSC_FALSE;
      goto finish;
    }
    for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j = 0; j < dy; j++) {
        for (i = 0; i < dx; i++, p++) {
          if (start + X * Y * k + X * j + i != idx[p]) {
            optimizable = PETSC_FALSE;
            goto finish;
          }
        }
      }
    }
    /* Rank r matched: record its block parameters */
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    PetscCall(PetscFree(opt->array));
    PetscCall(PetscFree(opt));
    *out = NULL;
  } else {
    /* Rebuild offsets relative to 0 (offset[0] from the caller need not be 0) */
    opt->offset[0] = 0;
    for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}
1274 
1275 static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out) {
1276   PetscSFPackOpt opt = *out;
1277 
1278   PetscFunctionBegin;
1279   if (opt) {
1280     PetscCall(PetscSFFree(sf, mtype, opt->array));
1281     PetscCall(PetscFree(opt));
1282     *out = NULL;
1283   }
1284   PetscFunctionReturn(0);
1285 }
1286 
1287 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) {
1288   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1289   PetscInt       i, j;
1290 
1291   PetscFunctionBegin;
1292   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1293   for (i = 0; i < 2; i++) { /* Set defaults */
1294     sf->leafstart[i]   = 0;
1295     sf->leafcontig[i]  = PETSC_TRUE;
1296     sf->leafdups[i]    = PETSC_FALSE;
1297     bas->rootstart[i]  = 0;
1298     bas->rootcontig[i] = PETSC_TRUE;
1299     bas->rootdups[i]   = PETSC_FALSE;
1300   }
1301 
1302   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1303   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1304 
1305   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1306   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1307 
1308   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1309   for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */
1310     if (sf->rmine[i] != sf->leafstart[0] + i) {
1311       sf->leafcontig[0] = PETSC_FALSE;
1312       break;
1313     }
1314   }
1315   for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */
1316     if (sf->rmine[i] != sf->leafstart[1] + j) {
1317       sf->leafcontig[1] = PETSC_FALSE;
1318       break;
1319     }
1320   }
1321 
1322   /* If not, see if we can have per-rank optimizations by doing index analysis */
1323   if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]));
1324   if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1]));
1325 
1326   /* Are root indices for self and remote contiguous? */
1327   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1328   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1329 
1330   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1331   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1332 
1333   for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) {
1334     if (bas->irootloc[i] != bas->rootstart[0] + i) {
1335       bas->rootcontig[0] = PETSC_FALSE;
1336       break;
1337     }
1338   }
1339   for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) {
1340     if (bas->irootloc[i] != bas->rootstart[1] + j) {
1341       bas->rootcontig[1] = PETSC_FALSE;
1342       break;
1343     }
1344   }
1345 
1346   if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]));
1347   if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]));
1348 
1349   /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1350   if (PetscDefined(HAVE_DEVICE)) {
1351     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1352     if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]));
1353     if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1]));
1354     if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]));
1355     if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1]));
1356   }
1357   PetscFunctionReturn(0);
1358 }
1359 
1360 PetscErrorCode PetscSFResetPackFields(PetscSF sf) {
1361   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1362   PetscInt       i;
1363 
1364   PetscFunctionBegin;
1365   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
1366     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i]));
1367     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i]));
1368 #if defined(PETSC_HAVE_DEVICE)
1369     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i]));
1370     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i]));
1371 #endif
1372   }
1373   PetscFunctionReturn(0);
1374 }
1375