xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 7d0f5c0489d1b77f553b853d4f8ff06a242208dc) !
1 #include "petsc/private/sfimpl.h"
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
6 
7 /*
8  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
9  * therefore we pack data types manually. This file defines packing routines for the standard data types.
10  */
11 
12 #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
13 
14 /* Operations working like s += t */
15 #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
16 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
17 #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
18 #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
19 /* Ref MPI MAXLOC */
20 #define OP_XLOC(op,s,t) \
21   do {                                       \
22     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
23     else if (!((s).u op (t).u)) s = t;           \
24   } while (0)
25 
26 /* DEF_PackFunc - macro defining a Pack routine
27 
28    Arguments of the macro:
29    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30    .BS        Block size for vectorization. It is a factor of bsz.
31    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
32 
33    Arguments of the Pack routine:
34    +count     Number of indices in idx[].
35    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
36    .opt       Per-pack optimization plan. NULL means no such plan.
37    .idx       Indices of entries to packed.
38    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
40    -packed    Address of the packed data.
41  */
42 #define DEF_PackFunc(Type,BS,EQ) \
43   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
44   {                                                                                                          \
45     const Type     *u = (const Type*)unpacked,*u2;                                                           \
46     Type           *p = (Type*)packed,*p2;                                                                   \
47     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
48     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
49     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
50     PetscFunctionBegin;                                                                                      \
51     if (!idx) PetscCall(PetscArraycpy(p,u+start*MBS,MBS*count));/* idx[] are contiguous */                     \
52     else if (opt) { /* has optimizations available */                                                        \
53       p2 = p;                                                                                                \
54       for (r=0; r<opt->n; r++) {                                                                             \
55         u2 = u + opt->start[r]*MBS;                                                                          \
56         X  = opt->X[r];                                                                                      \
57         Y  = opt->Y[r];                                                                                      \
58         for (k=0; k<opt->dz[r]; k++)                                                                         \
59           for (j=0; j<opt->dy[r]; j++) {                                                                     \
60             PetscCall(PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS));                                    \
61             p2  += opt->dx[r]*MBS;                                                                           \
62           }                                                                                                  \
63       }                                                                                                      \
64     } else {                                                                                                 \
65       for (i=0; i<count; i++)                                                                                \
66         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
67           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
68             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
69     }                                                                                                        \
70     PetscFunctionReturn(0);                                                                                  \
71   }
72 
73 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
74                 and inserts into a sparse array.
75 
76    Arguments:
77   .Type       Type of the data
78   .BS         Block size for vectorization
79   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
80 
81   Notes:
82    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
83  */
84 #define DEF_UnpackFunc(Type,BS,EQ)               \
85   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
86   {                                                                                                          \
87     Type           *u = (Type*)unpacked,*u2;                                                                 \
88     const Type     *p = (const Type*)packed;                                                                 \
89     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
90     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
91     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
92     PetscFunctionBegin;                                                                                      \
93     if (!idx) {                                                                                              \
94       u += start*MBS;                                                                                        \
95       if (u != p) PetscCall(PetscArraycpy(u,p,count*MBS));                                                     \
96     } else if (opt) { /* has optimizations available */                                                      \
97       for (r=0; r<opt->n; r++) {                                                                             \
98         u2 = u + opt->start[r]*MBS;                                                                          \
99         X  = opt->X[r];                                                                                      \
100         Y  = opt->Y[r];                                                                                      \
101         for (k=0; k<opt->dz[r]; k++)                                                                         \
102           for (j=0; j<opt->dy[r]; j++) {                                                                     \
103             PetscCall(PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS));                                     \
104             p   += opt->dx[r]*MBS;                                                                           \
105           }                                                                                                  \
106       }                                                                                                      \
107     } else {                                                                                                 \
108       for (i=0; i<count; i++)                                                                                \
109         for (j=0; j<M; j++)                                                                                  \
110           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
111     }                                                                                                        \
112     PetscFunctionReturn(0);                                                                                  \
113   }
114 
115 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
116 
117    Arguments:
118   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
119   .Type       Type of the data
120   .BS         Block size for vectorization
121   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
122   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
123   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
124  */
125 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
126   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
127   {                                                                                                          \
128     Type           *u = (Type*)unpacked,*u2;                                                                 \
129     const Type     *p = (const Type*)packed;                                                                 \
130     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
131     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
132     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
133     PetscFunctionBegin;                                                                                      \
134     if (!idx) {                                                                                              \
135       u += start*MBS;                                                                                        \
136       for (i=0; i<count; i++)                                                                                \
137         for (j=0; j<M; j++)                                                                                  \
138           for (k=0; k<BS; k++)                                                                               \
139             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
140     } else if (opt) { /* idx[] has patterns */                                                               \
141       for (r=0; r<opt->n; r++) {                                                                             \
142         u2 = u + opt->start[r]*MBS;                                                                          \
143         X  = opt->X[r];                                                                                      \
144         Y  = opt->Y[r];                                                                                      \
145         for (k=0; k<opt->dz[r]; k++)                                                                         \
146           for (j=0; j<opt->dy[r]; j++) {                                                                     \
147             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
148             p += opt->dx[r]*MBS;                                                                             \
149           }                                                                                                  \
150       }                                                                                                      \
151     } else {                                                                                                 \
152       for (i=0; i<count; i++)                                                                                \
153         for (j=0; j<M; j++)                                                                                  \
154           for (k=0; k<BS; k++)                                                                               \
155             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
156     }                                                                                                        \
157     PetscFunctionReturn(0);                                                                                  \
158   }
159 
160 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
161   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
162   {                                                                                                          \
163     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
164     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
165     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
166     const PetscInt MBS = M*BS;                                                                               \
167     PetscFunctionBegin;                                                                                      \
168     for (i=0; i<count; i++) {                                                                                \
169       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
170       l = i*MBS;                                                                                             \
171       for (j=0; j<M; j++)                                                                                    \
172         for (k=0; k<BS; k++) {                                                                               \
173           tmp = u[r+j*BS+k];                                                                                 \
174           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
175           p[l+j*BS+k] = tmp;                                                                                 \
176         }                                                                                                    \
177     }                                                                                                        \
178     PetscFunctionReturn(0);                                                                                  \
179   }
180 
181 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
182   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
183   {                                                                                                          \
184     const Type     *u = (const Type*)src;                                                                    \
185     Type           *v = (Type*)dst;                                                                          \
186     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
187     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
188     const PetscInt MBS = M*BS;                                                                               \
189     PetscFunctionBegin;                                                                                      \
190     if (!srcIdx) { /* src is contiguous */                                                                   \
191       u += srcStart*MBS;                                                                                     \
192       PetscCall(CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u));              \
193     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
194       u += srcOpt->start[0]*MBS;                                                                             \
195       v += dstStart*MBS;                                                                                     \
196       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
197       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
198         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
199           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
200           v += srcOpt->dx[0]*MBS;                                                                            \
201         }                                                                                                    \
202     } else { /* all other cases */                                                                           \
203       for (i=0; i<count; i++) {                                                                              \
204         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
205         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
206         for (j=0; j<M; j++)                                                                                  \
207           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
208       }                                                                                                      \
209     }                                                                                                        \
210     PetscFunctionReturn(0);                                                                                  \
211   }
212 
213 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
214   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
215   {                                                                                                          \
216     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
217     const Type     *ldata = (const Type*)leafdata;                                                           \
218     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
219     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
220     const PetscInt MBS = M*BS;                                                                               \
221     PetscFunctionBegin;                                                                                      \
222     for (i=0; i<count; i++) {                                                                                \
223       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
224       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
225       for (j=0; j<M; j++)                                                                                    \
226         for (k=0; k<BS; k++) {                                                                               \
227           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
228           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
229         }                                                                                                    \
230     }                                                                                                        \
231     PetscFunctionReturn(0);                                                                                  \
232   }
233 
234 /* Pack, Unpack/Fetch ops */
235 #define DEF_Pack(Type,BS,EQ)                                                      \
236   DEF_PackFunc(Type,BS,EQ)                                                        \
237   DEF_UnpackFunc(Type,BS,EQ)                                                      \
238   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
239   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
240     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
241     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
242     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
243   }
244 
245 /* Add, Mult ops */
246 #define DEF_Add(Type,BS,EQ)                                                       \
247   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
248   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
249   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
250   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
251   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
252   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
253   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
254     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
255     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
256     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
257     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
258     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
259     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
260   }
261 
262 /* Max, Min ops */
263 #define DEF_Cmp(Type,BS,EQ)                                                       \
264   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
265   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
266   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
267   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
268   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
269     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
270     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
271     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
272     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
273   }
274 
275 /* Logical ops.
276   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
277   the compilation warning "empty macro arguments are undefined in ISO C90"
278  */
279 #define DEF_Log(Type,BS,EQ)                                                       \
280   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
281   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
282   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
283   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
284   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
285   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
286   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
287     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
288     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
289     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
290     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
291     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
292     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
293   }
294 
295 /* Bitwise ops */
296 #define DEF_Bit(Type,BS,EQ)                                                       \
297   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
298   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
299   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
300   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
301   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
302   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
303   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
304     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
305     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
306     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
307     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
308     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
309     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
310   }
311 
312 /* Maxloc, Minloc ops */
313 #define DEF_Xloc(Type,BS,EQ)                                                      \
314   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
315   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
316   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
317   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
318   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
319     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
320     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
321     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
322     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
323   }
324 
325 #define DEF_IntegerType(Type,BS,EQ)                                               \
326   DEF_Pack(Type,BS,EQ)                                                            \
327   DEF_Add(Type,BS,EQ)                                                             \
328   DEF_Cmp(Type,BS,EQ)                                                             \
329   DEF_Log(Type,BS,EQ)                                                             \
330   DEF_Bit(Type,BS,EQ)                                                             \
331   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
332     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
333     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
334     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
335     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
336     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
337   }
338 
339 #define DEF_RealType(Type,BS,EQ)                                                  \
340   DEF_Pack(Type,BS,EQ)                                                            \
341   DEF_Add(Type,BS,EQ)                                                             \
342   DEF_Cmp(Type,BS,EQ)                                                             \
343   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
344     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
345     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
346     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
347   }
348 
349 #if defined(PETSC_HAVE_COMPLEX)
350 #define DEF_ComplexType(Type,BS,EQ)                                               \
351   DEF_Pack(Type,BS,EQ)                                                            \
352   DEF_Add(Type,BS,EQ)                                                             \
353   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
354     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
355     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
356   }
357 #endif
358 
359 #define DEF_DumbType(Type,BS,EQ)                                                  \
360   DEF_Pack(Type,BS,EQ)                                                            \
361   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
362     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
363   }
364 
365 /* Maxloc, Minloc */
366 #define DEF_PairType(Type,BS,EQ)                                                  \
367   DEF_Pack(Type,BS,EQ)                                                            \
368   DEF_Xloc(Type,BS,EQ)                                                            \
369   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
370     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
371     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
372   }
373 
374 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
375 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
376 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
377 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
378 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
379 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
380 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
381 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
382 
383 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
384 DEF_IntegerType(int,1,1)
385 DEF_IntegerType(int,2,1)
386 DEF_IntegerType(int,4,1)
387 DEF_IntegerType(int,8,1)
388 DEF_IntegerType(int,1,0)
389 DEF_IntegerType(int,2,0)
390 DEF_IntegerType(int,4,0)
391 DEF_IntegerType(int,8,0)
392 #endif
393 
394 /* The typedefs are used to get a typename without space that CPPJoin can handle */
395 typedef signed char SignedChar;
396 DEF_IntegerType(SignedChar,1,1)
397 DEF_IntegerType(SignedChar,2,1)
398 DEF_IntegerType(SignedChar,4,1)
399 DEF_IntegerType(SignedChar,8,1)
400 DEF_IntegerType(SignedChar,1,0)
401 DEF_IntegerType(SignedChar,2,0)
402 DEF_IntegerType(SignedChar,4,0)
403 DEF_IntegerType(SignedChar,8,0)
404 
405 typedef unsigned char UnsignedChar;
406 DEF_IntegerType(UnsignedChar,1,1)
407 DEF_IntegerType(UnsignedChar,2,1)
408 DEF_IntegerType(UnsignedChar,4,1)
409 DEF_IntegerType(UnsignedChar,8,1)
410 DEF_IntegerType(UnsignedChar,1,0)
411 DEF_IntegerType(UnsignedChar,2,0)
412 DEF_IntegerType(UnsignedChar,4,0)
413 DEF_IntegerType(UnsignedChar,8,0)
414 
415 DEF_RealType(PetscReal,1,1)
416 DEF_RealType(PetscReal,2,1)
417 DEF_RealType(PetscReal,4,1)
418 DEF_RealType(PetscReal,8,1)
419 DEF_RealType(PetscReal,1,0)
420 DEF_RealType(PetscReal,2,0)
421 DEF_RealType(PetscReal,4,0)
422 DEF_RealType(PetscReal,8,0)
423 
424 #if defined(PETSC_HAVE_COMPLEX)
425 DEF_ComplexType(PetscComplex,1,1)
426 DEF_ComplexType(PetscComplex,2,1)
427 DEF_ComplexType(PetscComplex,4,1)
428 DEF_ComplexType(PetscComplex,8,1)
429 DEF_ComplexType(PetscComplex,1,0)
430 DEF_ComplexType(PetscComplex,2,0)
431 DEF_ComplexType(PetscComplex,4,0)
432 DEF_ComplexType(PetscComplex,8,0)
433 #endif
434 
435 #define PairType(Type1,Type2) Type1##_##Type2
436 typedef struct {int u; int i;}           PairType(int,int);
437 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
438 DEF_PairType(PairType(int,int),1,1)
439 DEF_PairType(PairType(PetscInt,PetscInt),1,1)
440 
441 /* If we don't know the basic type, we treat it as a stream of chars or ints */
442 DEF_DumbType(char,1,1)
443 DEF_DumbType(char,2,1)
444 DEF_DumbType(char,4,1)
445 DEF_DumbType(char,1,0)
446 DEF_DumbType(char,2,0)
447 DEF_DumbType(char,4,0)
448 
449 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
450 DEF_DumbType(DumbInt,1,1)
451 DEF_DumbType(DumbInt,2,1)
452 DEF_DumbType(DumbInt,4,1)
453 DEF_DumbType(DumbInt,8,1)
454 DEF_DumbType(DumbInt,1,0)
455 DEF_DumbType(DumbInt,2,0)
456 DEF_DumbType(DumbInt,4,0)
457 DEF_DumbType(DumbInt,8,0)
458 
459 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
460 {
461   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
462   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
463 
464   PetscFunctionBegin;
465   /* Destroy device-specific fields */
466   if (link->deviceinited) PetscCall((*link->Destroy)(sf,link));
467 
468   /* Destroy host related fields */
469   if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit));
470   if (!link->use_nvshmem) {
471     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
472       if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i]));
473     }
474     PetscCall(PetscFree(link->reqs));
475     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
476       PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]));
477       PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]));
478     }
479   }
480   PetscCall(PetscFree(link));
481   PetscFunctionReturn(0);
482 }
483 
484 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
485 {
486   PetscFunctionBegin;
487   PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata));
488  #if defined(PETSC_HAVE_NVSHMEM)
489   {
490     PetscBool use_nvshmem;
491     PetscCall(PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem));
492     if (use_nvshmem) {
493       PetscCall(PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink));
494       PetscFunctionReturn(0);
495     }
496   }
497  #endif
498   PetscCall(PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink));
499   PetscFunctionReturn(0);
500 }
501 
502 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
503    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
504 */
505 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
506 {
507   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
508   PetscInt             i,j,cnt,nrootranks,ndrootranks,nleafranks,ndleafranks;
509   const PetscInt       *rootoffset,*leafoffset;
510   MPI_Aint             disp;
511   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
512   MPI_Datatype         unit = link->unit;
513   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
514   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
515 
516   PetscFunctionBegin;
517   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
518   if (sf->persistent) {
519     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
520       PetscCall(PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL));
521       if (direction == PETSCSF_LEAF2ROOT) {
522         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
523           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
524           cnt  = rootoffset[i+1]-rootoffset[i];
525           PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j));
526         }
527       } else { /* PETSCSF_ROOT2LEAF */
528         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
529           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
530           cnt  = rootoffset[i+1]-rootoffset[i];
531           PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j));
532         }
533       }
534       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
535     }
536 
537     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
538       PetscCall(PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL));
539       if (direction == PETSCSF_LEAF2ROOT) {
540         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
541           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
542           cnt  = leafoffset[i+1]-leafoffset[i];
543           PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j));
544         }
545       } else { /* PETSCSF_ROOT2LEAF */
546         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
547           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
548           cnt  = leafoffset[i+1]-leafoffset[i];
549           PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j));
550         }
551       }
552       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
553     }
554   }
555   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
556   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
557   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
558   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
559   PetscFunctionReturn(0);
560 }
561 
562 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
563 {
564   PetscSFLink       link,*p;
565   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
566 
567   PetscFunctionBegin;
568   /* Look for types in cache */
569   for (p=&bas->inuse; (link=*p); p=&link->next) {
570     PetscBool match;
571     PetscCall(MPIPetsc_Type_compare(unit,link->unit,&match));
572     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
573       switch (cmode) {
574       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
575       case PETSC_USE_POINTER: break;
576       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
577       }
578       *mylink = link;
579       PetscFunctionReturn(0);
580     }
581   }
582   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
583 }
584 
585 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
586 {
587   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
588   PetscSFLink       link = *mylink;
589 
590   PetscFunctionBegin;
591   link->rootdata = NULL;
592   link->leafdata = NULL;
593   link->next     = bas->avail;
594   bas->avail     = link;
595   *mylink        = NULL;
596   PetscFunctionReturn(0);
597 }
598 
599 /* Error out on unsupported overlapped communications */
600 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
601 {
602   PetscSFLink       link,*p;
603   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
604   PetscBool         match;
605 
606   PetscFunctionBegin;
607   if (PetscDefined(USE_DEBUG)) {
608     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
609        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
610     */
611     if (rootdata || leafdata) {
612       for (p=&bas->inuse; (link=*p); p=&link->next) {
613         PetscCall(MPIPetsc_Type_compare(unit,link->unit,&match));
614         PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata,PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
615       }
616     }
617   }
618   PetscFunctionReturn(0);
619 }
620 
621 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
622 {
623   PetscFunctionBegin;
624   if (n) PetscCall(PetscMemcpy(dst,src,n));
625   PetscFunctionReturn(0);
626 }
627 
628 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
629 {
630   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
631   PetscBool      is2Int,is2PetscInt;
632   PetscMPIInt    ni,na,nd,combiner;
633 #if defined(PETSC_HAVE_COMPLEX)
634   PetscInt       nPetscComplex=0;
635 #endif
636 
637   PetscFunctionBegin;
638   PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar));
639   PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar));
640   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
641   PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt));
642   PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt));
643   PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal));
644 #if defined(PETSC_HAVE_COMPLEX)
645   PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex));
646 #endif
647   PetscCall(MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int));
648   PetscCall(MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt));
649   /* TODO: shaell we also handle Fortran MPI_2REAL? */
650   PetscCallMPI(MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner));
651   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
652   link->bs = 1; /* default */
653 
654   if (is2Int) {
655     PackInit_PairType_int_int_1_1(link);
656     link->bs        = 1;
657     link->unitbytes = 2*sizeof(int);
658     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
659     link->basicunit = MPI_2INT;
660     link->unit      = MPI_2INT;
661   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
662     PackInit_PairType_PetscInt_PetscInt_1_1(link);
663     link->bs        = 1;
664     link->unitbytes = 2*sizeof(PetscInt);
665     link->basicunit = MPIU_2INT;
666     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
667     link->unit      = MPIU_2INT;
668   } else if (nPetscReal) {
669     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
670     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
671     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
672     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
673     link->bs        = nPetscReal;
674     link->unitbytes = nPetscReal*sizeof(PetscReal);
675     link->basicunit = MPIU_REAL;
676     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
677   } else if (nPetscInt) {
678     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
679     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
680     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
681     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
682     link->bs        = nPetscInt;
683     link->unitbytes = nPetscInt*sizeof(PetscInt);
684     link->basicunit = MPIU_INT;
685     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
686 #if defined(PETSC_USE_64BIT_INDICES)
687   } else if (nInt) {
688     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
689     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
690     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
691     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
692     link->bs        = nInt;
693     link->unitbytes = nInt*sizeof(int);
694     link->basicunit = MPI_INT;
695     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
696 #endif
697   } else if (nSignedChar) {
698     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
699     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
700     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
701     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
702     link->bs        = nSignedChar;
703     link->unitbytes = nSignedChar*sizeof(SignedChar);
704     link->basicunit = MPI_SIGNED_CHAR;
705     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
706   }  else if (nUnsignedChar) {
707     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
708     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
709     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
710     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
711     link->bs        = nUnsignedChar;
712     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
713     link->basicunit = MPI_UNSIGNED_CHAR;
714     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
715 #if defined(PETSC_HAVE_COMPLEX)
716   } else if (nPetscComplex) {
717     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
718     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
719     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
720     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
721     link->bs        = nPetscComplex;
722     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
723     link->basicunit = MPIU_COMPLEX;
724     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
725 #endif
726   } else {
727     MPI_Aint lb,nbyte;
728     PetscCallMPI(MPI_Type_get_extent(unit,&lb,&nbyte));
729     PetscCheck(lb == 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb);
730     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
731       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
732       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
733       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
734       link->bs        = nbyte;
735       link->unitbytes = nbyte;
736       link->basicunit = MPI_BYTE;
737     } else {
738       nInt = nbyte / sizeof(int);
739       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
740       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
741       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
742       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
743       link->bs        = nInt;
744       link->unitbytes = nbyte;
745       link->basicunit = MPI_INT;
746     }
747     if (link->isbuiltin) link->unit = unit;
748   }
749 
750   if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit,&link->unit));
751 
752   link->Memcpy = PetscSFLinkMemcpy_Host;
753   PetscFunctionReturn(0);
754 }
755 
756 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
757 {
758   PetscFunctionBegin;
759   *UnpackAndOp = NULL;
760   if (PetscMemTypeHost(mtype)) {
761     if      (op == MPI_REPLACE)               *UnpackAndOp = link->h_UnpackAndInsert;
762     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
763     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
764     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
765     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
766     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
767     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
768     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
769     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
770     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
771     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
772     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
773     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
774   }
775 #if defined(PETSC_HAVE_DEVICE)
776   else if (PetscMemTypeDevice(mtype) && !atomic) {
777     if      (op == MPI_REPLACE)               *UnpackAndOp = link->d_UnpackAndInsert;
778     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
779     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
780     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
781     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
782     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
783     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
784     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
785     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
786     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
787     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
788     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
789     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
790   } else if (PetscMemTypeDevice(mtype) && atomic) {
791     if      (op == MPI_REPLACE)               *UnpackAndOp = link->da_UnpackAndInsert;
792     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
793     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
794     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
795     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
796     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
797     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
798     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
799     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
800     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
801     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
802     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
803     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
804   }
805 #endif
806   PetscFunctionReturn(0);
807 }
808 
809 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
810 {
811   PetscFunctionBegin;
812   *ScatterAndOp = NULL;
813   if (PetscMemTypeHost(mtype)) {
814     if      (op == MPI_REPLACE)               *ScatterAndOp = link->h_ScatterAndInsert;
815     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
816     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
817     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
818     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
819     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
820     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
821     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
822     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
823     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
824     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
825     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
826     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
827   }
828 #if defined(PETSC_HAVE_DEVICE)
829   else if (PetscMemTypeDevice(mtype) && !atomic) {
830     if      (op == MPI_REPLACE)               *ScatterAndOp = link->d_ScatterAndInsert;
831     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
832     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
833     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
834     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
835     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
836     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
837     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
838     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
839     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
840     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
841     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
842     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
843   } else if (PetscMemTypeDevice(mtype) && atomic) {
844     if      (op == MPI_REPLACE)               *ScatterAndOp = link->da_ScatterAndInsert;
845     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
846     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
847     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
848     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
849     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
850     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
851     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
852     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
853     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
854     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
855     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
856     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
857   }
858 #endif
859   PetscFunctionReturn(0);
860 }
861 
862 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
863 {
864   PetscFunctionBegin;
865   *FetchAndOp = NULL;
866   PetscCheck(op == MPI_SUM || op == MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
867   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
868 #if defined(PETSC_HAVE_DEVICE)
869   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
870   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOp = link->da_FetchAndAdd;
871 #endif
872   PetscFunctionReturn(0);
873 }
874 
875 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
876 {
877   PetscFunctionBegin;
878   *FetchAndOpLocal = NULL;
879   PetscCheck(op == MPI_SUM || op == MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
880   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
881 #if defined(PETSC_HAVE_DEVICE)
882   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
883   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
884 #endif
885   PetscFunctionReturn(0);
886 }
887 
888 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
889 {
890   PetscLogDouble flops;
891   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
892 
893   PetscFunctionBegin;
894   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
895     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
896 #if defined(PETSC_HAVE_DEVICE)
897     if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops)); else
898 #endif
899     PetscCall(PetscLogFlops(flops));
900   }
901   PetscFunctionReturn(0);
902 }
903 
904 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
905 {
906   PetscLogDouble flops;
907 
908   PetscFunctionBegin;
909   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
910     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
911 #if defined(PETSC_HAVE_DEVICE)
912     if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops)); else
913 #endif
914     PetscCall(PetscLogFlops(flops));
915   }
916   PetscFunctionReturn(0);
917 }
918 
919 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
920   Input Parameters:
921   +sf      - The StarForest
922   .link    - The link
923   .count   - Number of entries to unpack
924   .start   - The first index, significent when indices=NULL
925   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
926   .buf     - A contiguous buffer to unpack from
927   -op      - Operation after unpack
928 
929   Output Parameters:
930   .data    - The data to unpack to
931 */
932 static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
933 {
934   PetscFunctionBegin;
935 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
936   {
937     PetscInt       i;
938     if (indices) {
939       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
940          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
941       */
942       for (i=0; i<count; i++) PetscCallMPI(MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op));
943     } else {
944       PetscCallMPI(MPIU_Reduce_local(buf,(char*)data+start*link->unitbytes,count,link->unit,op));
945     }
946   }
947   PetscFunctionReturn(0);
948 #else
949   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
950 #endif
951 }
952 
953 static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
954 {
955   PetscFunctionBegin;
956 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
957   {
958     PetscInt       i,disp;
959     if (!srcIdx) {
960       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op));
961     } else {
962       for (i=0; i<count; i++) {
963         disp = dstIdx? dstIdx[i] : dstStart + i;
964         PetscCallMPI(MPIU_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op));
965       }
966     }
967   }
968   PetscFunctionReturn(0);
969 #else
970   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
971 #endif
972 }
973 
974 /*=============================================================================
975               Pack/Unpack/Fetch/Scatter routines
976  ============================================================================*/
977 
978 /* Pack rootdata to rootbuf
979   Input Parameters:
980   + sf       - The SF this packing works on.
981   . link     - It gives the memtype of the roots and also provides root buffer.
982   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
983   - rootdata - Where to read the roots.
984 
985   Notes:
986   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
987   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
988  */
989 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
990 {
991   const PetscInt   *rootindices = NULL;
992   PetscInt         count,start;
993   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
994   PetscMemType     rootmtype = link->rootmtype;
995   PetscSFPackOpt   opt = NULL;
996 
997   PetscFunctionBegin;
998   PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0));
999   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1000     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices));
1001     PetscCall(PetscSFLinkGetPack(link,rootmtype,&Pack));
1002     PetscCall((*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]));
1003   }
1004   PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0));
1005   PetscFunctionReturn(0);
1006 }
1007 
1008 /* Pack leafdata to leafbuf */
1009 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1010 {
1011   const PetscInt   *leafindices = NULL;
1012   PetscInt         count,start;
1013   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1014   PetscMemType     leafmtype = link->leafmtype;
1015   PetscSFPackOpt   opt = NULL;
1016 
1017   PetscFunctionBegin;
1018   PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0));
1019   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1020     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices));
1021     PetscCall(PetscSFLinkGetPack(link,leafmtype,&Pack));
1022     PetscCall((*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]));
1023   }
1024   PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0));
1025   PetscFunctionReturn(0);
1026 }
1027 
1028 /* Pack rootdata to rootbuf, which are in the same memory space */
1029 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1030 {
1031   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1032 
1033   PetscFunctionBegin;
1034   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
1035     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1036     if (link->PrePack) PetscCall((*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
1037   }
1038   PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0));
1039   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf,link,scope,rootdata));
1040   PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0));
1041   PetscFunctionReturn(0);
1042 }
1043 /* Pack leafdata to leafbuf, which are in the same memory space */
1044 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1045 {
1046   PetscFunctionBegin;
1047   if (scope == PETSCSF_REMOTE) {
1048     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1049     if (link->PrePack) PetscCall((*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT));  /* Used by SF nvshmem */
1050   }
1051   PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0));
1052   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata));
1053   PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0));
1054   PetscFunctionReturn(0);
1055 }
1056 
1057 PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1058 {
1059   const PetscInt   *rootindices = NULL;
1060   PetscInt         count,start;
1061   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1062   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1063   PetscMemType     rootmtype = link->rootmtype;
1064   PetscSFPackOpt   opt = NULL;
1065 
1066   PetscFunctionBegin;
1067   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1068     PetscCall(PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp));
1069     if (UnpackAndOp) {
1070       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices));
1071       PetscCall((*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]));
1072     } else {
1073       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices));
1074       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op));
1075     }
1076   }
1077   PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op));
1078   PetscFunctionReturn(0);
1079 }
1080 
1081 PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1082 {
1083   const PetscInt   *leafindices = NULL;
1084   PetscInt         count,start;
1085   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1086   PetscMemType     leafmtype = link->leafmtype;
1087   PetscSFPackOpt   opt = NULL;
1088 
1089   PetscFunctionBegin;
1090   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1091     PetscCall(PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp));
1092     if (UnpackAndOp) {
1093       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices));
1094       PetscCall((*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]));
1095     } else {
1096       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices));
1097       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op));
1098     }
1099   }
1100   PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op));
1101   PetscFunctionReturn(0);
1102 }
1103 /* Unpack rootbuf to rootdata, which are in the same memory space */
1104 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1105 {
1106   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1107 
1108   PetscFunctionBegin;
1109   PetscCall(PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0));
1110   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op));
1111   PetscCall(PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0));
1112   if (scope == PETSCSF_REMOTE) {
1113     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT));  /* Used by SF nvshmem */
1114     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1115   }
1116   PetscFunctionReturn(0);
1117 }
1118 
1119 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1120 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1121 {
1122   PetscFunctionBegin;
1123   PetscCall(PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0));
1124   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op));
1125   PetscCall(PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0));
1126   if (scope == PETSCSF_REMOTE) {
1127     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF));  /* Used by SF nvshmem */
1128     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1129   }
1130   PetscFunctionReturn(0);
1131 }
1132 
1133 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
1134 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1135 {
1136   const PetscInt     *rootindices = NULL;
1137   PetscInt           count,start;
1138   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1139   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1140   PetscMemType       rootmtype = link->rootmtype;
1141   PetscSFPackOpt     opt = NULL;
1142 
1143   PetscFunctionBegin;
1144   PetscCall(PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0));
1145   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1146     /* Do FetchAndOp on rootdata with rootbuf */
1147     PetscCall(PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp));
1148     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices));
1149     PetscCall((*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]));
1150   }
1151   PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op));
1152   PetscCall(PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0));
1153   PetscFunctionReturn(0);
1154 }
1155 
1156 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1157 {
1158   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1159   PetscInt             count,rootstart,leafstart;
1160   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1161   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1162   PetscMemType         rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1163   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1164   PetscInt             buflen = sf->leafbuflen[PETSCSF_LOCAL];
1165   char                 *srcbuf = NULL,*dstbuf = NULL;
1166   PetscBool            dstdups;
1167 
1168   PetscFunctionBegin;
1169   if (!buflen) PetscFunctionReturn(0);
1170   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
1171     if (direction == PETSCSF_ROOT2LEAF) {
1172       PetscCall(PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata));
1173       srcmtype = rootmtype;
1174       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1175       dstmtype = leafmtype;
1176       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1177     } else {
1178       PetscCall(PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata));
1179       srcmtype = leafmtype;
1180       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1181       dstmtype = rootmtype;
1182       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1183     }
1184     PetscCall((*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes));
1185     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
1186     if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link));
1187     if (direction == PETSCSF_ROOT2LEAF) {
1188       PetscCall(PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op));
1189     } else {
1190       PetscCall(PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op));
1191     }
1192   } else {
1193     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1194     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1195     PetscCall(PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp));
1196     if (ScatterAndOp) {
1197       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices));
1198       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices));
1199       if (direction == PETSCSF_ROOT2LEAF) {
1200         PetscCall((*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata));
1201       } else {
1202         PetscCall((*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata));
1203       }
1204     } else {
1205       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices));
1206       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices));
1207       if (direction == PETSCSF_ROOT2LEAF) {
1208         PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op));
1209       } else {
1210         PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op));
1211       }
1212     }
1213   }
1214   PetscFunctionReturn(0);
1215 }
1216 
1217 /* Fetch rootdata to leafdata and leafupdate locally */
1218 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1219 {
1220   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1221   PetscInt             count,rootstart,leafstart;
1222   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1223   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1224   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1225   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1226 
1227   PetscFunctionBegin;
1228   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1229   if (rootmtype != leafmtype) {
1230     /* The local communication has to go through pack and unpack */
1231     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1232   } else {
1233     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices));
1234     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices));
1235     PetscCall(PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal));
1236     PetscCall((*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate));
1237   }
1238   PetscFunctionReturn(0);
1239 }
1240 
1241 /*
1242   Create per-rank pack/unpack optimizations based on indice patterns
1243 
1244    Input Parameters:
1245   +  n       - Number of destination ranks
1246   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1247   -  idx     - [*]   Array storing indices
1248 
1249    Output Parameters:
1250   +  opt     - Pack optimizations. NULL if no optimizations.
1251 */
1252 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1253 {
1254   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1255   PetscBool      optimizable = PETSC_TRUE;
1256   PetscSFPackOpt opt;
1257 
1258   PetscFunctionBegin;
1259   PetscCall(PetscMalloc1(1,&opt));
1260   PetscCall(PetscMalloc1(7*n+2,&opt->array));
1261   opt->n      = opt->array[0] = n;
1262   opt->offset = opt->array + 1;
1263   opt->start  = opt->array + n   + 2;
1264   opt->dx     = opt->array + 2*n + 2;
1265   opt->dy     = opt->array + 3*n + 2;
1266   opt->dz     = opt->array + 4*n + 2;
1267   opt->X      = opt->array + 5*n + 2;
1268   opt->Y      = opt->array + 6*n + 2;
1269 
1270   for (r=0; r<n; r++) { /* For each destination rank */
1271     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1272     p     = offset[r];
1273     start = idx[p]; /* First index for this rank */
1274     p++;
1275 
1276     /* Search in X dimension */
1277     for (dx=1; dx<m; dx++,p++) {
1278       if (start+dx != idx[p]) break;
1279     }
1280 
1281     dydz = m/dx;
1282     X    = dydz > 1 ? (idx[p]-start) : dx;
1283     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1284     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1285     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1286       for (i=0; i<dx; i++,p++) {
1287         if (start+X*dy+i != idx[p]) {
1288           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1289           else goto Z_dimension;
1290         }
1291       }
1292     }
1293 
1294 Z_dimension:
1295     dz = m/(dx*dy);
1296     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1297     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1298     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1299     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1300       for (j=0; j<dy; j++) {
1301         for (i=0; i<dx; i++,p++) {
1302           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1303         }
1304       }
1305     }
1306     opt->start[r] = start;
1307     opt->dx[r]    = dx;
1308     opt->dy[r]    = dy;
1309     opt->dz[r]    = dz;
1310     opt->X[r]     = X;
1311     opt->Y[r]     = Y;
1312   }
1313 
1314 finish:
1315   /* If not optimizable, free arrays to save memory */
1316   if (!n || !optimizable) {
1317     PetscCall(PetscFree(opt->array));
1318     PetscCall(PetscFree(opt));
1319     *out = NULL;
1320   } else {
1321     opt->offset[0] = 0;
1322     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1323     *out = opt;
1324   }
1325   PetscFunctionReturn(0);
1326 }
1327 
1328 static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1329 {
1330   PetscSFPackOpt opt = *out;
1331 
1332   PetscFunctionBegin;
1333   if (opt) {
1334     PetscCall(PetscSFFree(sf,mtype,opt->array));
1335     PetscCall(PetscFree(opt));
1336     *out = NULL;
1337   }
1338   PetscFunctionReturn(0);
1339 }
1340 
1341 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1342 {
1343   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1344   PetscInt       i,j;
1345 
1346   PetscFunctionBegin;
1347   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1348   for (i=0; i<2; i++) { /* Set defaults */
1349     sf->leafstart[i]   = 0;
1350     sf->leafcontig[i]  = PETSC_TRUE;
1351     sf->leafdups[i]    = PETSC_FALSE;
1352     bas->rootstart[i]  = 0;
1353     bas->rootcontig[i] = PETSC_TRUE;
1354     bas->rootdups[i]   = PETSC_FALSE;
1355   }
1356 
1357   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1358   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1359 
1360   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1361   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1362 
1363   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1364   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1365     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1366   }
1367   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1368     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1369   }
1370 
1371   /* If not, see if we can have per-rank optimizations by doing index analysis */
1372   if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]));
1373   if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]));
1374 
1375   /* Are root indices for self and remote contiguous? */
1376   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1377   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1378 
1379   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1380   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1381 
1382   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1383     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1384   }
1385   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1386     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1387   }
1388 
1389   if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]));
1390   if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]));
1391 
1392     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1393   if (PetscDefined(HAVE_DEVICE)) {
1394     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1395     if (!sf->leafcontig[0]  && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]));
1396     if (!sf->leafcontig[1]  && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]));
1397     if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]));
1398     if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]));
1399   }
1400   PetscFunctionReturn(0);
1401 }
1402 
1403 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1404 {
1405   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1406   PetscInt       i;
1407 
1408   PetscFunctionBegin;
1409   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1410     PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]));
1411     PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]));
1412    #if defined(PETSC_HAVE_DEVICE)
1413     PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]));
1414     PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]));
1415    #endif
1416   }
1417   PetscFunctionReturn(0);
1418 }
1419