xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 3b5e53cdb169e94712b800a2d1c9e10766bebf6e)
1 #include "petsc/private/sfimpl.h"
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
6 
7 #if defined(PETSC_HAVE_CUDA)
8 #include <cuda_runtime.h>
9 #include <petsccublas.h>
10 #endif
11 /*
12  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
13  * therefore we pack data types manually. This file defines packing routines for the standard data types.
14  */
15 
16 #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d  /* paste four tokens with underscores, e.g., Pack_PetscInt_8_1 */
17 
18 /* Operations working like s += t. All take (op,s,t): op is the operator or function, s the accumulator, t the incoming value. */
19 #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
20 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
21 #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR; op is not used */
22 #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)               /* plain insertion; op is not used */
23 /* Ref MPI MAXLOC/MINLOC: s,t are {value u, index i} pairs; op is > for MAXLOC, < for MINLOC.
   On a tie in the value field keep the smaller index, otherwise keep the pair whose value wins under op. */
24 #define OP_XLOC(op,s,t) \
25   do {                                       \
26     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
27     else if (!((s).u op (t).u)) s = t;           \
28   } while (0)
29 
30 /* DEF_PackFunc - macro defining a Pack routine
31 
32    Arguments of the macro:
33    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
34    .BS        Block size for vectorization. It is a factor of bsz.
35    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
36 
37    Arguments of the Pack routine:
38    +count     Number of indices in idx[].
39    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
40    .opt       Per-pack optimization plan. NULL means no such plan.
41 .idx       Indices of the entries to be packed.
42    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
43 .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
44    -packed    Address of the packed data.
45  */
46 #define DEF_PackFunc(Type,BS,EQ) \
47   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
48   {                                                                                                          \
49     PetscErrorCode ierr;                                                                                     \
50     const Type     *u = (const Type*)unpacked,*u2;                                                           \
51     Type           *p = (Type*)packed,*p2;                                                                   \
52     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
53     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
54     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
55     PetscFunctionBegin;                                                                                      \
56     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
57     else if (opt) { /* has optimizations available */                                                        \
58       p2 = p;                                                                                                \
59       for (r=0; r<opt->n; r++) { /* r-th piece: a dx[r] x dy[r] x dz[r] block of an array with leading dims X,Y, starting at entry opt->start[r] */ \
60         u2 = u + opt->start[r]*MBS;                                                                          \
61         X  = opt->X[r];                                                                                      \
62         Y  = opt->Y[r];                                                                                      \
63         for (k=0; k<opt->dz[r]; k++)                                                                         \
64           for (j=0; j<opt->dy[r]; j++) { /* each (j,k) row is dx[r] contiguous entries, so memcpy it */      \
65             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
66             p2  += opt->dx[r]*MBS;                                                                           \
67           }                                                                                                  \
68       }                                                                                                      \
69     } else {                                                                                                 \
70       for (i=0; i<count; i++)                                                                                \
71         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
72           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
73             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
74     }                                                                                                        \
75     PetscFunctionReturn(0);                                                                                  \
76   }
77 
78 /* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
79                 and inserts it into a sparse array.
80 
81    Arguments:
82   .Type       Type of the data
83   .BS         Block size for vectorization
84   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
85 
86   Notes:
87    This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
88  */
89 #define DEF_UnpackFunc(Type,BS,EQ)               \
90   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
91   {                                                                                                          \
92     PetscErrorCode ierr;                                                                                     \
93     Type           *u = (Type*)unpacked,*u2;                                                                 \
94     const Type     *p = (const Type*)packed;                                                                 \
95     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
96     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
97     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
98     PetscFunctionBegin;                                                                                      \
99     if (!idx) {                                                                                              \
100       u += start*MBS;                                                                                        \
101       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} /* skip the copy when src and dst are the same buffer */ \
102     } else if (opt) { /* has optimizations available */                                                      \
103       for (r=0; r<opt->n; r++) { /* r-th piece: a dx[r] x dy[r] x dz[r] block of an array with leading dims X,Y (mirror of Pack) */ \
104         u2 = u + opt->start[r]*MBS;                                                                          \
105         X  = opt->X[r];                                                                                      \
106         Y  = opt->Y[r];                                                                                      \
107         for (k=0; k<opt->dz[r]; k++)                                                                         \
108           for (j=0; j<opt->dy[r]; j++) {                                                                     \
109             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
110             p   += opt->dx[r]*MBS;                                                                           \
111           }                                                                                                  \
112       }                                                                                                      \
113     } else {                                                                                                 \
114       for (i=0; i<count; i++)                                                                                \
115         for (j=0; j<M; j++)                                                                                  \
116           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
117     }                                                                                                        \
118     PetscFunctionReturn(0);                                                                                  \
119   }
120 
121 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
122 
123    Arguments:
124   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
125   .Type       Type of the data
126   .BS         Block size for vectorization
127   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
128   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
129   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
130  */
131 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
132   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
133   {                                                                                                          \
134     Type           *u = (Type*)unpacked,*u2;                                                                 \
135     const Type     *p = (const Type*)packed;                                                                 \
136     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
137     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
138     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
139     PetscFunctionBegin;                                                                                      \
140     if (!idx) { /* destination entries are contiguous, starting at entry 'start' */                          \
141       u += start*MBS;                                                                                        \
142       for (i=0; i<count; i++)                                                                                \
143         for (j=0; j<M; j++)                                                                                  \
144           for (k=0; k<BS; k++)                                                                               \
145             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
146     } else if (opt) { /* idx[] has patterns */                                                               \
147       for (r=0; r<opt->n; r++) { /* r-th piece: a dx[r] x dy[r] x dz[r] block of an array with leading dims X,Y */ \
148         u2 = u + opt->start[r]*MBS;                                                                          \
149         X  = opt->X[r];                                                                                      \
150         Y  = opt->Y[r];                                                                                      \
151         for (k=0; k<opt->dz[r]; k++)                                                                         \
152           for (j=0; j<opt->dy[r]; j++) {                                                                     \
153             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
154             p += opt->dx[r]*MBS;                                                                             \
155           }                                                                                                  \
156       }                                                                                                      \
157     } else { /* general sparse indices */                                                                    \
158       for (i=0; i<count; i++)                                                                                \
159         for (j=0; j<M; j++)                                                                                  \
160           for (k=0; k<BS; k++)                                                                               \
161             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
162     }                                                                                                        \
163     PetscFunctionReturn(0);                                                                                  \
164   }
165 
166 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) /* u = Op(u,p) while p receives u's previous values (fetch-and-op); opt is accepted for signature uniformity but unused */ \
167   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
168   {                                                                                                          \
169     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
170     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
171     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
172     const PetscInt MBS = M*BS;                                                                               \
173     PetscFunctionBegin;                                                                                      \
174     for (i=0; i<count; i++) {                                                                                \
175       r = (!idx ? start+i : idx[i])*MBS; /* entry offset in the unpacked data */                             \
176       l = i*MBS;                         /* entry offset in the packed buffer */                             \
177       for (j=0; j<M; j++)                                                                                    \
178         for (k=0; k<BS; k++) {                                                                               \
179           tmp = u[r+j*BS+k];                                                                                 \
180           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
181           p[l+j*BS+k] = tmp; /* return the pre-op value to the caller */                                     \
182         }                                                                                                    \
183     }                                                                                                        \
184     PetscFunctionReturn(0);                                                                                  \
185   }
186 
187 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) /* dst = Op(dst,src) entry by entry, without an intermediate pack buffer */ \
188   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
189   {                                                                                                          \
190     PetscErrorCode ierr;                                                                                     \
191     const Type     *u = (const Type*)src;                                                                    \
192     Type           *v = (Type*)dst;                                                                          \
193     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
194     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
195     const PetscInt MBS = M*BS;                                                                               \
196     PetscFunctionBegin;                                                                                      \
197     if (!srcIdx) { /* src is contiguous; reuse UnpackAndOp with src playing the role of the packed buffer */ \
198       u += srcStart*MBS;                                                                                     \
199       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
200     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
201       u += srcOpt->start[0]*MBS;                                                                             \
202       v += dstStart*MBS;                                                                                     \
203       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
204       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
205         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
206           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
207           v += srcOpt->dx[0]*MBS;                                                                            \
208         }                                                                                                    \
209     } else { /* all other cases */                                                                           \
210       for (i=0; i<count; i++) {                                                                              \
211         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
212         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
213         for (j=0; j<M; j++)                                                                                  \
214           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
215       }                                                                                                      \
216     }                                                                                                        \
217     PetscFunctionReturn(0);                                                                                  \
218   }
219 
220 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) /* leafupdate gets rootdata's old values; rootdata = Op(rootdata,leafdata); rootopt/leafopt are accepted but unused */ \
221   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
222   {                                                                                                          \
223     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
224     const Type     *ldata = (const Type*)leafdata;                                                           \
225     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
226     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
227     const PetscInt MBS = M*BS;                                                                               \
228     PetscFunctionBegin;                                                                                      \
229     for (i=0; i<count; i++) {                                                                                \
230       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
231       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
232       for (j=0; j<M; j++)                                                                                    \
233         for (k=0; k<BS; k++) {                                                                               \
234           lupdate[l+j*BS+k] = rdata[r+j*BS+k]; /* save the old root value before applying the op */          \
235           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
236         }                                                                                                    \
237     }                                                                                                        \
238     PetscFunctionReturn(0);                                                                                  \
239   }
240 
241 /* Pack, Unpack/Fetch ops. Each DEF_Xxx below defines a family of kernels plus a PackInit_Xxx that installs the host (h_) function pointers on a PetscSFLink */
242 #define DEF_Pack(Type,BS,EQ)                                                      \
243   DEF_PackFunc(Type,BS,EQ)                                                        \
244   DEF_UnpackFunc(Type,BS,EQ)                                                      \
245   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
246   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
247     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
248     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
249     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
250   }
251 
252 /* Add, Mult ops */
253 #define DEF_Add(Type,BS,EQ)                                                       \
254   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
255   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
256   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
257   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
258   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
259   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
260   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
261     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
262     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
263     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
264     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
265     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
266     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
267   }
268 
269 /* Max, Min ops. Note the init routine is named PackInit_Compare */
270 #define DEF_Cmp(Type,BS,EQ)                                                       \
271   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
272   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
273   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
274   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
275   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
276     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
277     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
278     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
279     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
280   }
281 
282 /* Logical ops.
283   OP_LXOR ignores its op argument, so the operator passed along with it would ideally be empty;
284   we pass || instead to avoid the compilation warning "empty macro arguments are undefined in ISO C90".
285  */
286 #define DEF_Log(Type,BS,EQ)                                                       \
287   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
288   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
289   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
290   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
291   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
292   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
293   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
294     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
295     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
296     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
297     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
298     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
299     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
300   }
301 
302 /* Bitwise ops */
303 #define DEF_Bit(Type,BS,EQ)                                                       \
304   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
305   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
306   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
307   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
308   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
309   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
310   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
311     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
312     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
313     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
314     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
315     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
316     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
317   }
318 
319 /* Maxloc, Minloc ops. Type must be a {value u, index i} pair; > gives MAXLOC, < gives MINLOC (see OP_XLOC) */
320 #define DEF_Xloc(Type,BS,EQ)                                                      \
321   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
322   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
323   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
324   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
325   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
326     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
327     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
328     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
329     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
330   }
331 
332 #define DEF_IntegerType(Type,BS,EQ) /* integer types support insert, add/mult, max/min, logical and bitwise ops */ \
333   DEF_Pack(Type,BS,EQ)                                                            \
334   DEF_Add(Type,BS,EQ)                                                             \
335   DEF_Cmp(Type,BS,EQ)                                                             \
336   DEF_Log(Type,BS,EQ)                                                             \
337   DEF_Bit(Type,BS,EQ)                                                             \
338   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
339     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
340     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
341     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
342     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
343     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
344   }
345 
346 #define DEF_RealType(Type,BS,EQ) /* real types: like integers but without logical/bitwise ops */ \
347   DEF_Pack(Type,BS,EQ)                                                            \
348   DEF_Add(Type,BS,EQ)                                                             \
349   DEF_Cmp(Type,BS,EQ)                                                             \
350   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
351     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
352     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
353     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
354   }
355 
356 #if defined(PETSC_HAVE_COMPLEX)
357 #define DEF_ComplexType(Type,BS,EQ) /* complex types: only insert and add/mult (no ordering) */ \
358   DEF_Pack(Type,BS,EQ)                                                            \
359   DEF_Add(Type,BS,EQ)                                                             \
360   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
361     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
362     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
363   }
364 #endif
365 
366 #define DEF_DumbType(Type,BS,EQ) /* types we only copy: pack/unpack-insert only, no arithmetic */ \
367   DEF_Pack(Type,BS,EQ)                                                            \
368   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
369     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
370   }
371 
372 /* Maxloc, Minloc. Type must be a {value u, index i} pair struct (see OP_XLOC) */
373 #define DEF_PairType(Type,BS,EQ)                                                  \
374   DEF_Pack(Type,BS,EQ)                                                            \
375   DEF_Xloc(Type,BS,EQ)                                                            \
376   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
377     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
378     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
379   }
380 
/* Instantiate the pack/unpack kernels for each supported basic type. For every type we
   generate block sizes BS = 1,2,4,8, each in an exact (EQ=1, bs==BS) and a multiple
   (EQ=0, bs = BS*n, n>1) variant. PetscSFLinkSetUp_Host() picks the best match. */
DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

/* (value,index) pair types matching MPI_2INT and MPIU_2INT for MAXLOC/MINLOC */
#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u; int i;}           PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)
465 
466 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
467 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
468 {
469   int ierr;
470   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
471   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
472   return MPI_SUCCESS;
473 }
474 #endif
475 
/*
   This routine creates a communication link for the given operation. It first looks up the link cache. If
   there is a free & suitable one, it reuses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we do allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`; in other words, we do lazy memory-allocation.

   The routine also allocates buffers on the CPU when one does not use gpu-aware MPI but data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not initialized until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, based on the fact that they both deal with sparse graphs and
   need to pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode    ierr;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink       *p,link;
  PetscSFDirection  direction;
  MPI_Request       *reqs = NULL;
  PetscBool         match,rootdirect[2],leafdirect[2];
  PetscMemType      rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 so the mtype can index arrays below */
  PetscMemType      leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdata as communication buffers (i.e., skip pack/unpack)
     for the given sf, sfop and op? Possible only when the data is contiguous, and on the
     unpack side only when op is MPIU_REPLACE (no read-modify-write of existing values). */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
         If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
      */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  /* Cache miss: build a new link with host pack routines and a fresh MPI tag */
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* 8 = 2 directions x 2 memory types x 2 direct-flags; one request set per combination */
  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:

#if defined(PETSC_HAVE_DEVICE)
  /* Lazily set up the device (CUDA/Kokkos) pack kernels the first time device memory is involved */
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
    #if defined(PETSC_HAVE_CUDA)
      if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_Cuda(sf,link,unit);CHKERRQ(ierr);}
    #endif
    #if defined(PETSC_HAVE_KOKKOS)
      if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
    #endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) { /* Disguise leafdata as leafbuf */
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on host for buffering data on device in case one does not use gpu-aware MPI */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi  = rootdirect_mpi;
  link->leafdirect_mpi  = leafdirect_mpi;
  link->rootmtype       = rootmtype;
  link->leafmtype       = leafmtype;
  link->rootmtype_mpi   = rootmtype_mpi;
  link->leafmtype_mpi   = leafmtype_mpi;

  /* Push the link onto the in-use list; it moves back to avail via PetscSFLinkReclaim() */
  link->next            = bas->inuse;
  bas->inuse            = link;
  *mylink               = link;
  PetscFunctionReturn(0);
}
656 
/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.

   Any of the four output arguments may be NULL if the caller does not need it.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode       ierr;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt       *rootoffset,*leafoffset;
  PetscMPIInt          n;
  MPI_Aint             disp;
  MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype         unit = link->unit;
  const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    /* Root side: receives in LEAF2ROOT, sends in ROOT2LEAF. Only non-distinguished
       (i >= ndrootranks, i.e. off-process) ranks need MPI requests. */
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; /* Byte offset of rank i's segment in the remote rootbuf */
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    /* Leaf side: sends in LEAF2ROOT, receives in ROOT2LEAF */
    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}
718 
719 
720 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
721 {
722   PetscErrorCode    ierr;
723   PetscSFLink       link,*p;
724   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
725 
726   PetscFunctionBegin;
727   /* Look for types in cache */
728   for (p=&bas->inuse; (link=*p); p=&link->next) {
729     PetscBool match;
730     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
731     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
732       switch (cmode) {
733       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
734       case PETSC_USE_POINTER: break;
735       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
736       }
737       *mylink = link;
738       PetscFunctionReturn(0);
739     }
740   }
741   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
742   PetscFunctionReturn(0);
743 }
744 
745 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
746 {
747   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
748 
749   PetscFunctionBegin;
750   (*link)->rootdata = NULL;
751   (*link)->leafdata = NULL;
752   (*link)->next     = bas->avail;
753   bas->avail        = *link;
754   *link             = NULL;
755   PetscFunctionReturn(0);
756 }
757 
/* Destroy all links chained in 'avail': free persistent MPI requests, duplicated MPI datatypes,
   host and device pack buffers, and any device resources (streams, backend state). */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode    ierr;
  PetscSFLink       link = *avail,next;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* 8 request sets per link, as allocated in PetscSFLinkCreate */

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    /* Only duplicated (non-builtin) datatypes were created by us and must be freed */
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      #if defined(PETSC_HAVE_DEVICE)
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      #endif
    }
  #if defined(PETSC_HAVE_DEVICE)
    /* Backend-specific teardown registered by PetscSFLinkSetUp_Cuda/Kokkos, if any */
    if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
   #if defined(PETSC_HAVE_CUDA)
    if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
   #elif defined(PETSC_HAVE_HIP)
    if (link->stream) {hipError_t  cerr = hipStreamDestroy(link->stream);CHKERRQ(cerr); link->stream = NULL;} /* TODO: CHKERRHIP? */
   #endif
  #endif
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}
795 
796 /* Error out on unsupported overlapped communications */
797 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
798 {
799   PetscErrorCode    ierr;
800   PetscSFLink       link,*p;
801   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
802   PetscBool         match;
803 
804   PetscFunctionBegin;
805   if (!PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
806   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
807      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
808   */
809   if (rootdata || leafdata) {
810     for (p=&bas->inuse; (link=*p); p=&link->next) {
811       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
812       if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
813     }
814   }
815   PetscFunctionReturn(0);
816 }
817 
818 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
819 {
820   PetscFunctionBegin;
821   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
822   PetscFunctionReturn(0);
823 }
824 
825 
/* Inspect the MPI datatype 'unit' and install the matching host pack/unpack kernels on the link.
   Also records the block size (bs), the size in bytes of one unit (unitbytes), the basic MPI type
   the unit is built from (basicunit), and whether the unit is an MPI/PETSc builtin type. */
PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  /* Each compare_contig call returns n>0 if unit is n contiguous copies of the given basic type */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  /* In each branch below: pick the exact-BS kernel if the count matches, otherwise the largest
     BS that divides the count (bigger BS vectorizes better). */
  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) { /* Only reachable when PetscInt != int; otherwise nPetscInt covered it */
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  }  else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    /* Unknown type: treat its bytes as a stream of ints if the size allows, else as chars */
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  /* Non-builtin units are duplicated so the link owns its own datatype handle */
  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}

  link->Memcpy = PetscSFLinkMemcpy_Host;
  PetscFunctionReturn(0);
}
954 
955 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
956 {
957   PetscFunctionBegin;
958   *UnpackAndOp = NULL;
959   if (mtype == PETSC_MEMTYPE_HOST) {
960     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
961     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
962     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
963     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
964     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
965     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
966     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
967     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
968     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
969     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
970     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
971     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
972     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
973   }
974 #if defined(PETSC_HAVE_DEVICE)
975   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
976     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
977     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
978     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
979     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
980     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
981     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
982     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
983     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
984     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
985     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
986     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
987     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
988     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
989   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
990     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
991     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
992     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
993     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
994     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
995     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
996     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
997     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
998     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
999     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
1000     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
1001     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
1002     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
1003   }
1004 #endif
1005   PetscFunctionReturn(0);
1006 }
1007 
1008 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
1009 {
1010   PetscFunctionBegin;
1011   *ScatterAndOp = NULL;
1012   if (mtype == PETSC_MEMTYPE_HOST) {
1013     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
1014     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
1015     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
1016     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
1017     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
1018     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
1019     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
1020     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
1021     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
1022     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
1023     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
1024     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
1025     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
1026   }
1027 #if defined(PETSC_HAVE_DEVICE)
1028   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
1029     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
1030     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
1031     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
1032     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1033     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1034     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1035     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1036     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1037     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1038     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1039     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1040     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1041     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1042   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1043     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1044     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1045     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1046     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1047     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1048     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1049     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1050     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1051     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1052     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1053     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1054     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1055     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1056   }
1057 #endif
1058   PetscFunctionReturn(0);
1059 }
1060 
1061 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1062 {
1063   PetscFunctionBegin;
1064   *FetchAndOp = NULL;
1065   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1066   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1067 #if defined(PETSC_HAVE_DEVICE)
1068   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1069   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1070 #endif
1071   PetscFunctionReturn(0);
1072 }
1073 
1074 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1075 {
1076   PetscFunctionBegin;
1077   *FetchAndOpLocal = NULL;
1078   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1079   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1080 #if defined(PETSC_HAVE_DEVICE)
1081   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1082   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1083 #endif
1084   PetscFunctionReturn(0);
1085 }
1086 
1087 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1088 {
1089   PetscErrorCode ierr;
1090   PetscLogDouble flops;
1091   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1092 
1093 
1094   PetscFunctionBegin;
1095   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1096     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1097 #if defined(PETSC_HAVE_DEVICE)
1098     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1099 #endif
1100     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1101   }
1102   PetscFunctionReturn(0);
1103 }
1104 
/* Log the flops performed when unpacking the leaf buffer into leafdata with a reduction op.
   Counted only when op is a real reduction (not MPIU_REPLACE) on PetscScalars; attributed to the
   GPU when the leaf data resides in device memory. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
1120 
1121 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1122   Input Arguments:
1123   +sf      - The StarForest
1124   .link    - The link
1125   .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
1127   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1128   .buf     - A contiguous buffer to unpack from
1129   -op      - Operation after unpack
1130 
1131   Output Arguments:
1132   .data    - The data to unpack to
1133 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    /* NOTE(review): buf/data are handed straight to MPI_Reduce_local, so callers are expected to
       pass MPI-accessible (host) buffers and host indices here -- confirm against call sites. */
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      /* Sparse destination: reduce the i-th buffer unit into data[indices[i]], one unit per call */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
    } else {
      /* Contiguous destination: all <count> units land at index <start>; reduce them in one call */
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}
1157 
/* Scatter data from src to dst with MPI_Reduce_local; fallback used when no native ScatterAndOp
   routine matches <op>. src entries are addressed by srcIdx[] (or contiguously from srcStart when
   srcIdx is NULL); dst entries likewise by dstIdx[]/dstStart. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    if (!srcIdx) {
      /* Contiguous source: it already looks like a packed buffer, so reuse the unpack fallback */
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
    } else {
      /* Sparse source: reduce unit by unit; dst may be indexed contiguously or through dstIdx[] */
      for (i=0; i<count; i++) {
        disp = dstIdx? dstIdx[i] : dstStart + i;
        ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
      }
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}
1179 
1180 /*=============================================================================
1181               Pack/Unpack/Fetch/Scatter routines
1182  ============================================================================*/
1183 
1184 /* Pack rootdata to rootbuf
1185   Input Arguments:
1186   + sf       - The SF this packing works on.
1187   . link     - It gives the memtype of the roots and also provides root buffer.
1188   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1189   - rootdata - Where to read the roots.
1190 
1191   Notes:
1192   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1193   in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not)
1194  */
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  /* Synchronize the device before packing, so rootdata is complete (remote path only) */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* Stage the root buffer to the host when MPI cannot access device memory directly */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1220 
1221 /* Pack leafdata to leafbuf */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  /* Synchronize the device before packing, so leafdata is complete (remote path only) */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* Stage the leaf buffer to the host when MPI cannot access device memory directly */
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1246 
1247 /* Unpack rootbuf to rootdata */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* If MPI was not GPU-aware, the received data sits in a host buffer; stage it back first */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    /* bas->rootdups[scope] requests atomic kernels when root entries appear multiple times */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else {
      /* No native kernel for this op: fall back to MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1276 
1277 /* Unpack leafbuf to leafdata */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* If MPI was not GPU-aware, the received data sits in a host buffer; stage it back first */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    /* sf->leafdups[scope] requests atomic kernels when leaf entries appear multiple times */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else {
      /* No native kernel for this op: fall back to MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1305 
1306 /* FetchAndOp rootdata with rootbuf */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode     ierr;
  const PetscInt     *rootindices = NULL;
  PetscInt           count,start;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
  PetscMemType       rootmtype = link->rootmtype;
  PetscSFPackOpt     opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* If MPI was not GPU-aware, the received data sits in a host buffer; stage it back first */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* The fetched old values in rootbuf will be sent back; stage them for MPI if needed */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1334 
1335 /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  /* Nothing to do when there is no local communication */
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* Uncommon case */
     /* The local communication has to go through pack and unpack: pack roots, memcpy the packed
        buffer across memory types, then unpack into leaves */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    /* Same memory type: scatter roots directly into leaves, no intermediate buffer */
    ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
    } else {
      /* No native kernel for this op: fall back to MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1367 
1368 /* Reduce leafdata to rootdata locally */
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  /* Nothing to do when there is no local communication */
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack: pack leaves, memcpy the packed
       buffer across memory types, then unpack into roots */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    /* Same memory type: scatter leaves directly into roots; atomics when roots have duplicates */
    ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
    } else {
      /* No native kernel for this op: fall back to MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1400 
1401 /* Fetch rootdata to leafdata and leafupdate locally */
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  /* Nothing to do when there is no local communication */
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
   /* The local communication has to go through pack and unpack, which is not supported for
      fetch-and-op (it needs the old root values atomically) */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    /* Fetch old root values into leafupdate while reducing leafdata into rootdata */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1425 
1426 /*
  Create per-rank pack/unpack optimizations based on index patterns
1428 
1429    Input Parameters:
1430   +  n       - Number of destination ranks
  .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1432   -  idx     - [*]   Array storing indices
1433 
1434    Output Parameters:
1435   +  opt     - Pack optimizations. NULL if no optimizations.
1436 */
/* For each rank, try to recognize its indices as a 3D submatrix pattern
     idx = start + X*Y*k + X*j + i,  0<=i<dx, 0<=j<dy, 0<=k<dz,
   i.e., a dx x dy x dz block embedded in an X x Y x (...) array, so packing can use strided
   loops instead of indirect addressing. If any rank's indices do not fit such a pattern, no
   optimization object is produced (*out = NULL). */
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  /* All per-rank fields are carved out of one contiguous allocation (opt->array); the layout is
     [n, offset[0..n], start[0..n-1], dx[], dy[], dz[], X[], Y[]] */
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n   + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  for (r=0; r<n; r++) { /* For each destination rank */
    m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension: count how many leading indices are consecutive */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }

    dydz = m/dx;
    /* X is the row stride of the embedding array; when everything fit in one x-walk, X = dx */
    X    = dydz > 1 ? (idx[p]-start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in Y dimension: each y-step is an x-walk of length dx at stride X */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension; /* A clean break at an x-walk boundary: current dy is final; move on to Z */
        }
      }
    }
    /* Falling through here means the whole remainder was consumed by the Y search (dy == dydz) */

Z_dimension:
    dz = m/(dx*dy);
    /* Y is the plane stride (in rows) of the embedding array; when there is no z extent, Y = dy */
    Y  = dz > 1 ? (idx[p]-start)/X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable (or there are no ranks), free arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    /* Rebuild a zero-based offset[] from the recognized block sizes */
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}
1513 
1514 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1515 {
1516   PetscErrorCode ierr;
1517   PetscSFPackOpt opt = *out;
1518 
1519   PetscFunctionBegin;
1520   if (opt) {
1521     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
1522     ierr = PetscFree(opt);CHKERRQ(ierr);
1523     *out = NULL;
1524   }
1525   PetscFunctionReturn(0);
1526 }
1527 
/* PetscSFSetUpPackFields - precompute pack/unpack metadata for leaves (on sf) and roots (on bas)

   For both the local ([0], self/distinguished ranks) and remote ([1]) partitions this routine:
   - records buffer lengths and the first index (start),
   - detects whether the index list is contiguous (best case: pack/unpack is a memcpy-like walk),
   - if not contiguous, tries a per-rank 3D-strided optimization via PetscSFCreatePackOpt,
   - on device builds, checks for duplicate indices so unpack kernels can skip atomics.
*/
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  /* roffset[ndranks] splits rmine[] into a self part (first ndranks ranks) and a remote part */
  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  /* Guard against empty partitions before reading the first index */
  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous? (same analysis as above, on irootloc/ioffset) */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_DEVICE)
    /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
  if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
#endif

  PetscFunctionReturn(0);
}
1590 
1591 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1592 {
1593   PetscErrorCode ierr;
1594   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1595   PetscInt       i;
1596 
1597   PetscFunctionBegin;
1598   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1599     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1600     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1601 #if defined(PETSC_HAVE_DEVICE)
1602     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1603     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1604 #endif
1605   }
1606   PetscFunctionReturn(0);
1607 }
1608