xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision efbe7e8a80d07327753dbe0b33efee01e046af3f)
1 #include "petsc/private/sfimpl.h"
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
6 
7 #if defined(PETSC_HAVE_CUDA)
8 #include <cuda_runtime.h>
9 #include <petsccublas.h>
10 #endif
11 /*
12  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
13  * therefore we pack data types manually. This file defines packing routines for the standard data types.
14  */
15 
16 #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d /* paste four tokens into a_b_c_d, used to name type/blocksize-specific routines, e.g., Pack_PetscInt_2_1 */
17 
18 /* Operations working like s += t. All take the operator as the first argument; some ignore it (noted below) */
19 #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
20 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
21 #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR; the op argument is unused */
22 #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)               /* plain insert; the op argument is unused */
23 /* Ref MPI MAXLOC/MINLOC: s,t are (value .u, index .i) pairs. On a tie in .u keep the smaller .i; otherwise keep the pair whose .u wins under op (> for MAXLOC, < for MINLOC) */
24 #define OP_XLOC(op,s,t) \
25   do {                                       \
26     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
27     else if (!((s).u op (t).u)) s = t;           \
28   } while (0)
29 
30 /* DEF_PackFunc - macro defining a Pack routine

32    Arguments of the macro:
33    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
34    .BS        Block size for vectorization. It is a factor of bs.
35    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

37    Arguments of the Pack routine:
38    +count     Number of indices in idx[].
39    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
40    .opt       Per-pack optimization plan. NULL means no such plan.
41    .idx       Indices of entries to be packed.
42    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
43    .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
44    -packed    Address of the packed data.
45  */
46 #define DEF_PackFunc(Type,BS,EQ) \
47   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
48   {                                                                                                          \
49     PetscErrorCode ierr;                                                                                     \
50     const Type     *u = (const Type*)unpacked,*u2;                                                           \
51     Type           *p = (Type*)packed,*p2;                                                                   \
52     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
53     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
54     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
55     PetscFunctionBegin;                                                                                      \
56     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
57     else if (opt) { /* has optimizations available: copy each 3D sub-block (dx x dy x dz) row by row */      \
58       p2 = p;                                                                                                \
59       for (r=0; r<opt->n; r++) {                                                                             \
60         u2 = u + opt->start[r]*MBS;                                                                          \
61         X  = opt->X[r];                                                                                      \
62         Y  = opt->Y[r];                                                                                      \
63         for (k=0; k<opt->dz[r]; k++)                                                                         \
64           for (j=0; j<opt->dy[r]; j++) {                                                                     \
65             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
66             p2  += opt->dx[r]*MBS;                                                                           \
67           }                                                                                                  \
68       }                                                                                                      \
69     } else { /* general sparse gather */                                                                     \
70       for (i=0; i<count; i++)                                                                                \
71         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
72           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
73             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
74     }                                                                                                        \
75     PetscFunctionReturn(0);                                                                                  \
76   }
77 
78 /* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
79                 and inserts into a sparse array.

81    Arguments:
82   .Type       Type of the data
83   .BS         Block size for vectorization
84   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.

86   Notes:
87    This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
88  */
89 #define DEF_UnpackFunc(Type,BS,EQ)               \
90   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
91   {                                                                                                          \
92     PetscErrorCode ierr;                                                                                     \
93     Type           *u = (Type*)unpacked,*u2;                                                                 \
94     const Type     *p = (const Type*)packed;                                                                 \
95     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
96     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
97     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
98     PetscFunctionBegin;                                                                                      \
99     if (!idx) {                                                                                              \
100       u += start*MBS;                                                                                        \
101       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} /* skip the copy when src and dest alias */ \
102     } else if (opt) { /* has optimizations available: fill each 3D sub-block row by row, advancing p */      \
103       for (r=0; r<opt->n; r++) {                                                                             \
104         u2 = u + opt->start[r]*MBS;                                                                          \
105         X  = opt->X[r];                                                                                      \
106         Y  = opt->Y[r];                                                                                      \
107         for (k=0; k<opt->dz[r]; k++)                                                                         \
108           for (j=0; j<opt->dy[r]; j++) {                                                                     \
109             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
110             p   += opt->dx[r]*MBS;                                                                           \
111           }                                                                                                  \
112       }                                                                                                      \
113     } else { /* general sparse scatter */                                                                    \
114       for (i=0; i<count; i++)                                                                                \
115         for (j=0; j<M; j++)                                                                                  \
116           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
117     }                                                                                                        \
118     PetscFunctionReturn(0);                                                                                  \
119   }
120 
121 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert

123    Arguments:
124   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
125   .Type       Type of the data
126   .BS         Block size for vectorization
127   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
128   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
129   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR, OP_XLOC
130  */
131 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
132   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
133   {                                                                                                          \
134     Type           *u = (Type*)unpacked,*u2;                                                                 \
135     const Type     *p = (const Type*)packed;                                                                 \
136     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
137     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
138     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
139     PetscFunctionBegin;                                                                                      \
140     if (!idx) { /* contiguous destination starting at entry <start> */                                       \
141       u += start*MBS;                                                                                        \
142       for (i=0; i<count; i++)                                                                                \
143         for (j=0; j<M; j++)                                                                                  \
144           for (k=0; k<BS; k++)                                                                               \
145             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
146     } else if (opt) { /* idx[] has patterns: apply op over each 3D sub-block, advancing p */                 \
147       for (r=0; r<opt->n; r++) {                                                                             \
148         u2 = u + opt->start[r]*MBS;                                                                          \
149         X  = opt->X[r];                                                                                      \
150         Y  = opt->Y[r];                                                                                      \
151         for (k=0; k<opt->dz[r]; k++)                                                                         \
152           for (j=0; j<opt->dy[r]; j++) {                                                                     \
153             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
154             p += opt->dx[r]*MBS;                                                                             \
155           }                                                                                                  \
156       }                                                                                                      \
157     } else { /* general sparse destination */                                                                \
158       for (i=0; i<count; i++)                                                                                \
159         for (j=0; j<M; j++)                                                                                  \
160           for (k=0; k<BS; k++)                                                                               \
161             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
162     }                                                                                                        \
163     PetscFunctionReturn(0);                                                                                  \
164   }
165 
/* DEF_FetchAndOp - macro defining a FetchAndOp routine: for each selected entry, the old value of
   unpacked[] is returned in packed[] while the incoming packed value is combined into unpacked[] with Op.
   Arguments have the same meaning as in DEF_UnpackAndOp. No opt fast path is provided here. */
166 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
167   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
168   {                                                                                                          \
169     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
170     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
171     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
172     const PetscInt MBS = M*BS;                                                                               \
173     PetscFunctionBegin;                                                                                      \
174     for (i=0; i<count; i++) {                                                                                \
175       r = (!idx ? start+i : idx[i])*MBS; /* offset into unpacked, contiguous or indexed */                   \
176       l = i*MBS;                         /* offset into packed, always contiguous */                         \
177       for (j=0; j<M; j++)                                                                                    \
178         for (k=0; k<BS; k++) {                                                                               \
179           tmp = u[r+j*BS+k]; /* save old value so the op and the fetch can share one pass */                 \
180           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
181           p[l+j*BS+k] = tmp;                                                                                 \
182         }                                                                                                    \
183     }                                                                                                        \
184     PetscFunctionReturn(0);                                                                                  \
185   }
186 
/* DEF_ScatterAndOp - macro defining a ScatterAndOp routine that combines entries of src directly into dst
   with Op, without an intermediate pack buffer. Each side is addressed either contiguously (Start), via an
   optimization plan (Opt) or via an index list (Idx). Three cases are handled: contiguous src delegates to
   the matching UnpackAndOp routine; 3D-pattern src with contiguous dst is walked sub-block by sub-block;
   everything else goes through the general per-entry loop. */
187 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
188   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
189   {                                                                                                          \
190     PetscErrorCode ierr;                                                                                     \
191     const Type     *u = (const Type*)src;                                                                    \
192     Type           *v = (Type*)dst;                                                                          \
193     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
194     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
195     const PetscInt MBS = M*BS;                                                                               \
196     PetscFunctionBegin;                                                                                      \
197     if (!srcIdx) { /* src is contiguous */                                                                   \
198       u += srcStart*MBS;                                                                                     \
199       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
200     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
201       u += srcOpt->start[0]*MBS;                                                                             \
202       v += dstStart*MBS;                                                                                     \
203       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
204       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
205         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
206           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
207           v += srcOpt->dx[0]*MBS;                                                                            \
208         }                                                                                                    \
209     } else { /* all other cases */                                                                           \
210       for (i=0; i<count; i++) {                                                                              \
211         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
212         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
213         for (j=0; j<M; j++)                                                                                  \
214           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
215       }                                                                                                      \
216     }                                                                                                        \
217     PetscFunctionReturn(0);                                                                                  \
218   }
219 
/* DEF_FetchAndOpLocal - macro defining a FetchAndOpLocal routine for root/leaf pairs within one process:
   for each (root,leaf) pair, the old root value is stored into leafupdate, then the root is combined
   with the leaf value via Op. rootdata is read-modify-written; leafdata is read-only. */
220 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
221   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
222   {                                                                                                          \
223     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
224     const Type     *ldata = (const Type*)leafdata;                                                           \
225     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
226     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
227     const PetscInt MBS = M*BS;                                                                               \
228     PetscFunctionBegin;                                                                                      \
229     for (i=0; i<count; i++) {                                                                                \
230       r = (rootidx ? rootidx[i] : rootstart+i)*MBS; /* root offset, indexed or contiguous */                 \
231       l = (leafidx ? leafidx[i] : leafstart+i)*MBS; /* leaf offset, indexed or contiguous */                 \
232       for (j=0; j<M; j++)                                                                                    \
233         for (k=0; k<BS; k++) {                                                                               \
234           lupdate[l+j*BS+k] = rdata[r+j*BS+k]; /* fetch old root value before updating it */                 \
235           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
236         }                                                                                                    \
237     }                                                                                                        \
238     PetscFunctionReturn(0);                                                                                  \
239   }
240 
241 /* Pack plus the Insert (assignment) flavors of Unpack/Scatter; installed for every supported type */
242 #define DEF_Pack(Type,BS,EQ)                                                      \
243   DEF_PackFunc(Type,BS,EQ)                                                        \
244   DEF_UnpackFunc(Type,BS,EQ)                                                      \
245   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
246   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
247     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
248     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
249     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
250   }
251 
252 /* Add, Mult ops. Note the Fetch variants are only generated for Add */
253 #define DEF_Add(Type,BS,EQ)                                                       \
254   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
255   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
256   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
257   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
258   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
259   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
260   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
261     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
262     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
263     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
264     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
265     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
266     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
267   }
268 
269 /* Max, Min ops (the generated initializer is named PackInit_Compare) */
270 #define DEF_Cmp(Type,BS,EQ)                                                       \
271   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
272   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
273   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
274   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
275   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
276     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
277     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
278     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
279     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
280   }
281 
282 /* Logical ops.
283   The operator argument paired with OP_LXOR is unused; we pass || (rather than leaving the argument
284   empty) only to avoid the compilation warning "empty macro arguments are undefined in ISO C90"
285  */
286 #define DEF_Log(Type,BS,EQ)                                                       \
287   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
288   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
289   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
290   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
291   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
292   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
293   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
294     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
295     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
296     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
297     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
298     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
299     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
300   }
301 
302 /* Bitwise ops (only instantiated for integer types, see DEF_IntegerType) */
303 #define DEF_Bit(Type,BS,EQ)                                                       \
304   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
305   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
306   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
307   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
308   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
309   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
310   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
311     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
312     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
313     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
314     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
315     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
316     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
317   }
318 
319 /* Maxloc, Minloc ops for (value,index) pair types. The generated routines reuse the UnpackAndMax/Min
320    and ScatterAndMax/Min names (via DEF_UnpackAndOp/DEF_ScatterAndOp with OP_XLOC) but are installed
321    into the *loc function pointers */
320 #define DEF_Xloc(Type,BS,EQ)                                                      \
321   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
322   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
323   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
324   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
325   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
326     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
327     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
328     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
329     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
330   }
331 
/* Integer types support the full op set: insert, arithmetic, comparison, logical and bitwise */
332 #define DEF_IntegerType(Type,BS,EQ)                                               \
333   DEF_Pack(Type,BS,EQ)                                                            \
334   DEF_Add(Type,BS,EQ)                                                             \
335   DEF_Cmp(Type,BS,EQ)                                                             \
336   DEF_Log(Type,BS,EQ)                                                             \
337   DEF_Bit(Type,BS,EQ)                                                             \
338   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
339     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
340     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
341     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
342     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
343     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
344   }
345 
/* Real types support insert, arithmetic and comparison ops (no logical/bitwise) */
346 #define DEF_RealType(Type,BS,EQ)                                                  \
347   DEF_Pack(Type,BS,EQ)                                                            \
348   DEF_Add(Type,BS,EQ)                                                             \
349   DEF_Cmp(Type,BS,EQ)                                                             \
350   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
351     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
352     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
353     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
354   }
355 
356 #if defined(PETSC_HAVE_COMPLEX)
/* Complex types support insert and arithmetic ops only (complex numbers have no ordering) */
357 #define DEF_ComplexType(Type,BS,EQ)                                               \
358   DEF_Pack(Type,BS,EQ)                                                            \
359   DEF_Add(Type,BS,EQ)                                                             \
360   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
361     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
362     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
363   }
364 #endif
365 
/* "Dumb" types support insert only — no arithmetic/comparison ops are generated for them */
366 #define DEF_DumbType(Type,BS,EQ)                                                  \
367   DEF_Pack(Type,BS,EQ)                                                            \
368   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
369     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
370   }
371 
372 /* Pair (value,index) types: insert plus Maxloc/Minloc ops */
373 #define DEF_PairType(Type,BS,EQ)                                                  \
374   DEF_Pack(Type,BS,EQ)                                                            \
375   DEF_Xloc(Type,BS,EQ)                                                            \
376   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
377     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
378     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
379   }
380 
/* Instantiate pack/unpack kernels for the supported basic types. For each type, specializations
   are generated for block sizes BS = 1,2,4,8 with EQ=1 (bs == BS exactly) and EQ=0 (bs is a
   multiple of BS). PetscSFLinkSetUp_Host() below tries the bigger BS first. */
DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

/* Pair types for MPI_MAXLOC/MPI_MINLOC, corresponding to MPI_2INT and MPIU_2INT */
#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u; int i;}           PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)
465 
#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
/* Fallback for MPI implementations lacking MPI_Type_dup(): a committed contiguous type of a
   single element is equivalent to the input datatype for communication purposes */
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif
475 
476 /*
477    The routine Creates a communication link for the given operation. It first looks up its link cache. If
478    there is a free & suitable one, it uses it. Otherwise it creates a new one.
479 
480    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
481    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
482    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
483    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
484 
485    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
486 
487    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
488 
489    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
490    need pack/unpack data.
491 */
492 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
493 {
494   PetscErrorCode    ierr;
495   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
496   PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
497   PetscSFLink       *p,link;
498   PetscSFDirection  direction;
499   MPI_Request       *reqs = NULL;
500   PetscBool         match,rootdirect[2],leafdirect[2];
501   PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
502   PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
503 
504   PetscFunctionBegin;
505   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
506 
507   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
508   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
509     if (sfop == PETSCSF_BCAST) {
510       rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
511       leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
512     } else if (sfop == PETSCSF_REDUCE) {
513       leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
514       rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
515     } else { /* PETSCSF_FETCH */
516       rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
517       leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
518     }
519   }
520 
521   if (sf->use_gpu_aware_mpi) {
522     rootmtype_mpi = rootmtype;
523     leafmtype_mpi = leafmtype;
524   } else {
525     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
526   }
527   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */
528   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
529   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
530 
531   direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
532   nrootreqs = bas->nrootreqs;
533   nleafreqs = sf->nleafreqs;
534 
535   /* Look for free links in cache */
536   for (p=&bas->avail; (link=*p); p=&link->next) {
537     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
538     if (match) {
539       /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
540          If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
541       */
542       if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
543         reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
544         for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
545         link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
546       }
547       if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
548         reqs = link->leafreqs[direction][leafmtype][1];
549         for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
550         link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
551       }
552       *p = link->next; /* Remove from available list */
553       goto found;
554     }
555   }
556 
557   ierr = PetscNew(&link);CHKERRQ(ierr);
558   ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
559   ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */
560 
561   nreqs = (nrootreqs+nleafreqs)*8;
562   ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
563   for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
564 
565   for (i=0; i<2; i++) { /* Two communication directions */
566     for (j=0; j<2; j++) { /* Two memory types */
567       for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
568         link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
569         link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
570       }
571     }
572   }
573 
574 found:
575 
576 #if defined(PETSC_HAVE_DEVICE)
577   if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {
578     #if defined(PETSC_HAVE_CUDA)
579       if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_Cuda(sf,link,unit);CHKERRQ(ierr);}
580     #endif
581     #if defined(PETSC_HAVE_KOKKOS)
582       if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
583     #endif
584   }
585 #endif
586 
587   /* Allocate buffers along root/leafdata */
588   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
589     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
590     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
591     if (bas->rootbuflen[i]) {
592       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
593         link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
594       } else { /* Have to have a separate rootbuf */
595         if (!link->rootbuf_alloc[i][rootmtype]) {
596           ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
597         }
598         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
599       }
600     }
601 
602     if (sf->leafbuflen[i]) {
603       if (leafdirect[i]) {
604         link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
605       } else {
606         if (!link->leafbuf_alloc[i][leafmtype]) {
607           ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
608         }
609         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
610       }
611     }
612   }
613 
614 #if defined(PETSC_HAVE_DEVICE)
615   /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */
616   if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
617     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
618       ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
619     }
620     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
621   }
622   if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
623     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
624       ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
625     }
626     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
627   }
628 #endif
629 
630   /* Set `current` state of the link. They may change between different SF invocations with the same link */
631   if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
632     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
633     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
634   }
635 
636   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
637   link->leafdata = leafdata;
638   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
639     link->rootdirect[i] = rootdirect[i];
640     link->leafdirect[i] = leafdirect[i];
641   }
642   link->rootdirect_mpi  = rootdirect_mpi;
643   link->leafdirect_mpi  = leafdirect_mpi;
644   link->rootmtype       = rootmtype;
645   link->leafmtype       = leafmtype;
646   link->rootmtype_mpi   = rootmtype_mpi;
647   link->leafmtype_mpi   = leafmtype_mpi;
648 
649   link->next            = bas->inuse;
650   bas->inuse            = link;
651   *mylink               = link;
652   PetscFunctionReturn(0);
653 }
654 
/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
   Any of rootbuf/leafbuf/rootreqs/leafreqs may be NULL if the caller does not need it.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode       ierr;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt       *rootoffset,*leafoffset;
  PetscMPIInt          n;
  MPI_Aint             disp;
  MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype         unit = link->unit;
  const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    /* One request per remote root rank; ranks in [0,ndrootranks) are skipped (presumably handled
       without MPI - confirm against PetscSFGetRootInfo_Basic()) */
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) { /* Roots receive in a reduce */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; /* Byte offset of this rank's chunk in the remote rootbuf */
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */ /* Roots send in a bcast */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    /* Mirror of the above for the leaf side */
    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) { /* Leaves send in a reduce */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */ /* Leaves receive in a bcast */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}
716 
717 
718 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
719 {
720   PetscErrorCode    ierr;
721   PetscSFLink       link,*p;
722   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
723 
724   PetscFunctionBegin;
725   /* Look for types in cache */
726   for (p=&bas->inuse; (link=*p); p=&link->next) {
727     PetscBool match;
728     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
729     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
730       switch (cmode) {
731       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
732       case PETSC_USE_POINTER: break;
733       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
734       }
735       *mylink = link;
736       PetscFunctionReturn(0);
737     }
738   }
739   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
740   PetscFunctionReturn(0);
741 }
742 
743 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
744 {
745   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
746 
747   PetscFunctionBegin;
748   (*link)->rootdata = NULL;
749   (*link)->leafdata = NULL;
750   (*link)->next     = bas->avail;
751   bas->avail        = *link;
752   *link             = NULL;
753   PetscFunctionReturn(0);
754 }
755 
/* Destroy all links chained in 'avail', releasing their MPI types/requests, buffers and device resources */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode    ierr;
  PetscSFLink       link = *avail,next;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* Must match the allocation in PetscSFLinkCreate() */

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} /* Non-builtin units were MPI_Type_dup'ed in SetUp_Host */
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      #if defined(PETSC_HAVE_DEVICE)
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      #endif
    }
  #if defined(PETSC_HAVE_DEVICE)
    /* Backend-specific teardown first, then the stream owned by the link */
    if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
   #if defined(PETSC_HAVE_CUDA)
    if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
   #elif defined(PETSC_HAVE_HIP)
    if (link->stream) {hipError_t  cerr = hipStreamDestroy(link->stream);CHKERRQ(cerr); link->stream = NULL;} /* TODO: CHKERRHIP? */
   #endif
  #endif
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}
793 
794 /* Error out on unsupported overlapped communications */
795 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
796 {
797   PetscErrorCode    ierr;
798   PetscSFLink       link,*p;
799   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
800   PetscBool         match;
801 
802   PetscFunctionBegin;
803   if (!PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
804   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
805      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
806   */
807   if (rootdata || leafdata) {
808     for (p=&bas->inuse; (link=*p); p=&link->next) {
809       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
810       if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
811     }
812   }
813   PetscFunctionReturn(0);
814 }
815 
816 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
817 {
818   PetscFunctionBegin;
819   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
820   PetscFunctionReturn(0);
821 }
822 
823 
/* Decode the MPI datatype `unit` and install the matching host pack/unpack kernels on the link.
   Also sets link->bs (entry size in basic units), link->unitbytes, link->basicunit and link->unit.
   Unknown types fall back to the "dumb" char/int kernels that support pack/unpack only. */
PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  /* Each nX is set to n when unit is equivalent to n contiguous Xs, otherwise 0 */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is MPI builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    /* Pick the widest vectorized kernel that divides the block size; exact match (EQ=1) preferred */
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) { /* Only reachable when MPI_INT differs from MPIU_INT, i.e., 64-bit indices */
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  }  else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    /* Unrecognized type: treat as a dumb byte/int stream based on its extent */
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  /* Non-builtin units must be dup'ed since the caller may free its datatype; freed in PetscSFLinkDestroy() */
  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}

  link->Memcpy = PetscSFLinkMemcpy_Host;
  PetscFunctionReturn(0);
}
952 
953 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
954 {
955   PetscFunctionBegin;
956   *UnpackAndOp = NULL;
957   if (mtype == PETSC_MEMTYPE_HOST) {
958     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
959     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
960     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
961     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
962     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
963     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
964     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
965     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
966     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
967     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
968     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
969     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
970     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
971   }
972 #if defined(PETSC_HAVE_DEVICE)
973   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
974     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
975     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
976     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
977     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
978     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
979     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
980     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
981     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
982     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
983     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
984     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
985     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
986     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
987   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
988     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
989     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
990     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
991     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
992     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
993     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
994     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
995     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
996     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
997     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
998     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
999     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
1000     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
1001   }
1002 #endif
1003   PetscFunctionReturn(0);
1004 }
1005 
1006 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
1007 {
1008   PetscFunctionBegin;
1009   *ScatterAndOp = NULL;
1010   if (mtype == PETSC_MEMTYPE_HOST) {
1011     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
1012     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
1013     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
1014     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
1015     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
1016     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
1017     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
1018     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
1019     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
1020     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
1021     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
1022     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
1023     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
1024   }
1025 #if defined(PETSC_HAVE_DEVICE)
1026   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
1027     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
1028     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
1029     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
1030     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1031     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1032     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1033     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1034     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1035     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1036     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1037     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1038     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1039     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1040   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1041     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1042     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1043     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1044     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1045     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1046     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1047     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1048     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1049     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1050     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1051     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1052     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1053     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1054   }
1055 #endif
1056   PetscFunctionReturn(0);
1057 }
1058 
1059 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1060 {
1061   PetscFunctionBegin;
1062   *FetchAndOp = NULL;
1063   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1064   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1065 #if defined(PETSC_HAVE_DEVICE)
1066   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1067   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1068 #endif
1069   PetscFunctionReturn(0);
1070 }
1071 
1072 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1073 {
1074   PetscFunctionBegin;
1075   *FetchAndOpLocal = NULL;
1076   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1077   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1078 #if defined(PETSC_HAVE_DEVICE)
1079   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1080   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1081 #endif
1082   PetscFunctionReturn(0);
1083 }
1084 
1085 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1086 {
1087   PetscErrorCode ierr;
1088   PetscLogDouble flops;
1089   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1090 
1091 
1092   PetscFunctionBegin;
1093   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1094     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1095 #if defined(PETSC_HAVE_DEVICE)
1096     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1097 #endif
1098     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1099   }
1100   PetscFunctionReturn(0);
1101 }
1102 
1103 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1104 {
1105   PetscLogDouble flops;
1106   PetscErrorCode ierr;
1107 
1108   PetscFunctionBegin;
1109   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1110     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1111 #if defined(PETSC_HAVE_DEVICE)
1112     if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1113 #endif
1114     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1115   }
1116   PetscFunctionReturn(0);
1117 }
1118 
1119 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1120   Input Arguments:
1121   +sf      - The StarForest
1122   .link    - The link
1123   .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
1125   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1126   .buf     - A contiguous buffer to unpack from
1127   -op      - Operation after unpack
1128 
1129   Output Arguments:
1130   .data    - The data to unpack to
1131 */
1132 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1133 {
1134   PetscFunctionBegin;
1135 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1136   {
1137     PetscErrorCode ierr;
1138     PetscInt       i;
1139     PetscMPIInt    n;
1140     if (indices) {
1141       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
1142          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
1143       */
1144       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
1145     } else {
1146       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
1147       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
1148     }
1149   }
1150 #else
1151   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1152 #endif
1153   PetscFunctionReturn(0);
1154 }
1155 
1156 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1157 {
1158   PetscFunctionBegin;
1159 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1160   {
1161     PetscErrorCode ierr;
1162     PetscInt       i,disp;
1163     if (!srcIdx) {
1164       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
1165     } else {
1166       for (i=0; i<count; i++) {
1167         disp = dstIdx? dstIdx[i] : dstStart + i;
1168         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
1169       }
1170     }
1171   }
1172 #else
1173   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1174 #endif
1175   PetscFunctionReturn(0);
1176 }
1177 
1178 /*=============================================================================
1179               Pack/Unpack/Fetch/Scatter routines
1180  ============================================================================*/
1181 
/* Pack rootdata to rootbuf
  Input Arguments:
  + sf       - The SF this packing works on.
  . link     - It gives the memtype of the roots and also provides root buffer.
  . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
  - rootdata - Where to read the roots.

  Notes:
  When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
  in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
 */
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  /* For remote communication, synchronize with the device before reading rootdata it may still be producing */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* Stage the packed roots into a host buffer when MPI cannot access device memory directly */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    /* Make sure any asynchronous pack/copy has finished before MPI reads the buffer */
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1218 
/* Pack leafdata to leafbuf; leaf-side counterpart of PetscSFLinkPackRootData */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  /* For remote communication, synchronize with the device before reading leafdata it may still be producing */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* Stage the packed leaves into a host buffer when MPI cannot access device memory directly */
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    /* Make sure any asynchronous pack/copy has finished before MPI reads the buffer */
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1244 
/* Unpack rootbuf to rootdata, combining unpacked entries with existing roots via <op> */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* NOTE(review): PETSC_FALSE presumably selects the host2device direction (opposite of the device2host
     copy done when packing) -- confirm against PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else {
      /* No precompiled kernel for this op (e.g., a user-defined MPI_Op): fall back to MPI_Reduce_local,
         which requires host-resident indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1274 
/* Unpack leafbuf to leafdata, combining unpacked entries with existing leaves via <op> */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* NOTE(review): PETSC_FALSE presumably selects the host2device direction -- confirm against
     PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else {
      /* No precompiled kernel for this op (e.g., a user-defined MPI_Op): fall back to MPI_Reduce_local,
         which requires host-resident indices */
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1303 
/* FetchAndOp rootdata with rootbuf: combine rootbuf into rootdata via <op>, presumably leaving the
   pre-op root values in rootbuf for sending back to leaves -- see the FetchAndAdd kernels */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode     ierr;
  const PetscInt     *rootindices = NULL;
  PetscInt           count,start;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
  PetscMemType       rootmtype = link->rootmtype;
  PetscSFPackOpt     opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* Bring the received roots into the right memory space when MPI used a host staging buffer */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* The updated rootbuf must go back over MPI, so stage it device2host if MPI is not GPU-aware */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1332 
/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* No local edges, nothing to do */
  if (rootmtype != leafmtype) { /* Uncommon case */
     /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    /* Same memory space: scatter roots straight into leaves, skipping the intermediate buffers */
    ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
    } else {
      /* No kernel for this op (e.g., a user-defined MPI_Op): use MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1365 
/* Reduce leafdata to rootdata locally; mirror image of PetscSFLinkBcastAndOpLocal */
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* No local edges, nothing to do */
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    /* Same memory space: scatter leaves straight into roots, skipping the intermediate buffers */
    ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
    } else {
      /* No kernel for this op (e.g., a user-defined MPI_Op): use MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1398 
/* Fetch rootdata to leafdata and leafupdate locally: combine leafdata into roots via <op> while
   returning (in leafupdate) the root values seen by each leaf */
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* No local edges, nothing to do */
  if (rootmtype != leafmtype) {
   /* The local communication has to go through pack and unpack */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
    /* bas->rootdups selects the atomic kernel when multiple leaves may hit the same root */
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1423 
1424 /*
1425   Create per-rank pack/unpack optimizations based on indice patterns
1426 
1427    Input Parameters:
1428   +  n       - Number of destination ranks
1429   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1430   -  idx     - [*]   Array storing indices
1431 
1432    Output Parameters:
1433   +  opt     - Pack optimizations. NULL if no optimizations.
1434 */
1435 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1436 {
1437   PetscErrorCode ierr;
1438   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1439   PetscBool      optimizable = PETSC_TRUE;
1440   PetscSFPackOpt opt;
1441 
1442   PetscFunctionBegin;
1443   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1444   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1445   opt->n      = opt->array[0] = n;
1446   opt->offset = opt->array + 1;
1447   opt->start  = opt->array + n   + 2;
1448   opt->dx     = opt->array + 2*n + 2;
1449   opt->dy     = opt->array + 3*n + 2;
1450   opt->dz     = opt->array + 4*n + 2;
1451   opt->X      = opt->array + 5*n + 2;
1452   opt->Y      = opt->array + 6*n + 2;
1453 
1454   for (r=0; r<n; r++) { /* For each destination rank */
1455     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1456     p     = offset[r];
1457     start = idx[p]; /* First index for this rank */
1458     p++;
1459 
1460     /* Search in X dimension */
1461     for (dx=1; dx<m; dx++,p++) {
1462       if (start+dx != idx[p]) break;
1463     }
1464 
1465     dydz = m/dx;
1466     X    = dydz > 1 ? (idx[p]-start) : dx;
1467     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1468     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1469     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1470       for (i=0; i<dx; i++,p++) {
1471         if (start+X*dy+i != idx[p]) {
1472           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1473           else goto Z_dimension;
1474         }
1475       }
1476     }
1477 
1478 Z_dimension:
1479     dz = m/(dx*dy);
1480     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1481     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1482     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1483     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1484       for (j=0; j<dy; j++) {
1485         for (i=0; i<dx; i++,p++) {
1486           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1487         }
1488       }
1489     }
1490     opt->start[r] = start;
1491     opt->dx[r]    = dx;
1492     opt->dy[r]    = dy;
1493     opt->dz[r]    = dz;
1494     opt->X[r]     = X;
1495     opt->Y[r]     = Y;
1496   }
1497 
1498 finish:
1499   /* If not optimizable, free arrays to save memory */
1500   if (!n || !optimizable) {
1501     ierr = PetscFree(opt->array);CHKERRQ(ierr);
1502     ierr = PetscFree(opt);CHKERRQ(ierr);
1503     *out = NULL;
1504   } else {
1505     opt->offset[0] = 0;
1506     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1507     *out = opt;
1508   }
1509   PetscFunctionReturn(0);
1510 }
1511 
1512 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1513 {
1514   PetscErrorCode ierr;
1515   PetscSFPackOpt opt = *out;
1516 
1517   PetscFunctionBegin;
1518   if (opt) {
1519     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
1520     ierr = PetscFree(opt);CHKERRQ(ierr);
1521     *out = NULL;
1522   }
1523   PetscFunctionReturn(0);
1524 }
1525 
/* Set up fields used by pack/unpack of an SF of type basic: for both the local (self) and remote
   parts of the communication, record buffer lengths, detect whether leaf/root indices are contiguous
   (the cheapest pack case), build per-rank pack optimization plans when they are not, and (on device
   builds) record whether indices contain duplicates so unpack kernels can avoid atomics.
*/
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  /* Leaf buffer lengths: roffset[] partitions rmine[] by rank, with the first ndranks being
     distinguished (on-node/self) ranks and the rest remote -- TODO confirm against sfbasic.h */
  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  /* First leaf index of each part (guard against empty parts to avoid reading rmine[] out of range) */
  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote: j counts from the start of the remote part */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous? (same analysis as for leaves, but on
     ioffset[]/irootloc[] with ndiranks distinguished incoming ranks) */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_DEVICE)
    /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
  if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
#endif

  PetscFunctionReturn(0);
}
1588 
1589 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1590 {
1591   PetscErrorCode ierr;
1592   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1593   PetscInt       i;
1594 
1595   PetscFunctionBegin;
1596   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1597     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1598     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1599 #if defined(PETSC_HAVE_DEVICE)
1600     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1601     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1602 #endif
1603   }
1604   PetscFunctionReturn(0);
1605 }
1606