xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 2da392cc7c10228af19ad9843ce5155178acb644)
1 #include <../src/vec/is/sf/impls/basic/sfpack.h>
2 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
3 
4 #if defined(PETSC_HAVE_CUDA)
5 #include <cuda_runtime.h>
6 #endif
7 /*
8  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
9  * therefore we pack data types manually. This file defines packing routines for the standard data types.
10  */
11 
12 #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
13 
14 /* Operations working like s += t */
15 #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
16 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
17 #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
18 #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
19 /* Ref MPI MAXLOC */
20 #define OP_XLOC(op,s,t) \
21   do {                                       \
22     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
23     else if (!((s).u op (t).u)) s = t;           \
24   } while (0)
25 
26 /* DEF_PackFunc - macro defining a Pack routine
27 
28    Arguments of the macro:
29    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30    .BS        Block size for vectorization. It is a factor of bsz.
31    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
32 
33    Arguments of the Pack routine:
34    +count     Number of indices in idx[].
35    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
36    .opt       Per-pack optimization plan. NULL means no such plan.
37    .idx       Indices of entries to packed.
38    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
40    -packed    Address of the packed data.
41  */
42 #define DEF_PackFunc(Type,BS,EQ) \
43   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
44   {                                                                                                          \
45     PetscErrorCode ierr;                                                                                     \
46     const Type     *u = (const Type*)unpacked,*u2;                                                           \
47     Type           *p = (Type*)packed,*p2;                                                                   \
48     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
49     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
50     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
51     PetscFunctionBegin;                                                                                      \
52     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
53     else if (opt) { /* has optimizations available */                                                        \
54       p2 = p;                                                                                                \
55       for (r=0; r<opt->n; r++) {                                                                             \
56         u2 = u + opt->start[r]*MBS;                                                                          \
57         X  = opt->X[r];                                                                                      \
58         Y  = opt->Y[r];                                                                                      \
59         for (k=0; k<opt->dz[r]; k++)                                                                         \
60           for (j=0; j<opt->dy[r]; j++) {                                                                     \
61             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
62             p2  += opt->dx[r]*MBS;                                                                           \
63           }                                                                                                  \
64       }                                                                                                      \
65     } else {                                                                                                 \
66       for (i=0; i<count; i++)                                                                                \
67         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
68           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
69             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
70     }                                                                                                        \
71     PetscFunctionReturn(0);                                                                                  \
72   }
73 
74 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
75                 and inserts into a sparse array.
76 
77    Arguments:
78   .Type       Type of the data
79   .BS         Block size for vectorization
80   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
81 
82   Notes:
83    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
84  */
85 #define DEF_UnpackFunc(Type,BS,EQ)               \
86   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
87   {                                                                                                          \
88     PetscErrorCode ierr;                                                                                     \
89     Type           *u = (Type*)unpacked,*u2;                                                                 \
90     const Type     *p = (const Type*)packed;                                                                 \
91     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
92     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
93     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
94     PetscFunctionBegin;                                                                                      \
95     if (!idx) {                                                                                              \
96       u += start*MBS;                                                                                        \
97       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
98     } else if (opt) { /* has optimizations available */                                                      \
99       for (r=0; r<opt->n; r++) {                                                                             \
100         u2 = u + opt->start[r]*MBS;                                                                          \
101         X  = opt->X[r];                                                                                      \
102         Y  = opt->Y[r];                                                                                      \
103         for (k=0; k<opt->dz[r]; k++)                                                                         \
104           for (j=0; j<opt->dy[r]; j++) {                                                                     \
105             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
106             p   += opt->dx[r]*MBS;                                                                           \
107           }                                                                                                  \
108       }                                                                                                      \
109     } else {                                                                                                 \
110       for (i=0; i<count; i++)                                                                                \
111         for (j=0; j<M; j++)                                                                                  \
112           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
113     }                                                                                                        \
114     PetscFunctionReturn(0);                                                                                  \
115   }
116 
117 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
118 
119    Arguments:
120   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
121   .Type       Type of the data
122   .BS         Block size for vectorization
123   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
124   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
125   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
126  */
127 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
128   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
129   {                                                                                                          \
130     Type           *u = (Type*)unpacked,*u2;                                                                 \
131     const Type     *p = (const Type*)packed;                                                                 \
132     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
133     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
134     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
135     PetscFunctionBegin;                                                                                      \
136     if (!idx) {                                                                                              \
137       u += start*MBS;                                                                                        \
138       for (i=0; i<count; i++)                                                                                \
139         for (j=0; j<M; j++)                                                                                  \
140           for (k=0; k<BS; k++)                                                                               \
141             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
142     } else if (opt) { /* idx[] has patterns */                                                               \
143       for (r=0; r<opt->n; r++) {                                                                             \
144         u2 = u + opt->start[r]*MBS;                                                                          \
145         X  = opt->X[r];                                                                                      \
146         Y  = opt->Y[r];                                                                                      \
147         for (k=0; k<opt->dz[r]; k++)                                                                         \
148           for (j=0; j<opt->dy[r]; j++) {                                                                     \
149             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
150             p += opt->dx[r]*MBS;                                                                             \
151           }                                                                                                  \
152       }                                                                                                      \
153     } else {                                                                                                 \
154       for (i=0; i<count; i++)                                                                                \
155         for (j=0; j<M; j++)                                                                                  \
156           for (k=0; k<BS; k++)                                                                               \
157             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
158     }                                                                                                        \
159     PetscFunctionReturn(0);                                                                                  \
160   }
161 
162 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
163   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
164   {                                                                                                          \
165     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
166     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
167     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
168     const PetscInt MBS = M*BS;                                                                               \
169     PetscFunctionBegin;                                                                                      \
170     for (i=0; i<count; i++) {                                                                                \
171       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
172       l = i*MBS;                                                                                             \
173       for (j=0; j<M; j++)                                                                                    \
174         for (k=0; k<BS; k++) {                                                                               \
175           tmp = u[r+j*BS+k];                                                                                 \
176           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
177           p[l+j*BS+k] = tmp;                                                                                 \
178         }                                                                                                    \
179     }                                                                                                        \
180     PetscFunctionReturn(0);                                                                                  \
181   }
182 
183 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
184   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
185   {                                                                                                          \
186     PetscErrorCode ierr;                                                                                     \
187     const Type     *u = (const Type*)src;                                                                    \
188     Type           *v = (Type*)dst;                                                                          \
189     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
190     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
191     const PetscInt MBS = M*BS;                                                                               \
192     PetscFunctionBegin;                                                                                      \
193     if (!srcIdx) { /* src is contiguous */                                                                   \
194       u += srcStart*MBS;                                                                                     \
195       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
196     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
197       u += srcOpt->start[0]*MBS;                                                                             \
198       v += dstStart*MBS;                                                                                     \
199       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
200       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
201         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
202           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
203           v += srcOpt->dx[0]*MBS;                                                                            \
204         }                                                                                                    \
205     } else { /* all other cases */                                                                           \
206       for (i=0; i<count; i++) {                                                                              \
207         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
208         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
209         for (j=0; j<M; j++)                                                                                  \
210           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
211       }                                                                                                      \
212     }                                                                                                        \
213     PetscFunctionReturn(0);                                                                                  \
214   }
215 
216 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
217   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
218   {                                                                                                          \
219     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
220     const Type     *ldata = (const Type*)leafdata;                                                           \
221     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
222     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
223     const PetscInt MBS = M*BS;                                                                               \
224     PetscFunctionBegin;                                                                                      \
225     for (i=0; i<count; i++) {                                                                                \
226       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
227       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
228       for (j=0; j<M; j++)                                                                                    \
229         for (k=0; k<BS; k++) {                                                                               \
230           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
231           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
232         }                                                                                                    \
233     }                                                                                                        \
234     PetscFunctionReturn(0);                                                                                  \
235   }
236 
237 /* Pack, Unpack/Fetch ops */
238 #define DEF_Pack(Type,BS,EQ)                                                      \
239   DEF_PackFunc(Type,BS,EQ)                                                        \
240   DEF_UnpackFunc(Type,BS,EQ)                                                      \
241   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
242   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
243     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
244     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
245     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
246   }
247 
248 /* Add, Mult ops */
249 #define DEF_Add(Type,BS,EQ)                                                       \
250   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
251   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
252   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
253   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
254   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
255   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
256   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
257     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
258     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
259     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
260     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
261     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
262     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
263   }
264 
265 /* Max, Min ops */
266 #define DEF_Cmp(Type,BS,EQ)                                                       \
267   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
268   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
269   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
270   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
271   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
272     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
273     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
274     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
275     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
276   }
277 
278 /* Logical ops.
279   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
280   the compilation warning "empty macro arguments are undefined in ISO C90"
281  */
282 #define DEF_Log(Type,BS,EQ)                                                       \
283   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
284   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
285   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
286   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
287   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
288   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
289   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
290     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
291     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
292     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
293     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
294     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
295     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
296   }
297 
298 /* Bitwise ops */
299 #define DEF_Bit(Type,BS,EQ)                                                       \
300   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
301   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
302   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
303   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
304   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
305   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
306   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
307     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
308     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
309     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
310     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
311     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
312     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
313   }
314 
315 /* Maxloc, Minloc ops */
316 #define DEF_Xloc(Type,BS,EQ)                                                      \
317   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
318   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
319   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
320   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
321   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
322     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
323     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
324     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
325     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
326   }
327 
328 #define DEF_IntegerType(Type,BS,EQ)                                               \
329   DEF_Pack(Type,BS,EQ)                                                            \
330   DEF_Add(Type,BS,EQ)                                                             \
331   DEF_Cmp(Type,BS,EQ)                                                             \
332   DEF_Log(Type,BS,EQ)                                                             \
333   DEF_Bit(Type,BS,EQ)                                                             \
334   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
335     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
336     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
337     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
338     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
339     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
340   }
341 
342 #define DEF_RealType(Type,BS,EQ)                                                  \
343   DEF_Pack(Type,BS,EQ)                                                            \
344   DEF_Add(Type,BS,EQ)                                                             \
345   DEF_Cmp(Type,BS,EQ)                                                             \
346   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
347     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
348     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
349     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
350   }
351 
352 #if defined(PETSC_HAVE_COMPLEX)
353 #define DEF_ComplexType(Type,BS,EQ)                                               \
354   DEF_Pack(Type,BS,EQ)                                                            \
355   DEF_Add(Type,BS,EQ)                                                             \
356   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
357     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
358     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
359   }
360 #endif
361 
362 #define DEF_DumbType(Type,BS,EQ)                                                  \
363   DEF_Pack(Type,BS,EQ)                                                            \
364   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
365     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
366   }
367 
368 /* Maxloc, Minloc */
369 #define DEF_PairType(Type,BS,EQ)                                                  \
370   DEF_Pack(Type,BS,EQ)                                                            \
371   DEF_Xloc(Type,BS,EQ)                                                            \
372   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
373     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
374     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
375   }
376 
377 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
378 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
379 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
380 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
381 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
382 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
383 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
384 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
385 
386 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
387 DEF_IntegerType(int,1,1)
388 DEF_IntegerType(int,2,1)
389 DEF_IntegerType(int,4,1)
390 DEF_IntegerType(int,8,1)
391 DEF_IntegerType(int,1,0)
392 DEF_IntegerType(int,2,0)
393 DEF_IntegerType(int,4,0)
394 DEF_IntegerType(int,8,0)
395 #endif
396 
397 /* The typedefs are used to get a typename without space that CPPJoin can handle */
398 typedef signed char SignedChar;
399 DEF_IntegerType(SignedChar,1,1)
400 DEF_IntegerType(SignedChar,2,1)
401 DEF_IntegerType(SignedChar,4,1)
402 DEF_IntegerType(SignedChar,8,1)
403 DEF_IntegerType(SignedChar,1,0)
404 DEF_IntegerType(SignedChar,2,0)
405 DEF_IntegerType(SignedChar,4,0)
406 DEF_IntegerType(SignedChar,8,0)
407 
408 typedef unsigned char UnsignedChar;
409 DEF_IntegerType(UnsignedChar,1,1)
410 DEF_IntegerType(UnsignedChar,2,1)
411 DEF_IntegerType(UnsignedChar,4,1)
412 DEF_IntegerType(UnsignedChar,8,1)
413 DEF_IntegerType(UnsignedChar,1,0)
414 DEF_IntegerType(UnsignedChar,2,0)
415 DEF_IntegerType(UnsignedChar,4,0)
416 DEF_IntegerType(UnsignedChar,8,0)
417 
418 DEF_RealType(PetscReal,1,1)
419 DEF_RealType(PetscReal,2,1)
420 DEF_RealType(PetscReal,4,1)
421 DEF_RealType(PetscReal,8,1)
422 DEF_RealType(PetscReal,1,0)
423 DEF_RealType(PetscReal,2,0)
424 DEF_RealType(PetscReal,4,0)
425 DEF_RealType(PetscReal,8,0)
426 
427 #if defined(PETSC_HAVE_COMPLEX)
428 DEF_ComplexType(PetscComplex,1,1)
429 DEF_ComplexType(PetscComplex,2,1)
430 DEF_ComplexType(PetscComplex,4,1)
431 DEF_ComplexType(PetscComplex,8,1)
432 DEF_ComplexType(PetscComplex,1,0)
433 DEF_ComplexType(PetscComplex,2,0)
434 DEF_ComplexType(PetscComplex,4,0)
435 DEF_ComplexType(PetscComplex,8,0)
436 #endif
437 
438 #define PairType(Type1,Type2) Type1##_##Type2
439 typedef struct {int u; int i;}           PairType(int,int);
440 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
441 DEF_PairType(PairType(int,int),1,1)
442 DEF_PairType(PairType(PetscInt,PetscInt),1,1)
443 
444 /* If we don't know the basic type, we treat it as a stream of chars or ints */
445 DEF_DumbType(char,1,1)
446 DEF_DumbType(char,2,1)
447 DEF_DumbType(char,4,1)
448 DEF_DumbType(char,1,0)
449 DEF_DumbType(char,2,0)
450 DEF_DumbType(char,4,0)
451 
452 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
453 DEF_DumbType(DumbInt,1,1)
454 DEF_DumbType(DumbInt,2,1)
455 DEF_DumbType(DumbInt,4,1)
456 DEF_DumbType(DumbInt,8,1)
457 DEF_DumbType(DumbInt,1,0)
458 DEF_DumbType(DumbInt,2,0)
459 DEF_DumbType(DumbInt,4,0)
460 DEF_DumbType(DumbInt,8,0)
461 
462 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
463 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
464 {
465   int ierr;
466   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
467   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
468   return MPI_SUCCESS;
469 }
470 #endif
471 
472 /*
473    The routine Creates a communication link for the given operation. It first looks up its link cache. If
474    there is a free & suitable one, it uses it. Otherwise it creates a new one.
475 
476    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
477    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
478    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
479    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
480 
481    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
482 
483    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
484 
485    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
486    need pack/unpack data.
487 */
488 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
489 {
490   PetscErrorCode    ierr;
491   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
492   PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
493   PetscSFLink       *p,link;
494   PetscSFDirection  direction;
495   MPI_Request       *reqs = NULL;
496   PetscBool         match,rootdirect[2],leafdirect[2];
497   PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
498   PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
499 
500   PetscFunctionBegin;
501   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
502 
503   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
504   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
505     if (sfop == PETSCSF_BCAST) {
506       rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
507       leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
508     } else if (sfop == PETSCSF_REDUCE) {
509       leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
510       rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
511     } else { /* PETSCSF_FETCH */
512       rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
513       leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
514     }
515   }
516 
517   if (sf->use_gpu_aware_mpi) {
518     rootmtype_mpi = rootmtype;
519     leafmtype_mpi = leafmtype;
520   } else {
521     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
522   }
523   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */
524   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
525   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
526 
527   direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
528   nrootreqs = bas->nrootreqs;
529   nleafreqs = sf->nleafreqs;
530 
531   /* Look for free links in cache */
532   for (p=&bas->avail; (link=*p); p=&link->next) {
533     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
534     if (match) {
535       /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
536          If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
537       */
538       if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
539         reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
540         for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
541         link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
542       }
543       if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
544         reqs = link->leafreqs[direction][leafmtype][1];
545         for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
546         link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
547       }
548       *p = link->next; /* Remove from available list */
549       goto found;
550     }
551   }
552 
553   ierr = PetscNew(&link);CHKERRQ(ierr);
554   ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
555   ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */
556 
557   nreqs = (nrootreqs+nleafreqs)*8;
558   ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
559   for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
560 
561   for (i=0; i<2; i++) { /* Two communication directions */
562     for (j=0; j<2; j++) { /* Two memory types */
563       for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
564         link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
565         link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
566       }
567     }
568   }
569 
570 found:
571   if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}
572 
573   /* Allocate buffers along root/leafdata */
574   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
575     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
576     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
577     if (bas->rootbuflen[i]) {
578       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
579         link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
580       } else { /* Have to have a separate rootbuf */
581         if (!link->rootbuf_alloc[i][rootmtype]) {
582           ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
583         }
584         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
585       }
586     }
587 
588     if (sf->leafbuflen[i]) {
589       if (leafdirect[i]) {
590         link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
591       } else {
592         if (!link->leafbuf_alloc[i][leafmtype]) {
593           ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
594         }
595         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
596       }
597     }
598   }
599 
600   /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */
601   if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
602     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
603       ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
604     }
605     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
606   }
607   if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
608     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
609       ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
610     }
611     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
612   }
613 
614   /* Set `current` state of the link. They may change between different SF invocations with the same link */
615   if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
616     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
617     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
618   }
619 
620   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
621   link->leafdata = leafdata;
622   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
623     link->rootdirect[i] = rootdirect[i];
624     link->leafdirect[i] = leafdirect[i];
625   }
626   link->rootdirect_mpi  = rootdirect_mpi;
627   link->leafdirect_mpi  = leafdirect_mpi;
628   link->rootmtype       = rootmtype;
629   link->leafmtype       = leafmtype;
630   link->rootmtype_mpi   = rootmtype_mpi;
631   link->leafmtype_mpi   = leafmtype_mpi;
632 
633   link->next            = bas->inuse;
634   bas->inuse            = link;
635   *mylink               = link;
636   PetscFunctionReturn(0);
637 }
638 
639 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
640    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
641 */
642 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
643 {
644   PetscErrorCode       ierr;
645   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
646   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
647   const PetscInt       *rootoffset,*leafoffset;
648   PetscMPIInt          n;
649   MPI_Aint             disp;
650   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
651   MPI_Datatype         unit = link->unit;
652   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
653   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
654 
655   PetscFunctionBegin;
656   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
657   if (sf->persistent) {
658     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
659       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
660       if (direction == PETSCSF_LEAF2ROOT) {
661         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
662           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
663           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
664           ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
665         }
666       } else { /* PETSCSF_ROOT2LEAF */
667         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
668           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
669           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
670           ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
671         }
672       }
673       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
674     }
675 
676     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
677       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
678       if (direction == PETSCSF_LEAF2ROOT) {
679         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
680           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
681           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
682           ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
683         }
684       } else { /* PETSCSF_ROOT2LEAF */
685         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
686           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
687           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
688           ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
689         }
690       }
691       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
692     }
693   }
694   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
695   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
696   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
697   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
698   PetscFunctionReturn(0);
699 }
700 
701 
702 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
703 {
704   PetscErrorCode    ierr;
705   PetscSFLink       link,*p;
706   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
707 
708   PetscFunctionBegin;
709   /* Look for types in cache */
710   for (p=&bas->inuse; (link=*p); p=&link->next) {
711     PetscBool match;
712     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
713     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
714       switch (cmode) {
715       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
716       case PETSC_USE_POINTER: break;
717       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
718       }
719       *mylink = link;
720       PetscFunctionReturn(0);
721     }
722   }
723   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
724   PetscFunctionReturn(0);
725 }
726 
727 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
728 {
729   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
730 
731   PetscFunctionBegin;
732   (*link)->rootdata = NULL;
733   (*link)->leafdata = NULL;
734   (*link)->next     = bas->avail;
735   bas->avail        = *link;
736   *link             = NULL;
737   PetscFunctionReturn(0);
738 }
739 
740 /* Destroy all links chained in 'avail' */
741 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
742 {
743   PetscErrorCode    ierr;
744   PetscSFLink       link = *avail,next;
745   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
746   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
747 
748   PetscFunctionBegin;
749   for (; link; link=next) {
750     next = link->next;
751     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
752     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
753       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
754     }
755     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
756     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
757 #if defined(PETSC_HAVE_CUDA)
758       ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
759       ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
760       if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
761 #endif
762       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
763       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
764     }
765     ierr = PetscFree(link);CHKERRQ(ierr);
766   }
767   *avail = NULL;
768   PetscFunctionReturn(0);
769 }
770 
771 /* Error out on unsupported overlapped communications */
772 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
773 {
774   PetscErrorCode    ierr;
775   PetscSFLink       link,*p;
776   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
777   PetscBool         match;
778 
779   PetscFunctionBegin;
780   if (!PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
781   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
782      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
783   */
784   if (rootdata || leafdata) {
785     for (p=&bas->inuse; (link=*p); p=&link->next) {
786       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
787       if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
788     }
789   }
790   PetscFunctionReturn(0);
791 }
792 
793 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
794 {
795   PetscErrorCode ierr;
796   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
797   PetscBool      is2Int,is2PetscInt;
798   PetscMPIInt    ni,na,nd,combiner;
799 #if defined(PETSC_HAVE_COMPLEX)
800   PetscInt       nPetscComplex=0;
801 #endif
802 
803   PetscFunctionBegin;
804   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
805   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
806   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
807   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
808   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
809   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
810 #if defined(PETSC_HAVE_COMPLEX)
811   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
812 #endif
813   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
814   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
815   /* TODO: shaell we also handle Fortran MPI_2REAL? */
816   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
817   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
818   link->bs = 1; /* default */
819 
820   if (is2Int) {
821     PackInit_PairType_int_int_1_1(link);
822     link->bs        = 1;
823     link->unitbytes = 2*sizeof(int);
824     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
825     link->basicunit = MPI_2INT;
826     link->unit      = MPI_2INT;
827   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
828     PackInit_PairType_PetscInt_PetscInt_1_1(link);
829     link->bs        = 1;
830     link->unitbytes = 2*sizeof(PetscInt);
831     link->basicunit = MPIU_2INT;
832     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
833     link->unit      = MPIU_2INT;
834   } else if (nPetscReal) {
835     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
836     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
837     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
838     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
839     link->bs        = nPetscReal;
840     link->unitbytes = nPetscReal*sizeof(PetscReal);
841     link->basicunit = MPIU_REAL;
842     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
843   } else if (nPetscInt) {
844     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
845     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
846     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
847     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
848     link->bs        = nPetscInt;
849     link->unitbytes = nPetscInt*sizeof(PetscInt);
850     link->basicunit = MPIU_INT;
851     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
852 #if defined(PETSC_USE_64BIT_INDICES)
853   } else if (nInt) {
854     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
855     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
856     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
857     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
858     link->bs        = nInt;
859     link->unitbytes = nInt*sizeof(int);
860     link->basicunit = MPI_INT;
861     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
862 #endif
863   } else if (nSignedChar) {
864     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
865     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
866     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
867     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
868     link->bs        = nSignedChar;
869     link->unitbytes = nSignedChar*sizeof(SignedChar);
870     link->basicunit = MPI_SIGNED_CHAR;
871     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
872   }  else if (nUnsignedChar) {
873     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
874     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
875     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
876     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
877     link->bs        = nUnsignedChar;
878     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
879     link->basicunit = MPI_UNSIGNED_CHAR;
880     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
881 #if defined(PETSC_HAVE_COMPLEX)
882   } else if (nPetscComplex) {
883     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
884     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
885     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
886     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
887     link->bs        = nPetscComplex;
888     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
889     link->basicunit = MPIU_COMPLEX;
890     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
891 #endif
892   } else {
893     MPI_Aint lb,nbyte;
894     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
895     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
896     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
897       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
898       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
899       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
900       link->bs        = nbyte;
901       link->unitbytes = nbyte;
902       link->basicunit = MPI_BYTE;
903     } else {
904       nInt = nbyte / sizeof(int);
905       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
906       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
907       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
908       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
909       link->bs        = nInt;
910       link->unitbytes = nbyte;
911       link->basicunit = MPI_INT;
912     }
913     if (link->isbuiltin) link->unit = unit;
914   }
915 
916   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
917   PetscFunctionReturn(0);
918 }
919 
920 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
921 {
922   PetscFunctionBegin;
923   *UnpackAndOp = NULL;
924   if (mtype == PETSC_MEMTYPE_HOST) {
925     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
926     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
927     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
928     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
929     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
930     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
931     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
932     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
933     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
934     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
935     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
936     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
937     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
938   }
939 #if defined(PETSC_HAVE_CUDA)
940   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
941     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
942     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
943     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
944     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
945     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
946     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
947     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
948     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
949     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
950     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
951     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
952     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
953     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
954   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
955     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
956     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
957     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
958     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
959     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
960     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
961     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
962     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
963     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
964     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
965     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
966     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
967     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
968   }
969 #endif
970   PetscFunctionReturn(0);
971 }
972 
973 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
974 {
975   PetscFunctionBegin;
976   *ScatterAndOp = NULL;
977   if (mtype == PETSC_MEMTYPE_HOST) {
978     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
979     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
980     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
981     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
982     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
983     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
984     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
985     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
986     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
987     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
988     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
989     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
990     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
991   }
992 #if defined(PETSC_HAVE_CUDA)
993   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
994     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
995     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
996     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
997     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
998     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
999     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1000     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1001     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1002     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1003     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1004     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1005     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1006     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1007   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1008     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1009     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1010     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1011     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1012     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1013     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1014     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1015     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1016     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1017     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1018     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1019     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1020     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1021   }
1022 #endif
1023   PetscFunctionReturn(0);
1024 }
1025 
1026 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1027 {
1028   PetscFunctionBegin;
1029   *FetchAndOp = NULL;
1030   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1031   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1032 #if defined(PETSC_HAVE_CUDA)
1033   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1034   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1035 #endif
1036   PetscFunctionReturn(0);
1037 }
1038 
1039 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1040 {
1041   PetscFunctionBegin;
1042   *FetchAndOpLocal = NULL;
1043   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1044   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1045 #if defined(PETSC_HAVE_CUDA)
1046   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1047   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1048 #endif
1049   PetscFunctionReturn(0);
1050 }
1051 
1052 /*=============================================================================
1053               A set of helper routines for Pack/Unpack/Scatter on GPUs
1054  ============================================================================*/
1055 #if defined(PETSC_HAVE_CUDA)
1056 /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
1057    make sure the data is ready for packing.
1058  */
1059 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
1060 {
1061   PetscFunctionBegin;
1062   if (sf->use_default_stream) PetscFunctionReturn(0);
1063   if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
1064     cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
1065   }
1066   PetscFunctionReturn(0);
1067 }
1068 
1069 /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
1070 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
1071 {
1072   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1073 
1074   PetscFunctionBegin;
1075   /* Do nothing if we use stream aware mpi || has nothing for remote */
1076   if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
1077   /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */
1078   if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1079     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1080   }
1081   PetscFunctionReturn(0);
1082 }
1083 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
1084 {
1085   PetscFunctionBegin;
1086   /* See comments above */
1087   if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
1088   if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1089     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1090   }
1091   PetscFunctionReturn(0);
1092 }
1093 
1094 /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
1095    does not know which stream the callers will use.
1096 */
1097 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
1098 {
1099   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1100   PetscBool      host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
1101 
1102   PetscFunctionBegin;
1103   /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */
1104   if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
1105 
1106   /* If rootmtype is HOST or DEVICE:
1107      If we have data from local, then we called a scatter kernel (on link->stream), then we must sync it;
1108      If we have data from remote && no rootdirect(i.e., we called an unpack kernel), then we must also sycn it (if rootdirect,
1109      i.e., no unpack kernel after MPI, MPI guarentees rootbuf is ready to use so that we do not need the sync).
1110 
1111      Note a tricky case is when leafmtype=DEVICE, rootmtype=HOST on uni-processor, we must sync the stream otherwise
1112      CPU thread might use the yet-to-be-updated rootdata pending in the stream.
1113    */
1114   if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
1115     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1116   }
1117   PetscFunctionReturn(0);
1118 }
1119 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
1120 {
1121   PetscBool      host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
1122 
1123   PetscFunctionBegin;
1124   /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData*/
1125   if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
1126   if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
1127     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1128   }
1129   PetscFunctionReturn(0);
1130 }
1131 
1132 /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
1133    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
1134 */
1135 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1136 {
1137   PetscErrorCode ierr;
1138   cudaError_t    cerr;
1139   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1140 
1141   PetscFunctionBegin;
1142   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
1143     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1144     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1145     size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
1146     if (device2host) {
1147       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1148       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
1149     } else {
1150       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1151       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
1152     }
1153   }
1154   PetscFunctionReturn(0);
1155 }
1156 
1157 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1158 {
1159   PetscErrorCode ierr;
1160   cudaError_t    cerr;
1161 
1162   PetscFunctionBegin;
1163   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
1164     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1165     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1166     size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
1167     if (device2host) {
1168       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1169       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
1170     } else {
1171       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1172       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
1173     }
1174   }
1175   PetscFunctionReturn(0);
1176 }
1177 #else
1178 
1179 #define PetscSFLinkSyncDeviceBeforePackData(a,b)                0
1180 #define PetscSFLinkSyncStreamAfterPackRootData(a,b)             0
1181 #define PetscSFLinkSyncStreamAfterPackLeafData(a,b)             0
1182 #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)           0
1183 #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)           0
1184 #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1185 #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1186 
1187 #endif
1188 
1189 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1190 {
1191   PetscErrorCode ierr;
1192   PetscLogDouble flops;
1193   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1194 
1195 
1196   PetscFunctionBegin;
1197   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1198     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1199 #if defined(PETSC_HAVE_CUDA)
1200     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1201 #endif
1202     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1203   }
1204   PetscFunctionReturn(0);
1205 }
1206 
1207 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1208 {
1209   PetscLogDouble flops;
1210   PetscErrorCode ierr;
1211 
1212   PetscFunctionBegin;
1213   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1214     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1215 #if defined(PETSC_HAVE_CUDA)
1216     if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1217 #endif
1218     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1219   }
1220   PetscFunctionReturn(0);
1221 }
1222 
1223 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1224   Input Arguments:
1225   +sf      - The StarForest
1226   .link    - The link
1227   .count   - Number of entries to unpack
1228   .start   - The first index, significent when indices=NULL
1229   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1230   .buf     - A contiguous buffer to unpack from
1231   -op      - Operation after unpack
1232 
1233   Output Arguments:
1234   .data    - The data to unpack to
1235 */
1236 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1237 {
1238   PetscFunctionBegin;
1239 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1240   {
1241     PetscErrorCode ierr;
1242     PetscInt       i;
1243     PetscMPIInt    n;
1244     if (indices) {
1245       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
1246          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
1247       */
1248       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
1249     } else {
1250       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
1251       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
1252     }
1253   }
1254 #else
1255   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1256 #endif
1257   PetscFunctionReturn(0);
1258 }
1259 
1260 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1261 {
1262   PetscFunctionBegin;
1263 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1264   {
1265     PetscErrorCode ierr;
1266     PetscInt       i,disp;
1267     if (!srcIdx) {
1268       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
1269     } else {
1270       for (i=0; i<count; i++) {
1271         disp = dstIdx? dstIdx[i] : dstStart + i;
1272         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
1273       }
1274     }
1275   }
1276 #else
1277   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1278 #endif
1279   PetscFunctionReturn(0);
1280 }
1281 
1282 /*=============================================================================
1283               Pack/Unpack/Fetch/Scatter routines
1284  ============================================================================*/
1285 
1286 /* Pack rootdata to rootbuf
1287   Input Arguments:
1288   + sf       - The SF this packing works on.
1289   . link     - It gives the memtype of the roots and also provides root buffer.
1290   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1291   - rootdata - Where to read the roots.
1292 
1293   Notes:
1294   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1295   in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not)
1296  */
1297 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1298 {
1299   PetscErrorCode   ierr;
1300   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1301   const PetscInt   *rootindices = NULL;
1302   PetscInt         count,start;
1303   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1304   PetscMemType     rootmtype = link->rootmtype;
1305   PetscSFPackOpt   opt = NULL;
1306 
1307 
1308   PetscFunctionBegin;
1309   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1310   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1311   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1312     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1313     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1314     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1315   }
1316   if (scope == PETSCSF_REMOTE) {
1317     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1318     ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
1319   }
1320   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1321   PetscFunctionReturn(0);
1322 }
1323 
1324 /* Pack leafdata to leafbuf */
1325 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1326 {
1327   PetscErrorCode   ierr;
1328   const PetscInt   *leafindices = NULL;
1329   PetscInt         count,start;
1330   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1331   PetscMemType     leafmtype = link->leafmtype;
1332   PetscSFPackOpt   opt = NULL;
1333 
1334   PetscFunctionBegin;
1335   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1336   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1337   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1338     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1339     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1340     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1341   }
1342   if (scope == PETSCSF_REMOTE) {
1343     ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1344     ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
1345   }
1346   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 /* Unpack rootbuf to rootdata */
1351 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1352 {
1353   PetscErrorCode   ierr;
1354   const PetscInt   *rootindices = NULL;
1355   PetscInt         count,start;
1356   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1357   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1358   PetscMemType     rootmtype = link->rootmtype;
1359   PetscSFPackOpt   opt = NULL;
1360 
1361   PetscFunctionBegin;
1362   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1363   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1364   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1365     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1366     if (UnpackAndOp) {
1367       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1368       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1369     } else {
1370       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1371       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1372     }
1373   }
1374   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
1375   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1376   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 /* Unpack leafbuf to leafdata */
1381 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1382 {
1383   PetscErrorCode   ierr;
1384   const PetscInt   *leafindices = NULL;
1385   PetscInt         count,start;
1386   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1387   PetscMemType     leafmtype = link->leafmtype;
1388   PetscSFPackOpt   opt = NULL;
1389 
1390   PetscFunctionBegin;
1391   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1392   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1393   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1394     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1395     if (UnpackAndOp) {
1396       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1397       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1398     } else {
1399       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1400       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1401     }
1402   }
1403   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
1404   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1405   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1406   PetscFunctionReturn(0);
1407 }
1408 
1409 /* FetchAndOp rootdata with rootbuf */
1410 PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1411 {
1412   PetscErrorCode     ierr;
1413   const PetscInt     *rootindices = NULL;
1414   PetscInt           count,start;
1415   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1416   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1417   PetscMemType       rootmtype = link->rootmtype;
1418   PetscSFPackOpt     opt = NULL;
1419 
1420   PetscFunctionBegin;
1421   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1422   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1423   if (bas->rootbuflen[scope]) {
1424     /* Do FetchAndOp on rootdata with rootbuf */
1425     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
1426     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1427     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1428   }
1429   if (scope == PETSCSF_REMOTE) {
1430     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
1431     ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
1432   }
1433   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1434   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1435   PetscFunctionReturn(0);
1436 }
1437 
1438 /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1439 PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1440 {
1441   PetscErrorCode       ierr;
1442   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1443   PetscInt             count,rootstart,leafstart;
1444   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1445   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1446   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1447   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1448 
1449   PetscFunctionBegin;
1450   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1451   if (rootmtype != leafmtype) { /* Uncommon case */
1452      /* The local communication has to go through pack and unpack */
1453     ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
1454     ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1455     ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1456   } else {
1457     ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1458     if (ScatterAndOp) {
1459       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1460       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1461       ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1462     } else {
1463       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1464       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1465       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1466     }
1467   }
1468   PetscFunctionReturn(0);
1469 }
1470 
1471 /* Reduce leafdata to rootdata locally */
1472 PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1473 {
1474   PetscErrorCode       ierr;
1475   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1476   PetscInt             count,rootstart,leafstart;
1477   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1478   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1479   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1480   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1481 
1482   PetscFunctionBegin;
1483   if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1484   if (rootmtype != leafmtype) {
1485     /* The local communication has to go through pack and unpack */
1486     ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
1487     ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1488     ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1489   } else {
1490     ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1491     if (ScatterAndOp) {
1492       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1493       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1494       ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
1495     } else {
1496       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1497       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1498       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1499     }
1500   }
1501   PetscFunctionReturn(0);
1502 }
1503 
1504 /* Fetch rootdata to leafdata and leafupdate locally */
1505 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1506 {
1507   PetscErrorCode       ierr;
1508   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1509   PetscInt             count,rootstart,leafstart;
1510   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1511   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1512   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1513   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1514 
1515   PetscFunctionBegin;
1516   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1517   if (rootmtype != leafmtype) {
1518    /* The local communication has to go through pack and unpack */
1519     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1520   } else {
1521     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1522     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1523     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1524     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1525   }
1526   PetscFunctionReturn(0);
1527 }
1528 
1529 /*
1530   Create per-rank pack/unpack optimizations based on indice patterns
1531 
1532    Input Parameters:
1533   +  n       - Number of destination ranks
1534   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1535   -  idx     - [*]   Array storing indices
1536 
1537    Output Parameters:
1538   +  opt     - Pack optimizations. NULL if no optimizations.
1539 */
1540 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1541 {
1542   PetscErrorCode ierr;
1543   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1544   PetscBool      optimizable = PETSC_TRUE;
1545   PetscSFPackOpt opt;
1546 
1547   PetscFunctionBegin;
1548   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1549   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1550   opt->n      = opt->array[0] = n;
1551   opt->offset = opt->array + 1;
1552   opt->start  = opt->array + n   + 2;
1553   opt->dx     = opt->array + 2*n + 2;
1554   opt->dy     = opt->array + 3*n + 2;
1555   opt->dz     = opt->array + 4*n + 2;
1556   opt->X      = opt->array + 5*n + 2;
1557   opt->Y      = opt->array + 6*n + 2;
1558 
1559   for (r=0; r<n; r++) { /* For each destination rank */
1560     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1561     p     = offset[r];
1562     start = idx[p]; /* First index for this rank */
1563     p++;
1564 
1565     /* Search in X dimension */
1566     for (dx=1; dx<m; dx++,p++) {
1567       if (start+dx != idx[p]) break;
1568     }
1569 
1570     dydz = m/dx;
1571     X    = dydz > 1 ? (idx[p]-start) : dx;
1572     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1573     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1574     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1575       for (i=0; i<dx; i++,p++) {
1576         if (start+X*dy+i != idx[p]) {
1577           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1578           else goto Z_dimension;
1579         }
1580       }
1581     }
1582 
1583 Z_dimension:
1584     dz = m/(dx*dy);
1585     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1586     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1587     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1588     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1589       for (j=0; j<dy; j++) {
1590         for (i=0; i<dx; i++,p++) {
1591           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1592         }
1593       }
1594     }
1595     opt->start[r] = start;
1596     opt->dx[r]    = dx;
1597     opt->dy[r]    = dy;
1598     opt->dz[r]    = dz;
1599     opt->X[r]     = X;
1600     opt->Y[r]     = Y;
1601   }
1602 
1603 finish:
1604   /* If not optimizable, free arrays to save memory */
1605   if (!n || !optimizable) {
1606     ierr = PetscFree(opt->array);CHKERRQ(ierr);
1607     ierr = PetscFree(opt);CHKERRQ(ierr);
1608     *out = NULL;
1609   } else {
1610     opt->offset[0] = 0;
1611     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1612     *out = opt;
1613   }
1614   PetscFunctionReturn(0);
1615 }
1616 
1617 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
1618 {
1619   PetscErrorCode ierr;
1620   PetscSFPackOpt opt = *out;
1621 
1622   PetscFunctionBegin;
1623   if (opt) {
1624     if (mtype == PETSC_MEMTYPE_HOST) {ierr = PetscFree(opt->array);CHKERRQ(ierr);}
1625 #if defined(PETSC_HAVE_CUDA)
1626     else {cudaError_t cerr = cudaFree(opt->array);CHKERRCUDA(cerr);opt->array=NULL;}
1627 #endif
1628     ierr = PetscFree(opt);CHKERRQ(ierr);
1629     *out = NULL;
1630   }
1631   PetscFunctionReturn(0);
1632 }
1633 
1634 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1635 {
1636   PetscErrorCode ierr;
1637   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1638   PetscInt       i,j;
1639 
1640   PetscFunctionBegin;
1641   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1642   for (i=0; i<2; i++) { /* Set defaults */
1643     sf->leafstart[i]   = 0;
1644     sf->leafcontig[i]  = PETSC_TRUE;
1645     sf->leafdups[i]    = PETSC_FALSE;
1646     bas->rootstart[i]  = 0;
1647     bas->rootcontig[i] = PETSC_TRUE;
1648     bas->rootdups[i]   = PETSC_FALSE;
1649   }
1650 
1651   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1652   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1653 
1654   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1655   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1656 
1657   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1658   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1659     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1660   }
1661   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1662     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1663   }
1664 
1665   /* If not, see if we can have per-rank optimizations by doing index analysis */
1666   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1667   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1668 
1669   /* Are root indices for self and remote contiguous? */
1670   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1671   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1672 
1673   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1674   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1675 
1676   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1677     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1678   }
1679   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1680     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1681   }
1682 
1683   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1684   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1685 
1686 #if defined(PETSC_HAVE_CUDA)
1687     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1688   if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1689   if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1690   if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1691   if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1692 #endif
1693 
1694   PetscFunctionReturn(0);
1695 }
1696 
1697 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1698 {
1699   PetscErrorCode ierr;
1700   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1701   PetscInt       i;
1702 
1703   PetscFunctionBegin;
1704   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1705     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1706     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1707     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1708     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1709   }
1710   PetscFunctionReturn(0);
1711 }
1712