#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

#if defined(PETSC_HAVE_CUDA)
#include <cuda_runtime.h>
#endif
/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction";
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */

#define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
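/* For example (illustrative), CPPJoin4(Pack,PetscReal,2,1) expands to the identifier
   Pack_PetscReal_2_1; the macros below use it to name the generated routines. */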

/* Operations working like s += t */
#define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR */
#define OP_ASSIGN(op,s,t)   do {(s) = (t);} while(0)
/* Ref MPI MAXLOC */
#define OP_XLOC(op,s,t) \
  do {                                       \
    if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t;           \
  } while(0)
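/* Example (sketch): with op '>', OP_XLOC implements MPI_MAXLOC semantics on pairs {u,i}:
   if s.u == t.u, keep the smaller index; otherwise keep the pair with the larger value.
   E.g., s={5,3}, t={5,1} yields s={5,1}; s={2,7}, t={5,1} yields s={5,1}. */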

/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type      Type of the basic data in an entry, e.g., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS        Block size for vectorization. It is a factor of bs.
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[].
   .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
   .opt       Per-pack optimization plan. NULL means no such plan.
   .idx       Indices of entries to be packed.
   .link      Provides a context for the current call, such as link->bs, the number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
   .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
   -packed    Address of the packed data.
 */
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
  {                                                                                                          \
    PetscErrorCode ierr;                                                                                     \
    const Type     *u = (const Type*)unpacked,*u2;                                                           \
    Type           *p = (Type*)packed,*p2;                                                                   \
    PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
    const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* indices are contiguous */     \
    else if (opt) { /* has optimizations available */                                                        \
      p2 = p;                                                                                                \
      for (r=0; r<opt->n; r++) {                                                                             \
        u2 = u + opt->start[r]*MBS;                                                                          \
        X  = opt->X[r];                                                                                      \
        Y  = opt->Y[r];                                                                                      \
        for (k=0; k<opt->dz[r]; k++)                                                                         \
          for (j=0; j<opt->dy[r]; j++) {                                                                     \
            ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
            p2  += opt->dx[r]*MBS;                                                                           \
          }                                                                                                  \
      }                                                                                                      \
    } else {                                                                                                 \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
          for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
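
/* Illustrative note on the opt branch above: for each range r, the indices to pack form a
   dx[r] x dy[r] x dz[r] sub-block of a 3D array with leading dimensions X[r] and Y[r],
   starting at entry start[r]. Entry (x,y,z) of the sub-block thus lives at
   unpacked[(start[r] + X*Y*z + X*y + x)*bs], so each row of dx[r] entries is contiguous
   and can be copied with a single PetscArraycpy(). */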

/* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
                and inserts into a sparse array.

   Arguments:
  +Type       Type of the data
  .BS         Block size for vectorization
  -EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.

  Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
 */
#define DEF_UnpackFunc(Type,BS,EQ)               \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  {                                                                                                          \
    PetscErrorCode ierr;                                                                                     \
    Type           *u = (Type*)unpacked,*u2;                                                                 \
    const Type     *p = (const Type*)packed;                                                                 \
    PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
    const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) {                                                                                              \
      u += start*MBS;                                                                                        \
      if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
    } else if (opt) { /* has optimizations available */                                                      \
      for (r=0; r<opt->n; r++) {                                                                             \
        u2 = u + opt->start[r]*MBS;                                                                          \
        X  = opt->X[r];                                                                                      \
        Y  = opt->Y[r];                                                                                      \
        for (k=0; k<opt->dz[r]; k++)                                                                         \
          for (j=0; j<opt->dy[r]; j++) {                                                                     \
            ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
            p   += opt->dx[r]*MBS;                                                                           \
          }                                                                                                  \
      }                                                                                                      \
    } else {                                                                                                 \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }

/* DEF_UnpackAndOp - macro defining an UnpackAndOp routine where Op should not be Insert

   Arguments:
  +Opname     Name of the Op, such as Add, Mult, LAND, etc.
  .Type       Type of the data
  .BS         Block size for vectorization
  .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
  .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
  -OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
 */
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  {                                                                                                          \
    Type           *u = (Type*)unpacked,*u2;                                                                 \
    const Type     *p = (const Type*)packed;                                                                 \
    PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
    const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) {                                                                                              \
      u += start*MBS;                                                                                        \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++)                                                                               \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
    } else if (opt) { /* idx[] has patterns */                                                               \
      for (r=0; r<opt->n; r++) {                                                                             \
        u2 = u + opt->start[r]*MBS;                                                                          \
        X  = opt->X[r];                                                                                      \
        Y  = opt->Y[r];                                                                                      \
        for (k=0; k<opt->dz[r]; k++)                                                                         \
          for (j=0; j<opt->dy[r]; j++) {                                                                     \
            for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
            p += opt->dx[r]*MBS;                                                                             \
          }                                                                                                  \
      }                                                                                                      \
    } else {                                                                                                 \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++)                                                                               \
            OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
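
/* For example (sketch), DEF_UnpackAndOp(PetscReal,2,1,Add,+,OP_BINARY) generates
   UnpackAndAdd_PetscReal_2_1(), which for each i does u[idx[i]*2+k] += p[i*2+k], k=0,1. */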

#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
  {                                                                                                          \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
    PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
    const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
    const PetscInt MBS = M*BS;                                                                               \
    PetscFunctionBegin;                                                                                      \
    for (i=0; i<count; i++) {                                                                                \
      r = (!idx ? start+i : idx[i])*MBS;                                                                     \
      l = i*MBS;                                                                                             \
      for (j=0; j<M; j++)                                                                                    \
        for (k=0; k<BS; k++) {                                                                               \
          tmp = u[r+j*BS+k];                                                                                 \
          OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
          p[l+j*BS+k] = tmp;                                                                                 \
        }                                                                                                    \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
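
/* Sketch of the FetchAndOp semantics: per entry the generated routine does
     tmp = u[loc]; u[loc] = u[loc] op p[i]; p[i] = tmp;
   i.e., packed[] returns the values the unpacked data had before the update. */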

#define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
  {                                                                                                          \
    PetscErrorCode ierr;                                                                                     \
    const Type     *u = (const Type*)src;                                                                    \
    Type           *v = (Type*)dst;                                                                          \
    PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
    const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
    const PetscInt MBS = M*BS;                                                                               \
    PetscFunctionBegin;                                                                                      \
    if (!srcIdx) { /* src is contiguous */                                                                   \
      u += srcStart*MBS;                                                                                     \
      ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
      u += srcOpt->start[0]*MBS;                                                                             \
      v += dstStart*MBS;                                                                                     \
      X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
      for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
        for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
          for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
          v += srcOpt->dx[0]*MBS;                                                                            \
        }                                                                                                    \
    } else { /* all other cases */                                                                           \
      for (i=0; i<count; i++) {                                                                              \
        s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
        t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
      }                                                                                                      \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
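
/* Note (sketch): ScatterAndOp routines combine data between two local arrays directly,
   without an intermediate packed buffer; when the source is contiguous they simply
   delegate to the corresponding UnpackAndOp routine above. */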

#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
  {                                                                                                          \
    Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
    const Type     *ldata = (const Type*)leafdata;                                                           \
    PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
    const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
    const PetscInt MBS = M*BS;                                                                               \
    PetscFunctionBegin;                                                                                      \
    for (i=0; i<count; i++) {                                                                                \
      r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
      l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
      for (j=0; j<M; j++)                                                                                    \
        for (k=0; k<BS; k++) {                                                                               \
          lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
          OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
        }                                                                                                    \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }

/* Pack, Unpack/Fetch ops */
#define DEF_Pack(Type,BS,EQ)                                                      \
  DEF_PackFunc(Type,BS,EQ)                                                        \
  DEF_UnpackFunc(Type,BS,EQ)                                                      \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
    link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
    link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
    link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
  }

/* Add, Mult ops */
#define DEF_Add(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
  }

/* Max, Min ops */
#define DEF_Cmp(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
    link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
    link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
    link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
    link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
  }

/* Logical ops.
  The operator passed to OP_LXOR should be empty, but we pass ||. It is not used; it is
  only there to avoid the compilation warning "empty macro arguments are undefined in ISO C90"
 */
#define DEF_Log(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
    link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
    link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
    link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
    link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
    link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
    link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
  }

/* Bitwise ops */
#define DEF_Bit(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
    link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
    link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
    link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
    link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
    link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
    link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
  }

/* Maxloc, Minloc ops */
#define DEF_Xloc(Type,BS,EQ)                                                      \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
  }

#define DEF_IntegerType(Type,BS,EQ)                                               \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Add(Type,BS,EQ)                                                             \
  DEF_Cmp(Type,BS,EQ)                                                             \
  DEF_Log(Type,BS,EQ)                                                             \
  DEF_Bit(Type,BS,EQ)                                                             \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
  }

#define DEF_RealType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Add(Type,BS,EQ)                                                             \
  DEF_Cmp(Type,BS,EQ)                                                             \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
  }

#if defined(PETSC_HAVE_COMPLEX)
#define DEF_ComplexType(Type,BS,EQ)                                               \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Add(Type,BS,EQ)                                                             \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
  }
#endif

#define DEF_DumbType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
  }

/* Maxloc, Minloc */
#define DEF_PairType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Xloc(Type,BS,EQ)                                                            \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
  }

DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
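
/* For example, a unit of 6 MPIU_INTs gives bs=6, which falls through to the BS=2, EQ=0
   variant above (6 is divisible by 2 but not by 4 or 8); see the dispatch ladder in
   PetscSFLinkSetUp_Host() below. */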

#if defined(PETSC_USE_64BIT_INDICES) /* No need (though it would be OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get typenames without spaces that CPPJoin4 can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u; int i;}           PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)

#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif

/*
   This routine creates a communication link for the given operation. It first looks in the link cache. If
   there is a free & suitable link, it reuses it; otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines that pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we do not allocate them. Even when we allocate buffers, we only allocate
   those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when one does not use gpu-aware MPI but the data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and need to
   pack/unpack data.
*/
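/* Typical call sequence (sketch): the SFBasic implementation of PetscSFBcastBegin() calls
   PetscSFLinkCreate() with sfop=PETSCSF_BCAST, packs rootdata into link->rootbuf, and starts
   MPI with requests from PetscSFLinkGetMPIBuffersAndRequests(); the matching End routine
   finds the link again with PetscSFLinkGetInUse() and returns it with PetscSFLinkReclaim(). */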
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode    ierr;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink       *p,link;
  PetscSFDirection  direction;
  MPI_Request       *reqs = NULL;
  PetscBool         match,rootdirect[2],leafdirect[2];
  PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdata (i.e., set root/leafdirect) with the given sf, sfop and op? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test whether the data used to init the MPI requests matches the current data.
         If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
      */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

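  /* Layout of link->reqs (sketch): eight groups of root requests indexed by
     [direction][memtype][root/leafdirect] at offset nrootreqs*(4*i+2*j+k), followed by
     eight analogous groups of leaf requests. */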
  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:
  if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}

  /* Allocate buffers for root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

  /* Allocate buffers on the host for buffering device data when not using gpu-aware MPI */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }

  /* Set the `current` state of the link. It may change between SF invocations that reuse the link */
  if (sf->persistent) { /* If the data is directly passed to MPI and used to init MPI requests, record it for comparison in future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi  = rootdirect_mpi;
  link->leafdirect_mpi  = leafdirect_mpi;
  link->rootmtype       = rootmtype;
  link->leafmtype       = leafmtype;
  link->rootmtype_mpi   = rootmtype_mpi;
  link->leafmtype_mpi   = leafmtype_mpi;

  link->next            = bas->inuse;
  bas->inuse            = link;
  *mylink               = link;
  PetscFunctionReturn(0);
}

/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode       ierr;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt       *rootoffset,*leafoffset;
  PetscMPIInt          n;
  MPI_Aint             disp;
  MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype         unit = link->unit;
  const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not done yet. Currently only SFBasic uses persistent MPI requests */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
{
  PetscErrorCode    ierr;
  PetscSFLink       link,*p;
  PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Look for a link matching the given type and root/leafdata among the links in use */
  for (p=&bas->inuse; (link=*p); p=&link->next) {
    PetscBool match;
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
      switch (cmode) {
      case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
      case PETSC_USE_POINTER: break;
      default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
      }
      *mylink = link;
      PetscFunctionReturn(0);
    }
  }
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
{
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  (*link)->rootdata = NULL;
  (*link)->leafdata = NULL;
  (*link)->next     = bas->avail;
  bas->avail        = *link;
  *link             = NULL;
  PetscFunctionReturn(0);
}

/* Destroy all links chained in 'avail' */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode    ierr;
  PetscSFLink       link = *avail,next;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
#if defined(PETSC_HAVE_CUDA)
      ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
#endif
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}

#if defined(PETSC_USE_DEBUG)
/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode    ierr;
  PetscSFLink       link,*p;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscBool         match;

  PetscFunctionBegin;
  /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
     the potential overlap since this process does not participate in the communication; the overlap is then harmless.
  */
  if (rootdata || leafdata) {
    for (p=&bas->inuse; (link=*p); p=&link->next) {
      ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
      if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
    }
  }
  PetscFunctionReturn(0);
}
#endif

PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
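    /* Pick the largest block size BS that divides bs (sketch): e.g., bs=8 PetscReals selects
       PackInit_RealType_PetscReal_8_1(); bs=6 falls through to PackInit_RealType_PetscReal_2_0(). */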
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) {
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  }  else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
892 #endif
893   } else {
894     MPI_Aint lb,nbyte;
895     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
896     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
897     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
898       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
899       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
900       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
901       link->bs        = nbyte;
902       link->unitbytes = nbyte;
903       link->basicunit = MPI_BYTE;
904     } else {
905       nInt = nbyte / sizeof(int);
906       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
907       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
908       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
909       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
910       link->bs        = nInt;
911       link->unitbytes = nbyte;
912       link->basicunit = MPI_INT;
913     }
914     if (link->isbuiltin) link->unit = unit;
915   }
916 
917   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
918   PetscFunctionReturn(0);
919 }
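
/* Illustrative sketch (hypothetical caller code, not part of this file): a unit built as three
   contiguous MPIU_REALs is recognized by the analysis above as nPetscReal=3, hence it takes the
   PackInit_RealType_PetscReal_1_0 branch with link->bs=3, link->unitbytes=3*sizeof(PetscReal) and
   link->basicunit=MPIU_REAL; its combiner is not MPI_COMBINER_NAMED, so the unit is duplicated
   with MPI_Type_dup at the end.

     MPI_Datatype unit;
     ierr = MPI_Type_contiguous(3,MPIU_REAL,&unit);CHKERRQ(ierr);
     ierr = MPI_Type_commit(&unit);CHKERRQ(ierr);
     ierr = PetscSFBcastBegin(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
*/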
920 
921 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
922 {
923   PetscFunctionBegin;
924   *UnpackAndOp = NULL;
925   if (mtype == PETSC_MEMTYPE_HOST) {
926     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
927     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
928     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
929     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
930     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
931     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
932     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
933     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
934     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
935     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
936     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
937     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
938     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
939   }
940 #if defined(PETSC_HAVE_CUDA)
941   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
942     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
943     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
944     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
945     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
946     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
947     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
948     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
949     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
950     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
951     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
952     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
953     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
954     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
955   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
956     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
957     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
958     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
959     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
960     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
961     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
962     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
963     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
964     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
965     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
966     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
967     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
968     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
969   }
970 #endif
971   PetscFunctionReturn(0);
972 }
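
/* Illustrative usage sketch (hypothetical variable names): callers look up a kernel and fall back
   to MPI_Reduce_local when none exists for the given op, as PetscSFLinkUnpackRootData below does.

     PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
     ierr = PetscSFLinkGetUnpackAndOp(link,PETSC_MEMTYPE_HOST,MPI_SUM,PETSC_FALSE,&UnpackAndOp);CHKERRQ(ierr);
     if (UnpackAndOp) {ierr = (*UnpackAndOp)(link,count,start,opt,indices,rootdata,rootbuf);CHKERRQ(ierr);}
*/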
973 
974 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
975 {
976   PetscFunctionBegin;
977   *ScatterAndOp = NULL;
978   if (mtype == PETSC_MEMTYPE_HOST) {
979     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
980     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
981     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
982     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
983     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
984     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
985     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
986     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
987     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
988     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
989     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
990     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
991     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
992   }
993 #if defined(PETSC_HAVE_CUDA)
994   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
995     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
996     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
997     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
998     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
999     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1000     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1001     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1002     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1003     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1004     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1005     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1006     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1007     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1008   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1009     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1010     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1011     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1012     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1013     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1014     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1015     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1016     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1017     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1018     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1019     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1020     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1021     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1022   }
1023 #endif
1024   PetscFunctionReturn(0);
1025 }
1026 
1027 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1028 {
1029   PetscFunctionBegin;
1030   *FetchAndOp = NULL;
1031   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"FetchAndOp only supports MPI_SUM/MPIU_SUM");
1032   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1033 #if defined(PETSC_HAVE_CUDA)
1034   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1035   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1036 #endif
1037   PetscFunctionReturn(0);
1038 }
1039 
1040 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1041 {
1042   PetscFunctionBegin;
1043   *FetchAndOpLocal = NULL;
1044   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"FetchAndOp only supports MPI_SUM/MPIU_SUM");
1045   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1046 #if defined(PETSC_HAVE_CUDA)
1047   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1048   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1049 #endif
1050   PetscFunctionReturn(0);
1051 }
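
/* Illustrative sketch: the atomic argument of the getters above is normally the dups flag
   computed in PetscSFSetUpPackFields, so device kernels pay for atomics only when duplicate
   indices could race, e.g. (as in PetscSFLinkFetchRootData below):

     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
*/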
1052 
1053 /*=============================================================================
1054               A set of helper routines for Pack/Unpack/Scatter on GPUs
1055  ============================================================================*/
1056 #if defined(PETSC_HAVE_CUDA)
1057 /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
1058    make sure the data is ready for packing.
1059  */
1060 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
1061 {
1062   PetscFunctionBegin;
1063   if (sf->use_default_stream) PetscFunctionReturn(0);
1064   if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
1065     cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
1066   }
1067   PetscFunctionReturn(0);
1068 }
1069 
1070 /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
1071 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
1072 {
1073   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1074 
1075   PetscFunctionBegin;
1076   /* Do nothing if we use stream-aware MPI or there is nothing for the remote */
1077   if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
1078   /* Sync if we called a packing kernel, or we async-copied rootdata from device to host, or no cudaDeviceSynchronize was called (since the default stream is assumed) */
1079   if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1080     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1081   }
1082   PetscFunctionReturn(0);
1083 }
1084 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
1085 {
1086   PetscFunctionBegin;
1087   /* See comments above */
1088   if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
1089   if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1090     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1091   }
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
1096    does not know which stream the callers will use.
1097 */
1098 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
1099 {
1100   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1101   PetscBool      host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
1102 
1103   PetscFunctionBegin;
1104   /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */
1105   if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
1106 
1107   /* Whether rootmtype is HOST or DEVICE:
1108      If we have data from local, then we called a scatter kernel (on link->stream) and we must sync it;
1109      If we have data from remote && no rootdirect (i.e., we called an unpack kernel), then we must also sync it (if rootdirect,
1110      i.e., there is no unpack kernel after MPI, then MPI guarantees rootbuf is ready to use, so we do not need the sync).
1111 
1112      Note a tricky case: when leafmtype=DEVICE and rootmtype=HOST on a uni-processor, we must sync the stream, otherwise the
1113      CPU thread might use the yet-to-be-updated rootdata pending in the stream.
1114    */
1115   if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
1116     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1117   }
1118   PetscFunctionReturn(0);
1119 }
1120 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
1121 {
1122   PetscBool      host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
1123 
1124   PetscFunctionBegin;
1125   /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData */
1126   if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
1127   if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
1128     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1129   }
1130   PetscFunctionReturn(0);
1131 }
1132 
1133 /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if MPI is not GPU-aware (use_gpu_aware_mpi
1134    is false), we need to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
1135 */
1136 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1137 {
1138   PetscErrorCode ierr;
1139   cudaError_t    cerr;
1140   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1141 
1142   PetscFunctionBegin;
1143   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
1144     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1145     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1146     size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
1147     if (device2host) {
1148       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1149       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
1150     } else {
1151       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1152       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
1153     }
1154   }
1155   PetscFunctionReturn(0);
1156 }
1157 
1158 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1159 {
1160   PetscErrorCode ierr;
1161   cudaError_t    cerr;
1162 
1163   PetscFunctionBegin;
1164   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
1165     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1166     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1167     size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
1168     if (device2host) {
1169       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1170       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
1171     } else {
1172       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1173       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
1174     }
1175   }
1176   PetscFunctionReturn(0);
1177 }
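
/* Assumed overall staging flow when MPI is not GPU-aware (a sketch, not extra code):
     1. pack rootdata on the device into link->rootbuf[REMOTE][DEVICE] (kernel on link->stream);
     2. cudaMemcpyAsync device-to-host on link->stream (the device2host branch above);
     3. cudaStreamSynchronize(link->stream) in PetscSFLinkSyncStreamAfterPackRootData;
     4. MPI sends/receives use link->rootbuf[REMOTE][HOST];
   and the host2device branch runs the reverse copy after MPI completes. */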
1178 #else
1179 
1180 #define PetscSFLinkSyncDeviceBeforePackData(a,b)                0
1181 #define PetscSFLinkSyncStreamAfterPackRootData(a,b)             0
1182 #define PetscSFLinkSyncStreamAfterPackLeafData(a,b)             0
1183 #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)           0
1184 #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)           0
1185 #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1186 #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1187 
1188 #endif
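
/* Without CUDA the helpers above expand to the literal 0, so a call site such as
   "ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);" compiles to a
   harmless "ierr = 0" and incurs no device work. */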
1189 
1190 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1191 {
1192   PetscErrorCode ierr;
1193   PetscLogDouble flops;
1194   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1195 
1196 
1197   PetscFunctionBegin;
1198   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1199     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1200 #if defined(PETSC_HAVE_CUDA)
1201     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1202 #endif
1203     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1204   }
1205   PetscFunctionReturn(0);
1206 }
1207 
1208 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1209 {
1210   PetscLogDouble flops;
1211   PetscErrorCode ierr;
1212 
1213   PetscFunctionBegin;
1214   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1215     flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
1216 #if defined(PETSC_HAVE_CUDA)
1217     if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1218 #endif
1219     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1220   }
1221   PetscFunctionReturn(0);
1222 }
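
/* Example of the accounting above: with a unit of 4 MPIU_SCALARs (link->bs=4) and
   leafbuflen[scope]=1000, an MPIU_SUM unpack logs 4000 flops, attributed to the GPU
   when leafmtype is PETSC_MEMTYPE_DEVICE and to the CPU otherwise. */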
1223 
1224 /* When SF cannot find a proper UnpackAndOp() in the link, it falls back to MPI_Reduce_local.
1225   Input Arguments:
1226   +sf      - The StarForest
1227   .link    - The link
1228   .count   - Number of entries to unpack
1229   .start   - The first index, significant when indices=NULL
1230   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1231   .buf     - A contiguous buffer to unpack from
1232   -op      - Operation after unpack
1233 
1234   Output Arguments:
1235   .data    - The data to unpack to
1236 */
1237 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1238 {
1239   PetscFunctionBegin;
1240 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1241   {
1242     PetscErrorCode ierr;
1243     PetscInt       i;
1244     PetscMPIInt    n;
1245     if (indices) {
1246       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc., it operates
1247          element-wise on the basic units of a root/leaf. Otherwise, it is meant to operate on a whole root/leaf.
1248       */
1249       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
1250     } else {
1251       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
1252       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
1253     }
1254   }
1255 #else
1256   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1257 #endif
1258   PetscFunctionReturn(0);
1259 }
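
/* Illustrative sketch (hypothetical user code): a user-defined commutative MPI_Op has no
   specialized unpack kernel, so reductions with it reach the fallback above, one
   MPI_Reduce_local call per indexed entry.

     MPI_Op myop;
     ierr = MPI_Op_create(MyReduceFn,1,&myop);CHKERRQ(ierr);   (MyReduceFn is a hypothetical user function)
     ierr = PetscSFReduceBegin(sf,unit,leafdata,rootdata,myop);CHKERRQ(ierr);
     ierr = PetscSFReduceEnd(sf,unit,leafdata,rootdata,myop);CHKERRQ(ierr);
*/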
1260 
1261 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1262 {
1263   PetscFunctionBegin;
1264 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1265   {
1266     PetscErrorCode ierr;
1267     PetscInt       i,disp;
1268     if (!srcIdx) {
1269       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
1270     } else {
1271       for (i=0; i<count; i++) {
1272         disp = dstIdx? dstIdx[i] : dstStart + i;
1273         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
1274       }
1275     }
1276   }
1277 #else
1278   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1279 #endif
1280   PetscFunctionReturn(0);
1281 }
1282 
1283 /*=============================================================================
1284               Pack/Unpack/Fetch/Scatter routines
1285  ============================================================================*/
1286 
1287 /* Pack rootdata to rootbuf
1288   Input Arguments:
1289   + sf       - The SF this packing works on.
1290   . link     - It gives the memtype of the roots and also provides the root buffer.
1291   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1292   - rootdata - Where to read the roots.
1293 
1294   Notes:
1295   When rootdata can be directly used as the root buffer, the routine is almost a no-op. After the call, the root data is
1296   in a place the underlying MPI can access (whether use_gpu_aware_mpi is true or not).
1297  */
1298 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1299 {
1300   PetscErrorCode   ierr;
1301   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1302   const PetscInt   *rootindices = NULL;
1303   PetscInt         count,start;
1304   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1305   PetscMemType     rootmtype = link->rootmtype;
1306   PetscSFPackOpt   opt = NULL;
1307 
1308 
1309   PetscFunctionBegin;
1310   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1311   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1312   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1313     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1314     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1315     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1316   }
1317   if (scope == PETSCSF_REMOTE) {
1318     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1319     ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
1320   }
1321   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1322   PetscFunctionReturn(0);
1323 }
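
/* Assumed call sequence in a remote communication round (a simplified sketch of how the
   pack/unpack routines here are combined by the SF implementation):

     ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_REMOTE,rootdata);CHKERRQ(ierr);
     ... start and finish MPI on link->rootbuf / link->leafbuf ...
     ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_REMOTE,leafdata,op);CHKERRQ(ierr);
*/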
1324 
1325 /* Pack leafdata to leafbuf */
1326 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1327 {
1328   PetscErrorCode   ierr;
1329   const PetscInt   *leafindices = NULL;
1330   PetscInt         count,start;
1331   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1332   PetscMemType     leafmtype = link->leafmtype;
1333   PetscSFPackOpt   opt = NULL;
1334 
1335   PetscFunctionBegin;
1336   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1337   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1338   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
1339     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1340     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1341     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1342   }
1343   if (scope == PETSCSF_REMOTE) {
1344     ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1345     ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
1346   }
1347   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1348   PetscFunctionReturn(0);
1349 }
1350 
1351 /* Unpack rootbuf to rootdata */
1352 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1353 {
1354   PetscErrorCode   ierr;
1355   const PetscInt   *rootindices = NULL;
1356   PetscInt         count,start;
1357   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1358   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1359   PetscMemType     rootmtype = link->rootmtype;
1360   PetscSFPackOpt   opt = NULL;
1361 
1362   PetscFunctionBegin;
1363   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1364   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1365   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1366     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1367     if (UnpackAndOp) {
1368       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1369       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1370     } else {
1371       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1372       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1373     }
1374   }
1375   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
1376   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1377   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1378   PetscFunctionReturn(0);
1379 }
1380 
1381 /* Unpack leafbuf to leafdata */
1382 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1383 {
1384   PetscErrorCode   ierr;
1385   const PetscInt   *leafindices = NULL;
1386   PetscInt         count,start;
1387   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1388   PetscMemType     leafmtype = link->leafmtype;
1389   PetscSFPackOpt   opt = NULL;
1390 
1391   PetscFunctionBegin;
1392   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1393   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1394   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1395     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1396     if (UnpackAndOp) {
1397       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1398       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1399     } else {
1400       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1401       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1402     }
1403   }
1404   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
1405   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1406   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1407   PetscFunctionReturn(0);
1408 }
1409 
1410 /* FetchAndOp rootdata with rootbuf */
1411 PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1412 {
1413   PetscErrorCode     ierr;
1414   const PetscInt     *rootindices = NULL;
1415   PetscInt           count,start;
1416   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1417   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1418   PetscMemType       rootmtype = link->rootmtype;
1419   PetscSFPackOpt     opt = NULL;
1420 
1421   PetscFunctionBegin;
1422   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1423   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1424   if (bas->rootbuflen[scope]) {
1425     /* Do FetchAndOp on rootdata with rootbuf */
1426     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
1427     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1428     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1429   }
1430   if (scope == PETSCSF_REMOTE) {
1431     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
1432     ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
1433   }
1434   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1435   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1436   PetscFunctionReturn(0);
1437 }
1438 
1439 /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1440 PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1441 {
1442   PetscErrorCode       ierr;
1443   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1444   PetscInt             count,rootstart,leafstart;
1445   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1446   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1447   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1448   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1449 
1450   PetscFunctionBegin;
1451   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1452   if (rootmtype != leafmtype) { /* Uncommon case */
1453     /* The local communication has to go through pack and unpack */
1454     ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
1455     ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1456     ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1457   } else {
1458     ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1459     if (ScatterAndOp) {
1460       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1461       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1462       ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1463     } else {
1464       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1465       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1466       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1467     }
1468   }
1469   PetscFunctionReturn(0);
1470 }
1471 
1472 /* Reduce leafdata to rootdata locally */
1473 PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1474 {
1475   PetscErrorCode       ierr;
1476   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1477   PetscInt             count,rootstart,leafstart;
1478   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1479   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1480   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1481   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1482 
1483   PetscFunctionBegin;
1484   if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1485   if (rootmtype != leafmtype) {
1486     /* The local communication has to go through pack and unpack */
1487     ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
1488     ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1489     ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1490   } else {
1491     ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1492     if (ScatterAndOp) {
1493       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1494       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1495       ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
1496     } else {
1497       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1498       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1499       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1500     }
1501   }
1502   PetscFunctionReturn(0);
1503 }
1504 
1505 /* Fetch rootdata to leafdata and leafupdate locally */
1506 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1507 {
1508   PetscErrorCode       ierr;
1509   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1510   PetscInt             count,rootstart,leafstart;
1511   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1512   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1513   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1514   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1515 
1516   PetscFunctionBegin;
1517   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1518   if (rootmtype != leafmtype) {
1519     /* Local communication with rootdata and leafdata on different memory types would have to go through pack and unpack, which is not supported here */
1520     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
1521   } else {
1522     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1523     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1524     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1525     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1526   }
1527   PetscFunctionReturn(0);
1528 }
1529 
1530 /*
1531   Create per-rank pack/unpack optimizations based on index patterns
1532 
1533    Input Parameters:
1534   +  n       - Number of destination ranks
1535   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1536   -  idx     - [*]   Array storing indices
1537 
1538    Output Parameters:
1539   +  out     - Pack optimizations. NULL if there are no optimizations.
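
   Example:
   For one rank with idx[] = {100,101,102,110,111,112}, the search below finds start=100 and dx=3; the
   jump idx[3]-start=10 gives X=10, then dy=2 and dz=1 (Y=2), i.e., the indices form a 3x2 block of a
   row-major array with row length X=10, so pack/unpack can use strided loops instead of idx[].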
1540 */
1541 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1542 {
1543   PetscErrorCode ierr;
1544   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1545   PetscBool      optimizable = PETSC_TRUE;
1546   PetscSFPackOpt opt;
1547 
1548   PetscFunctionBegin;
1549   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1550   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1551   opt->n      = opt->array[0] = n;
1552   opt->offset = opt->array + 1;
1553   opt->start  = opt->array + n   + 2;
1554   opt->dx     = opt->array + 2*n + 2;
1555   opt->dy     = opt->array + 3*n + 2;
1556   opt->dz     = opt->array + 4*n + 2;
1557   opt->X      = opt->array + 5*n + 2;
1558   opt->Y      = opt->array + 6*n + 2;
1559 
1560   for (r=0; r<n; r++) { /* For each destination rank */
1561     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1562     p     = offset[r];
1563     start = idx[p]; /* First index for this rank */
1564     p++;
1565 
1566     /* Search in X dimension */
1567     for (dx=1; dx<m; dx++,p++) {
1568       if (start+dx != idx[p]) break;
1569     }
1570 
1571     dydz = m/dx;
1572     X    = dydz > 1 ? (idx[p]-start) : dx;
1573     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1574     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1575     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1576       for (i=0; i<dx; i++,p++) {
1577         if (start+X*dy+i != idx[p]) {
1578           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1579           else goto Z_dimension;
1580         }
1581       }
1582     }
1583 
1584 Z_dimension:
1585     dz = m/(dx*dy);
1586     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1587     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1588     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1589     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1590       for (j=0; j<dy; j++) {
1591         for (i=0; i<dx; i++,p++) {
1592           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1593         }
1594       }
1595     }
1596     opt->start[r] = start;
1597     opt->dx[r]    = dx;
1598     opt->dy[r]    = dy;
1599     opt->dz[r]    = dz;
1600     opt->X[r]     = X;
1601     opt->Y[r]     = Y;
1602   }
1603 
1604 finish:
1605   /* If not optimizable, free arrays to save memory */
1606   if (!n || !optimizable) {
1607     ierr = PetscFree(opt->array);CHKERRQ(ierr);
1608     ierr = PetscFree(opt);CHKERRQ(ierr);
1609     *out = NULL;
1610   } else {
1611     opt->offset[0] = 0;
1612     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1613     *out = opt;
1614   }
1615   PetscFunctionReturn(0);
1616 }
1617 
1618 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
1619 {
1620   PetscErrorCode ierr;
1621   PetscSFPackOpt opt = *out;
1622 
1623   PetscFunctionBegin;
1624   if (opt) {
1625     if (mtype == PETSC_MEMTYPE_HOST) {ierr = PetscFree(opt->array);CHKERRQ(ierr);}
1626 #if defined(PETSC_HAVE_CUDA)
1627     else {cudaError_t cerr = cudaFree(opt->array);CHKERRCUDA(cerr);opt->array=NULL;}
1628 #endif
1629     ierr = PetscFree(opt);CHKERRQ(ierr);
1630     *out = NULL;
1631   }
1632   PetscFunctionReturn(0);
1633 }
1634 
1635 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1636 {
1637   PetscErrorCode ierr;
1638   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1639   PetscInt       i,j;
1640 
1641   PetscFunctionBegin;
1642   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1643   for (i=0; i<2; i++) { /* Set defaults */
1644     sf->leafstart[i]   = 0;
1645     sf->leafcontig[i]  = PETSC_TRUE;
1646     sf->leafdups[i]    = PETSC_FALSE;
1647     bas->rootstart[i]  = 0;
1648     bas->rootcontig[i] = PETSC_TRUE;
1649     bas->rootdups[i]   = PETSC_FALSE;
1650   }
1651 
1652   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1653   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1654 
1655   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1656   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1657 
1658   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1659   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1660     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1661   }
1662   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1663     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1664   }
1665 
1666   /* If not, see if we can have per-rank optimizations by doing index analysis */
1667   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1668   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1669 
1670   /* Are root indices for self and remote contiguous? */
1671   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1672   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1673 
1674   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1675   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1676 
1677   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1678     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1679   }
1680   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1681     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1682   }
1683 
1684   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1685   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1686 
1687 #if defined(PETSC_HAVE_CUDA)
1688   /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
1689   if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1690   if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1691   if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1692   if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1693 #endif
1694 
1695   PetscFunctionReturn(0);
1696 }
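
/* Example of the analysis above: if a rank's remote leaf indices are {7,8,9,10}, then
   leafcontig[1]=PETSC_TRUE with leafstart[1]=7 and packing degenerates to a contiguous copy;
   if they are {7,9,8,10}, leafcontig[1]=PETSC_FALSE, PetscSFCreatePackOpt finds no strided
   pattern, and (with CUDA) PetscCheckDupsInt reports no duplicates, so device unpacking can
   still avoid atomics. */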
1697 
1698 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1699 {
1700   PetscErrorCode ierr;
1701   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1702   PetscInt       i;
1703 
1704   PetscFunctionBegin;
1705   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1706     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1707     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1708     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1709     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1710   }
1711   PetscFunctionReturn(0);
1712 }
1713