xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 0e03b746557e2551025fde0294144c0532d12f68)
1 
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6 #include <cuda_runtime.h>
7 #endif
8 /*
9  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
10  * therefore we pack data types manually. This file defines packing routines for the standard data types.
11  */
12 
/* CPPJoin4(a,b,c,d) pastes its four arguments into the single identifier a_b_c_d,
   used below to synthesize unique routine names such as Pack_PetscInt_8_1. */
#define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
14 
/* Operations working like s += t */
#define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR; the op argument is not used */
#define OP_ASSIGN(op,s,t)   do {(s) = (t);} while(0)               /* plain insertion; the op argument is not used */
/* Ref MPI MAXLOC/MINLOC: s,t are value-index pairs with fields .u (value) and .i (index).
   On equal values keep the smaller index; otherwise keep the pair whose value wins under
   op (> for Maxloc, < for Minloc, see DEF_Xloc below). */
#define OP_XLOC(op,s,t) \
  do {                                       \
    if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t;           \
  } while(0)
26 
/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS        Block size for vectorization. It is a factor of bs.
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[].
   .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
   .opt       Per-pack optimization plan. NULL means no such plan.
   .idx       Indices of entries to be packed.
   .link      Provides a context for the current call, such as link->bs, the number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
   .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
   -packed    Address of the packed data.
 */
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
  {                                                                                                          \
    PetscErrorCode ierr;                                                                                     \
    const Type     *u = (const Type*)unpacked,*u2;                                                           \
    Type           *p = (Type*)packed,*p2;                                                                   \
    PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
    const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
    else if (opt) { /* has optimizations available */                                                        \
      p2 = p;                                                                                                \
      for (r=0; r<opt->n; r++) { /* opt describes opt->n 3D blocks; block r starts at entry opt->start[r] */ \
        u2 = u + opt->start[r]*MBS;                                                                          \
        X  = opt->X[r]; /* row stride; with Y it gives the plane stride X*Y used in (X*Y*k+X*j) below */     \
        Y  = opt->Y[r];                                                                                      \
        for (k=0; k<opt->dz[r]; k++)                                                                         \
          for (j=0; j<opt->dy[r]; j++) { /* each row of dx[r] entries is contiguous, so memcpy it */         \
            ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
            p2  += opt->dx[r]*MBS;                                                                           \
          }                                                                                                  \
      }                                                                                                      \
    } else { /* general case: gather entry by entry through idx[] */                                         \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
          for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
74 
/* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
                    and inserts it into a sparse array.

   Arguments:
  +Type       Type of the data
  .BS         Block size for vectorization
  -EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.

  Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
 */
#define DEF_UnpackFunc(Type,BS,EQ)               \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  {                                                                                                          \
    PetscErrorCode ierr;                                                                                     \
    Type           *u = (Type*)unpacked,*u2;                                                                 \
    const Type     *p = (const Type*)packed;                                                                 \
    PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
    const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) { /* destination entries are contiguous starting at 'start' */                                 \
      u += start*MBS;                                                                                        \
      if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} /* skip a self-to-self copy */        \
    } else if (opt) { /* has optimizations available: opt->n 3D blocks, see DEF_PackFunc */                  \
      for (r=0; r<opt->n; r++) {                                                                             \
        u2 = u + opt->start[r]*MBS;                                                                          \
        X  = opt->X[r];                                                                                      \
        Y  = opt->Y[r];                                                                                      \
        for (k=0; k<opt->dz[r]; k++)                                                                         \
          for (j=0; j<opt->dy[r]; j++) { /* rows of dx[r] entries are contiguous, so memcpy them */          \
            ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
            p   += opt->dx[r]*MBS;                                                                           \
          }                                                                                                  \
      }                                                                                                      \
    } else { /* general case: scatter entry by entry through idx[] */                                        \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
117 
118 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
119 
120    Arguments:
121   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
122   .Type       Type of the data
123   .BS         Block size for vectorization
124   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
125   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
126   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
127  */
/* Defines UnpackAnd<Opname>: u[dest] = u[dest] Op p[i] for every packed entry. */
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  {                                                                                                          \
    Type           *u = (Type*)unpacked,*u2;                                                                 \
    const Type     *p = (const Type*)packed;                                                                 \
    PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
    const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) { /* destination entries are contiguous starting at 'start' */                                 \
      u += start*MBS;                                                                                        \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++)                                                                               \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
    } else if (opt) { /* idx[] has patterns: opt->n 3D blocks with row stride X, plane stride X*Y */         \
      for (r=0; r<opt->n; r++) {                                                                             \
        u2 = u + opt->start[r]*MBS;                                                                          \
        X  = opt->X[r];                                                                                      \
        Y  = opt->Y[r];                                                                                      \
        for (k=0; k<opt->dz[r]; k++)                                                                         \
          for (j=0; j<opt->dy[r]; j++) {                                                                     \
            for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
            p += opt->dx[r]*MBS;                                                                             \
          }                                                                                                  \
      }                                                                                                      \
    } else { /* general case: apply Op entry by entry through idx[] */                                       \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++)                                                                               \
            OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
162 
/* Defines FetchAnd<Opname>: for each entry, return the current unpacked value in
   packed[] and combine the incoming packed value into unpacked[] with Op
   (a serial fetch-and-op; no atomics involved). */
#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
  {                                                                                                          \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,old;                                              \
    PetscInt       i,j,k,uoff,poff,bs=link->bs;                                                              \
    const PetscInt M = (EQ) ? 1 : bs/BS;  /* const 1 when EQ, so the j loop folds away */                    \
    const PetscInt MBS = M*BS;                                                                               \
    PetscFunctionBegin;                                                                                      \
    for (i=0; i<count; i++) {                                                                                \
      uoff = (idx ? idx[i] : start+i)*MBS; /* indexed or contiguous addressing of unpacked data */           \
      poff = i*MBS;                                                                                          \
      for (j=0; j<M; j++)                                                                                    \
        for (k=0; k<BS; k++) {                                                                               \
          old = u[uoff+j*BS+k];                                                                              \
          OpApply(Op,u[uoff+j*BS+k],p[poff+j*BS+k]);                                                         \
          p[poff+j*BS+k] = old; /* hand the pre-op value back to the caller's buffer */                      \
        }                                                                                                    \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
183 
/* Defines ScatterAnd<Opname>: combine entries of src into entries of dst with Op,
   where each side may be contiguous (NULL idx), 3D-patterned (opt), or indexed (idx[]). */
#define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
  {                                                                                                          \
    PetscErrorCode ierr;                                                                                     \
    const Type     *u = (const Type*)src;                                                                    \
    Type           *v = (Type*)dst;                                                                          \
    PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
    const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
    const PetscInt MBS = M*BS;                                                                               \
    PetscFunctionBegin;                                                                                      \
    if (!srcIdx) { /* src is contiguous; reuse the matching UnpackAndOp, with src as its "packed" buffer */  \
      u += srcStart*MBS;                                                                                     \
      ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
      u += srcOpt->start[0]*MBS;                                                                             \
      v += dstStart*MBS;                                                                                     \
      X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
      for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
        for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
          for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
          v += srcOpt->dx[0]*MBS;                                                                            \
        }                                                                                                    \
    } else { /* all other cases: per-entry addressing on both sides */                                       \
      for (i=0; i<count; i++) {                                                                              \
        s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; /* srcIdx is non-NULL here; symmetric form kept */       \
        t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
      }                                                                                                      \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
216 
/* Defines FetchAnd<Opname>Local: leafupdate[] receives the pre-op root values, then
   rootdata[] is combined with leafdata[] via Op. Both sides support indexed
   (rootidx/leafidx) or contiguous (rootstart/leafstart) addressing. */
#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
  {                                                                                                          \
    Type           *root = (Type*)rootdata,*update = (Type*)leafupdate;                                      \
    const Type     *leaf = (const Type*)leafdata;                                                            \
    PetscInt       i,j,k,ro,lo,bs = link->bs;                                                                \
    const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
    const PetscInt MBS = M*BS;                                                                               \
    PetscFunctionBegin;                                                                                      \
    for (i=0; i<count; i++) {                                                                                \
      ro = (!rootidx ? rootstart+i : rootidx[i])*MBS;                                                        \
      lo = (!leafidx ? leafstart+i : leafidx[i])*MBS;                                                        \
      for (j=0; j<M; j++)                                                                                    \
        for (k=0; k<BS; k++) {                                                                               \
          update[lo+j*BS+k] = root[ro+j*BS+k]; /* fetch strictly before the op */                            \
          OpApply(Op,root[ro+j*BS+k],leaf[lo+j*BS+k]);                                                       \
        }                                                                                                    \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
237 
238 /* Pack, Unpack/Fetch ops */
/* Generates Pack, UnpackAndInsert, ScatterAndInsert for Type and registers them on the link. */
#define DEF_Pack(Type,BS,EQ)                                                      \
  DEF_PackFunc(Type,BS,EQ)                                                        \
  DEF_UnpackFunc(Type,BS,EQ)                                                      \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
    link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
    link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
    link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
  }
248 
249 /* Add, Mult ops */
/* Generates Add/Mult unpack-scatter routines plus fetch-and-add variants, and registers them. */
#define DEF_Add(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
  }
265 
266 /* Max, Min ops */
/* Generates Max/Min routines (via PetscMax/PetscMin as function-like ops) and registers them. */
#define DEF_Cmp(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
    link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
    link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
    link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
    link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
  }
278 
/* Logical ops.
   The op argument of OP_LXOR is unused, but it cannot be left empty because
   "empty macro arguments are undefined in ISO C90"; || is passed as a placeholder
   to avoid that compilation warning. */
#define DEF_Log(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
    link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
    link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
    link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
    link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
    link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
    link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
  }
298 
299 /* Bitwise ops */
/* Generates bitwise AND/OR/XOR routines and registers them on the link. */
#define DEF_Bit(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
    link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
    link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
    link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
    link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
    link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
    link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
  }
315 
316 /* Maxloc, Minloc ops */
/* Maxloc, Minloc ops for pair types: OP_XLOC compares the .u field with > (Maxloc) or
   < (Minloc) and breaks ties by the smaller .i index; the routines named Max/Min here
   are registered as the Maxloc/Minloc handlers. */
#define DEF_Xloc(Type,BS,EQ)                                                      \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
  }
328 
/* Integer types support every op family: pack/insert, arithmetic, comparison, logical, bitwise. */
#define DEF_IntegerType(Type,BS,EQ)                                               \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Add(Type,BS,EQ)                                                             \
  DEF_Cmp(Type,BS,EQ)                                                             \
  DEF_Log(Type,BS,EQ)                                                             \
  DEF_Bit(Type,BS,EQ)                                                             \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
  }
342 
/* Real types get pack/insert, arithmetic, and comparison ops (no logical/bitwise). */
#define DEF_RealType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Add(Type,BS,EQ)                                                             \
  DEF_Cmp(Type,BS,EQ)                                                             \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
  }
352 
#if defined(PETSC_HAVE_COMPLEX)
/* Complex types get pack/insert and arithmetic ops only (complex numbers are not ordered). */
#define DEF_ComplexType(Type,BS,EQ)                                               \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Add(Type,BS,EQ)                                                             \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
  }
#endif
362 
/* "Dumb" types support only pack/insert — no reduction ops are generated for them. */
#define DEF_DumbType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
  }
368 
369 /* Maxloc, Minloc */
/* Pair (value,index) types: pack/insert plus the Maxloc/Minloc routines from DEF_Xloc. */
#define DEF_PairType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Xloc(Type,BS,EQ)                                                            \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
  }
377 
/* Instantiate pack/unpack kernels for all supported basic types. For each type we generate
   routines with block sizes (BS) 1,2,4,8, each in an exact flavor (EQ=1, i.e. bs == BS) and
   a multiple flavor (EQ=0, i.e. bs == BS*n, n>1). PetscSFLinkSetUp_Host() below dispatches
   among these based on the MPI datatype of the unit.
*/
DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

/* Pair types for MAXLOC/MINLOC: field 'u' is the value, 'i' the index (see OP_XLOC) */
#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u; int i;}           PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)
462 
463 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
464 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
465 {
466   int ierr;
467   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
468   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
469   return MPI_SUCCESS;
470 }
471 #endif
472 
473 /*
474    The routine Creates a communication link for the given operation. It first looks up its link cache. If
475    there is a free & suitable one, it uses it. Otherwise it creates a new one.
476 
477    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
478    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
479    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
480    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
481 
482    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
483 
484    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
485 
486    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
487    need pack/unpack data.
488 */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode    ierr;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink       *p,link;
  PetscSFDirection  direction;
  MPI_Request       *reqs = NULL;
  PetscBool         match,rootdirect[2],leafdirect[2];
  PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdirect with the given sf, sfop and op?
     "direct" means MPI can read/write root/leafdata in place (no pack/unpack buffer).
     Packing in place needs contiguous indices; unpacking in place additionally needs
     op == MPIU_REPLACE (otherwise a reduction against existing data is required).
  */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  /* Without gpu-aware MPI, MPI only ever sees host memory (device data is staged on host below) */
  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
         If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
      */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  /* No cached link matched the unit type: build a fresh one */
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* 8 = 2 directions x 2 memory types x 2 root/leafdirect flags, matching the triple loop below */
  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:
  if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

  /* Allocate buffers on host for buffering data on device in case one does not use_gpu_aware_mpi */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if(!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi  = rootdirect_mpi;
  link->leafdirect_mpi  = leafdirect_mpi;
  link->rootmtype       = rootmtype;
  link->leafmtype       = leafmtype;
  link->rootmtype_mpi   = rootmtype_mpi;
  link->leafmtype_mpi   = leafmtype_mpi;

  /* Prepend to the in-use list; PetscSFLinkGetInUse() finds it there by (unit,rootdata,leafdata) */
  link->next            = bas->inuse;
  bas->inuse            = link;
  *mylink               = link;
  PetscFunctionReturn(0);
}
639 
640 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
641    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
642 */
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode       ierr;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt       *rootoffset,*leafoffset;
  PetscMPIInt          n;
  MPI_Aint             disp;
  MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype         unit = link->unit;
  const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    /* Roots: in LEAF2ROOT roots receive, in ROOT2LEAF roots send. One request per
       non-distinguished root rank (i runs from ndrootranks to nrootranks); the buffer
       displacement is measured from the start of the remote portion of rootbuf.
    */
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    /* Leaves: the mirror image — in LEAF2ROOT leaves send, in ROOT2LEAF leaves receive */
    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  /* Hand back whichever of the four outputs the caller asked for (any may be NULL) */
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}
701 
702 
703 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
704 {
705   PetscErrorCode    ierr;
706   PetscSFLink       link,*p;
707   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
708 
709   PetscFunctionBegin;
710   /* Look for types in cache */
711   for (p=&bas->inuse; (link=*p); p=&link->next) {
712     PetscBool match;
713     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
714     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
715       switch (cmode) {
716       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
717       case PETSC_USE_POINTER: break;
718       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
719       }
720       *mylink = link;
721       PetscFunctionReturn(0);
722     }
723   }
724   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
725   PetscFunctionReturn(0);
726 }
727 
728 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
729 {
730   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
731 
732   PetscFunctionBegin;
733   (*link)->rootdata = NULL;
734   (*link)->leafdata = NULL;
735   (*link)->next     = bas->avail;
736   bas->avail        = *link;
737   *link             = NULL;
738   PetscFunctionReturn(0);
739 }
740 
741 /* Destroy all links chained in 'avail' */
742 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
743 {
744   PetscErrorCode    ierr;
745   PetscSFLink       link = *avail,next;
746   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
747   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
748 
749   PetscFunctionBegin;
750   for (; link; link=next) {
751     next = link->next;
752     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
753     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
754       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
755     }
756     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
757     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
758 #if defined(PETSC_HAVE_CUDA)
759       ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
760       ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
761       if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
762 #endif
763       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
764       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
765     }
766     ierr = PetscFree(link);CHKERRQ(ierr);
767   }
768   *avail = NULL;
769   PetscFunctionReturn(0);
770 }
771 
#if defined(PETSC_USE_DEBUG)
/* Error out on unsupported overlapped communications: a second Begin on the same
   (rootdata, leafdata, unit) triple while a matching link is still in use.
*/
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscSFLink    cur;
  PetscBool      same;

  PetscFunctionBegin;
  /* When both rootdata and leafdata are NULL this process does not participate in the
     communication, so any potential overlap is harmless and the check is skipped.
  */
  if (!rootdata && !leafdata) PetscFunctionReturn(0);
  for (cur=bas->inuse; cur; cur=cur->next) {
    ierr = MPIPetsc_Type_compare(unit,cur->unit,&same);CHKERRQ(ierr);
    if (same && rootdata == cur->rootdata && leafdata == cur->leafdata) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
  }
  PetscFunctionReturn(0);
}
#endif
794 
/* Set up the host (CPU) side of a link: decompose the MPI datatype 'unit' into a count of
   a known basic type, then install the matching pack/unpack kernels and fill in link->bs
   (count of basic units), link->unitbytes, link->basicunit and link->unit.
*/
PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  /* Each nXxx becomes k>0 iff unit is equivalent to k contiguous copies of that basic type */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  /* Dispatch to the instantiated kernels. Within each branch, exact-match routines (EQ=1)
     are tried first, then multiple-of-BS routines (EQ=0), larger BS before smaller.
  */
  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) { /* Only needed when int differs from PetscInt; see the instantiation section */
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  }  else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    /* Unknown type: fall back to dumb (copy-only) kernels sized by the type's extent */
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else { /* Size is a multiple of int: move ints instead of chars for efficiency */
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  /* Non-builtin units are duplicated so the link owns its copy (freed in PetscSFLinkDestroy) */
  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
921 
922 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
923 {
924   PetscFunctionBegin;
925   *UnpackAndOp = NULL;
926   if (mtype == PETSC_MEMTYPE_HOST) {
927     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
928     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
929     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
930     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
931     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
932     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
933     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
934     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
935     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
936     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
937     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
938     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
939     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
940   }
941 #if defined(PETSC_HAVE_CUDA)
942   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
943     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
944     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
945     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
946     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
947     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
948     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
949     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
950     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
951     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
952     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
953     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
954     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
955     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
956   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
957     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
958     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
959     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
960     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
961     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
962     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
963     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
964     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
965     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
966     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
967     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
968     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
969     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
970   }
971 #endif
972   PetscFunctionReturn(0);
973 }
974 
975 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
976 {
977   PetscFunctionBegin;
978   *ScatterAndOp = NULL;
979   if (mtype == PETSC_MEMTYPE_HOST) {
980     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
981     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
982     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
983     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
984     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
985     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
986     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
987     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
988     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
989     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
990     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
991     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
992     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
993   }
994 #if defined(PETSC_HAVE_CUDA)
995   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
996     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
997     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
998     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
999     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1000     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1001     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1002     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1003     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1004     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1005     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1006     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1007     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1008     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1009   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1010     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1011     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1012     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1013     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1014     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1015     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1016     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1017     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1018     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1019     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1020     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1021     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1022     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1023   }
1024 #endif
1025   PetscFunctionReturn(0);
1026 }
1027 
1028 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1029 {
1030   PetscFunctionBegin;
1031   *FetchAndOp = NULL;
1032   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1033   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1034 #if defined(PETSC_HAVE_CUDA)
1035   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1036   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1037 #endif
1038   PetscFunctionReturn(0);
1039 }
1040 
1041 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1042 {
1043   PetscFunctionBegin;
1044   *FetchAndOpLocal = NULL;
1045   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1046   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1047 #if defined(PETSC_HAVE_CUDA)
1048   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1049   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1050 #endif
1051   PetscFunctionReturn(0);
1052 }
1053 
1054 /*=============================================================================
1055               A set of helper routines for Pack/Unpack/Scatter on GPUs
1056  ============================================================================*/
1057 #if defined(PETSC_HAVE_CUDA)
1058 /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
1059    make sure the data is ready for packing.
1060  */
1061 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
1062 {
1063   PetscFunctionBegin;
1064   if (sf->use_default_stream) PetscFunctionReturn(0);
1065   if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
1066     cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
1067   }
1068   PetscFunctionReturn(0);
1069 }
1070 
1071 /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
1072 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
1073 {
1074   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1075 
1076   PetscFunctionBegin;
1077   /* Do nothing if we use stream aware mpi || has nothing for remote */
1078   if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
1079   /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */
1080   if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1081     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1082   }
1083   PetscFunctionReturn(0);
1084 }
1085 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
1086 {
1087   PetscFunctionBegin;
1088   /* See comments above */
1089   if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
1090   if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1091     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1092   }
1093   PetscFunctionReturn(0);
1094 }
1095 
1096 /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
1097    does not know which stream the callers will use.
1098 */
1099 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
1100 {
1101   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1102 
1103   PetscFunctionBegin;
1104   /* Do nothing if we are expected to put rootdata on default stream */
1105   if (sf->use_default_stream || link->rootmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0);
1106   /* If we have something from local, then we called a scatter kernel (on link->stream), then we must sync it;
1107      If we have something from remote and we called unpack kernel, then we must also sycn it.
1108    */
1109   if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
1110     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1111   }
1112   PetscFunctionReturn(0);
1113 }
1114 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
1115 {
1116   PetscFunctionBegin;
1117   /* See comments above */
1118   if (sf->use_default_stream || link->leafmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0);
1119   if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
1120     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1121   }
1122   PetscFunctionReturn(0);
1123 }
1124 
1125 /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
1126    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
1127 */
1128 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1129 {
1130   PetscErrorCode ierr;
1131   cudaError_t    cerr;
1132   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1133 
1134   PetscFunctionBegin;
1135   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
1136     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1137     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1138     size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
1139     if (device2host) {
1140       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1141       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
1142     } else {
1143       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1144       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
1145     }
1146   }
1147   PetscFunctionReturn(0);
1148 }
1149 
1150 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1151 {
1152   PetscErrorCode ierr;
1153   cudaError_t    cerr;
1154 
1155   PetscFunctionBegin;
1156   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
1157     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1158     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1159     size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
1160     if (device2host) {
1161       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1162       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
1163     } else {
1164       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1165       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
1166     }
1167   }
1168   PetscFunctionReturn(0);
1169 }
#else

/* Without CUDA the sync/copy helpers above are not needed; each stub expands to 0, i.e., a
   "success" PetscErrorCode, so call sites like ierr = PetscSFLinkSync...(sf,link);CHKERRQ(ierr)
   compile unchanged */
#define PetscSFLinkSyncDeviceBeforePackData(a,b)                0
#define PetscSFLinkSyncStreamAfterPackRootData(a,b)             0
#define PetscSFLinkSyncStreamAfterPackLeafData(a,b)             0
#define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)           0
#define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)           0
#define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
#define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0

#endif
1181 
/* Log the flops of the reduction applied while unpacking rootbuf into rootdata. A plain insert
   (MPIU_REPLACE) or a non-PetscScalar basic unit does no arithmetic we count. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;


  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
    /* Note the #if trick: with CUDA the else-branch below pairs with the if; without CUDA only the
       brace-block remains and logs CPU flops unconditionally */
#if defined(PETSC_HAVE_CUDA)
    if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
1199 
/* Log the flops of the reduction applied while unpacking leafbuf into leafdata; see
   PetscSFLinkLogFlopsAfterUnpackRootData for the logging logic */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
1215 
1216 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1217   Input Arguments:
1218   +sf      - The StarForest
1219   .link    - The link
1220   .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
1222   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1223   .buf     - A contiguous buffer to unpack from
1224   -op      - Operation after unpack
1225 
1226   Output Arguments:
1227   .data    - The data to unpack to
1228 */
1229 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1230 {
1231   PetscFunctionBegin;
1232 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1233   {
1234     PetscErrorCode ierr;
1235     PetscInt       i;
1236     PetscMPIInt    n;
1237     if (indices) {
1238       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
1239          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
1240       */
1241       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
1242     } else {
1243       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
1244       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
1245     }
1246   }
1247 #else
1248   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1249 #endif
1250   PetscFunctionReturn(0);
1251 }
1252 
1253 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1254 {
1255   PetscFunctionBegin;
1256 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1257   {
1258     PetscErrorCode ierr;
1259     PetscInt       i,disp;
1260     if (!srcIdx) {
1261       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
1262     } else {
1263       for (i=0; i<count; i++) {
1264         disp = dstIdx? dstIdx[i] : dstStart + i;
1265         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
1266       }
1267     }
1268   }
1269 #else
1270   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1271 #endif
1272   PetscFunctionReturn(0);
1273 }
1274 
1275 /*=============================================================================
1276               Pack/Unpack/Fetch/Scatter routines
1277  ============================================================================*/
1278 
1279 /* Pack rootdata to rootbuf
1280   Input Arguments:
1281   + sf       - The SF this packing works on.
1282   . link     - It gives the memtype of the roots and also provides root buffer.
1283   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1284   - rootdata - Where to read the roots.
1285 
1286   Notes:
1287   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
  in a place where the underlying MPI can readily access it (whether use_gpu_aware_mpi is set or not)
1289  */
1290 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1291 {
1292   PetscErrorCode   ierr;
1293   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1294   const PetscInt   *rootindices = NULL;
1295   PetscInt         count,start;
1296   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1297   PetscMemType     rootmtype = link->rootmtype;
1298   PetscSFPackOpt   opt = NULL;
1299 
1300 
1301   PetscFunctionBegin;
1302   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1303   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1304   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1305     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1306     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1307     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1308   }
1309   if (scope == PETSCSF_REMOTE) {
1310     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1311     ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
1312   }
1313   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1314   PetscFunctionReturn(0);
1315 }
1316 
1317 /* Pack leafdata to leafbuf */
1318 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1319 {
1320   PetscErrorCode   ierr;
1321   const PetscInt   *leafindices = NULL;
1322   PetscInt         count,start;
1323   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1324   PetscMemType     leafmtype = link->leafmtype;
1325   PetscSFPackOpt   opt = NULL;
1326 
1327   PetscFunctionBegin;
1328   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1329   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1330   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1331     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1332     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1333     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1334   }
1335   if (scope == PETSCSF_REMOTE) {
1336     ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1337     ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
1338   }
1339   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1340   PetscFunctionReturn(0);
1341 }
1342 
1343 /* Unpack rootbuf to rootdata */
1344 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1345 {
1346   PetscErrorCode   ierr;
1347   const PetscInt   *rootindices = NULL;
1348   PetscInt         count,start;
1349   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1350   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1351   PetscMemType     rootmtype = link->rootmtype;
1352   PetscSFPackOpt   opt = NULL;
1353 
1354   PetscFunctionBegin;
1355   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1356   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1357   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1358     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1359     if (UnpackAndOp) {
1360       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1361       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1362     } else {
1363       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1364       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1365     }
1366   }
1367   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
1368   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1369   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1370   PetscFunctionReturn(0);
1371 }
1372 
1373 /* Unpack leafbuf to leafdata */
1374 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1375 {
1376   PetscErrorCode   ierr;
1377   const PetscInt   *leafindices = NULL;
1378   PetscInt         count,start;
1379   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1380   PetscMemType     leafmtype = link->leafmtype;
1381   PetscSFPackOpt   opt = NULL;
1382 
1383   PetscFunctionBegin;
1384   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1385   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1386   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1387     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1388     if (UnpackAndOp) {
1389       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1390       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1391     } else {
1392       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1393       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1394     }
1395   }
1396   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
1397   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1398   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1399   PetscFunctionReturn(0);
1400 }
1401 
1402 /* FetchAndOp rootdata with rootbuf */
1403 PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1404 {
1405   PetscErrorCode     ierr;
1406   const PetscInt     *rootindices = NULL;
1407   PetscInt           count,start;
1408   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1409   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1410   PetscMemType       rootmtype = link->rootmtype;
1411   PetscSFPackOpt     opt = NULL;
1412 
1413   PetscFunctionBegin;
1414   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1415   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1416   if (bas->rootbuflen[scope]) {
1417     /* Do FetchAndOp on rootdata with rootbuf */
1418     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
1419     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1420     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1421   }
1422   if (scope == PETSCSF_REMOTE) {
1423     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
1424     ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
1425   }
1426   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1427   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1428   PetscFunctionReturn(0);
1429 }
1430 
1431 /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1432 PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1433 {
1434   PetscErrorCode       ierr;
1435   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1436   PetscInt             count,rootstart,leafstart;
1437   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1438   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1439   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1440   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1441 
1442   PetscFunctionBegin;
1443   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1444   if (rootmtype != leafmtype) { /* Uncommon case */
1445      /* The local communication has to go through pack and unpack */
1446     ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
1447     ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1448     ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1449   } else {
1450     ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1451     if (ScatterAndOp) {
1452       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1453       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1454       ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1455     } else {
1456       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1457       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1458       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1459     }
1460   }
1461   PetscFunctionReturn(0);
1462 }
1463 
1464 /* Reduce leafdata to rootdata locally */
1465 PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1466 {
1467   PetscErrorCode       ierr;
1468   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1469   PetscInt             count,rootstart,leafstart;
1470   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1471   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1472   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1473   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1474 
1475   PetscFunctionBegin;
1476   if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1477   if (rootmtype != leafmtype) {
1478     /* The local communication has to go through pack and unpack */
1479     ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
1480     ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1481     ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1482   } else {
1483     ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1484     if (ScatterAndOp) {
1485       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1486       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1487       ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
1488     } else {
1489       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1490       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1491       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1492     }
1493   }
1494   PetscFunctionReturn(0);
1495 }
1496 
1497 /* Fetch rootdata to leafdata and leafupdate locally */
1498 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1499 {
1500   PetscErrorCode       ierr;
1501   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1502   PetscInt             count,rootstart,leafstart;
1503   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1504   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1505   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1506   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1507 
1508   PetscFunctionBegin;
1509   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1510   if (rootmtype != leafmtype) {
1511    /* The local communication has to go through pack and unpack */
1512     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1513   } else {
1514     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1515     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1516     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1517     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1518   }
1519   PetscFunctionReturn(0);
1520 }
1521 
/*
  Create per-rank pack/unpack optimizations based on index patterns

   Input Parameters:
  +  n       - Number of destination ranks
  .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
  -  idx     - [*]   Array storing indices

   Output Parameters:
  +  out     - Pack optimizations. NULL if there are no optimizations.
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  /* All fields of opt live in one flat array of 7n+2 entries:
     [n | offset[n+1] | start[n] | dx[n] | dy[n] | dz[n] | X[n] | Y[n]] */
  ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n   + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  for (r=0; r<n; r++) { /* For each destination rank */
    /* Try to recognize the rank's indices as a dense dx*dy*dz sub-block of a logical lattice
       with row length X and slab size X*Y, i.e., idx[] = start + k*X*Y + j*X + i for
       0<=i<dx, 0<=j<dy, 0<=k<dz. If so, packing can use strided nested loops instead of a
       generic indexed gather. Any rank that fails the match makes the whole plan unusable. */
    m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension: dx = length of the initial run of consecutive indices */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }

    dydz = m/dx;
    /* When more rows follow (dydz > 1), dx < m so idx[p] is in range and idx[p]-start is the
       lattice row length X; with a single row, X degenerates to dx */
    X    = dydz > 1 ? (idx[p]-start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in Y dimension: count rows spaced X apart */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension; /* A row starts off-pattern at i==0: it may open the next z-slab */
        }
      }
    }

Z_dimension:
    dz = m/(dx*dy);
    /* When more slabs follow (dz > 1), idx[p] is the first index of the second slab, so
       (idx[p]-start)/X is the lattice slab height Y; otherwise Y degenerates to dy */
    Y  = dz > 1 ? (idx[p]-start)/X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    /* The whole rank matched: record its sub-block geometry */
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable (or there are no ranks at all), free arrays to save memory and return NULL */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    /* Prefix-sum the per-rank entry counts so rank r's packed entries occupy [offset[r],offset[r+1]) */
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}
1609 
1610 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
1611 {
1612   PetscErrorCode ierr;
1613   PetscSFPackOpt opt = *out;
1614 
1615   PetscFunctionBegin;
1616   if (opt) {
1617     if (mtype == PETSC_MEMTYPE_HOST) {ierr = PetscFree(opt->array);CHKERRQ(ierr);}
1618 #if defined(PETSC_HAVE_CUDA)
1619     else {cudaError_t cerr = cudaFree(opt->array);CHKERRCUDA(cerr);opt->array=NULL;}
1620 #endif
1621     ierr = PetscFree(opt);CHKERRQ(ierr);
1622     *out = NULL;
1623   }
1624   PetscFunctionReturn(0);
1625 }
1626 
1627 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1628 {
1629   PetscErrorCode ierr;
1630   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1631   PetscInt       i,j;
1632 
1633   PetscFunctionBegin;
1634   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1635   for (i=0; i<2; i++) { /* Set defaults */
1636     sf->leafstart[i]   = 0;
1637     sf->leafcontig[i]  = PETSC_TRUE;
1638     sf->leafdups[i]    = PETSC_FALSE;
1639     bas->rootstart[i]  = 0;
1640     bas->rootcontig[i] = PETSC_TRUE;
1641     bas->rootdups[i]   = PETSC_FALSE;
1642   }
1643 
1644   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1645   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1646 
1647   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1648   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1649 
1650   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1651   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1652     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1653   }
1654   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1655     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1656   }
1657 
1658   /* If not, see if we can have per-rank optimizations by doing index analysis */
1659   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1660   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1661 
1662   /* Are root indices for self and remote contiguous? */
1663   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1664   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1665 
1666   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1667   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1668 
1669   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1670     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1671   }
1672   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1673     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1674   }
1675 
1676   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1677   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1678 
1679 #if defined(PETSC_HAVE_CUDA)
1680     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1681   if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1682   if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1683   if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1684   if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1685 #endif
1686 
1687   PetscFunctionReturn(0);
1688 }
1689 
1690 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1691 {
1692   PetscErrorCode ierr;
1693   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1694   PetscInt       i;
1695 
1696   PetscFunctionBegin;
1697   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1698     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1699     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1700     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1701     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1702   }
1703   PetscFunctionReturn(0);
1704 }
1705