xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision a5526d509825ad7a038b634d27d13fc2a03247e1)
1 
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6 #include <cuda_runtime.h>
7 #endif
8 /*
9  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
10  * therefore we pack data types manually. This file defines packing routines for the standard data types.
11  */
12 
/* CPPJoin4(a,b,c,d) pastes its four arguments into a single identifier separated by
   underscores, e.g., CPPJoin4(Pack,int,1,1) -> Pack_int_1_1. Used below to generate
   unique names for the type/blocksize-specialized pack/unpack routines. */
#define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
14 
/* Operations working like s += t. All take the operator as the first argument so that one
   UnpackAndOp/ScatterAndOp macro body can host every reduction. */
#define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR; the op argument is ignored */
#define OP_ASSIGN(op,s,t)   do {(s) = (t);} while(0)               /* plain assignment; the op argument is ignored */
/* Ref MPI MAXLOC/MINLOC: s and t are (value,index) pairs with fields .u and .i. With op
   being > (MAXLOC) or < (MINLOC): on equal values keep the smaller index, otherwise keep
   s only if s.u wins under op. */
#define OP_XLOC(op,s,t) \
  do {                                       \
    if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t;           \
  } while(0)
26 
27 /* DEF_PackFunc - macro defining a Pack routine
28 
29    Arguments of the macro:
30    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
31    .BS        Block size for vectorization. It is a factor of bs.
32    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
33 
34    Arguments of the Pack routine:
35    +count     Number of indices in idx[].
36    .start     If indices are contiguous, it is the first index; otherwise, not used.
.idx       Indices of entries to be packed. NULL means contiguous indices, that is [start,start+count).
38    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39    .opt       Per-remote pack optimization plan. NULL means no such plan.
.unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
41    -packed    Address of the packed data.
42  */
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,const void *unpacked,void *packed) \
  {                                                                                                          \
    PetscErrorCode ierr;                                                                                     \
    const Type     *u = (const Type*)unpacked,*u2;                                                           \
    Type           *p = (Type*)packed,*p2;                                                                   \
    PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
    const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) {ierr = PetscArraycpy(p,u+start*bs,MBS*count);CHKERRQ(ierr);}  /* Indices are contiguous */    \
    else if (!opt) { /* No optimizations available */                                                        \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
          for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
    } else { /* Per-remote optimization plan: handle each remote's index range r separately */               \
      for (r=0; r<opt->n; r++) {                                                                             \
        p2  = p + opt->offset[r]*MBS;                                                                        \
        if (opt->type[r] == PETSCSF_PACKOPT_NONE) { /* Plain indexed gather for this range */                \
          idx2 = idx + opt->offset[r];                                                                       \
          for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++)                                                                           \
                p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k];                                                    \
        } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { /* Range is a few contiguous pieces */       \
          for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
            u2   = u + idx[opt->copy_start[i]]*MBS;                                                          \
            l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */                       \
            ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr);                                                     \
            p2  += l;                                                                                        \
          }                                                                                                  \
        } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { /* Indices form an arithmetic progression */    \
          u2   = u + idx[opt->offset[r]]*MBS;                                                                \
          step = opt->stride_step[r];                                                                        \
          for (i=0; i<opt->stride_n[r]; i++)                                                                 \
            for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j];                                            \
        } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimization type %D",opt->type[r]);  \
      }                                                                                                      \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
85 
/* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
                    and inserts it into a sparse array.

   Arguments:
  +Type       Type of the data
  .BS         Block size for vectorization
  -EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.

  Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
 */
#define DEF_UnpackFunc(Type,BS,EQ)               \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \
  {                                                                                                          \
    PetscErrorCode ierr;                                                                                     \
    Type           *u = (Type*)unpacked,*u2;                                                                 \
    const Type     *p = (const Type*)packed,*p2;                                                             \
    PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
    const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) { /* Indices are contiguous */                                                                 \
      u2 = u + start*bs;                                                                                     \
      if (u2 != p) {ierr = PetscArraycpy(u2,p,count*MBS);CHKERRQ(ierr);} /* no-op when dest aliases src */   \
    } else if (!opt) { /* No optimizations available */                                                      \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
    } else { /* Per-remote optimization plan: handle each remote's index range r separately */               \
      for (r=0; r<opt->n; r++) {                                                                             \
        p2 = p + opt->offset[r]*MBS;                                                                         \
        if (opt->type[r] == PETSCSF_PACKOPT_NONE) { /* Plain indexed scatter for this range */               \
          idx2 = idx + opt->offset[r];                                                                       \
          for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++) u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                 \
        } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { /* Range is a few contiguous pieces */       \
          for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
            u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
            l  = opt->copy_length[i]*MBS;                                                                    \
            ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr);                                                     \
            p2 += l;                                                                                         \
          }                                                                                                  \
        } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { /* Indices form an arithmetic progression */    \
          u2   = u + idx[opt->offset[r]]*MBS;                                                                \
          step = opt->stride_step[r];                                                                        \
          for (i=0; i<opt->stride_n[r]; i++)                                                                 \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++) u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                 \
        } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimization type %D",opt->type[r]);  \
      }                                                                                                      \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
140 
141 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
142 
143    Arguments:
144   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
145   .Type       Type of the data
146   .BS         Block size for vectorization
147   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
148   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
149   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
150  */
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \
  {                                                                                                          \
    Type           *u = (Type*)unpacked,*u2;                                                                 \
    const Type     *p = (const Type*)packed,*p2;                                                             \
    PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
    const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) { /* Indices are contiguous */                                                                 \
      u += start*bs;                                                                                         \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++)                                                                               \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
    } else if (!opt) { /* No optimizations available */                                                      \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++)                                                                               \
            OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
    } else { /* Per-remote optimization plan: handle each remote's index range r separately */               \
      for (r=0; r<opt->n; r++) {                                                                             \
        p2 = p + opt->offset[r]*MBS;                                                                         \
        if (opt->type[r] == PETSCSF_PACKOPT_NONE) { /* Plain indexed reduce for this range */                \
          idx2 = idx + opt->offset[r];                                                                       \
          for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++)                                                                           \
                OpApply(Op,u[idx2[i]*MBS+j*BS+k],p2[i*MBS+j*BS+k]);                                          \
        } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { /* Range is a few contiguous pieces */       \
          for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
            u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
            l  = opt->copy_length[i]*MBS;                                                                    \
            for (j=0; j<l; j++) OpApply(Op,u2[j],p2[j]);                                                     \
            p2 += l;                                                                                         \
          }                                                                                                  \
        } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { /* Indices form an arithmetic progression */    \
          u2   = u + idx[opt->offset[r]]*MBS;                                                                \
          step = opt->stride_step[r];                                                                        \
          for (i=0; i<opt->stride_n[r]; i++)                                                                 \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++)                                                                           \
                OpApply(Op,u2[i*step*MBS+j*BS+k],p2[i*MBS+j*BS+k]);                                          \
        } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimization type %D",opt->type[r]);  \
      }                                                                                                      \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
199 
/* DEF_FetchAndOp - macro defining a FetchAndOp routine

   For each entry, the old value of the unpacked data is first saved into the packed
   buffer, then Op is applied with the incoming packed value, i.e., atomically-ordered
   per entry: t = u[..]; u[..] = OpApply(u[..],p[..]); p[..] = t.
   Note: the opt argument is accepted for signature compatibility but no
   PetscSFPackOpt-based optimization paths are implemented here.
 */
#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,void *packed) \
  {                                                                                                          \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,t;                                                \
    PetscInt       i,j,k,bs=link->bs;                                                                        \
    const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) { /* Indices are contiguous */                                                                 \
      u += start*bs;                                                                                         \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) {                                                                             \
            t = u[i*MBS+j*BS+k];                                                                             \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
            p[i*MBS+j*BS+k] = t;                                                                             \
          }                                                                                                  \
    } else {                                                                                                 \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) {                                                                             \
              t = u[idx[i]*MBS+j*BS+k];                                                                      \
              OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                              \
              p[i*MBS+j*BS+k] = t;                                                                           \
          }                                                                                                  \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
228 
/* DEF_ScatterAndOp - macro defining a ScatterAndOp routine

   Applies Op while scattering entries x[idx[i]] of one sparse array into another array y,
   either at y[idy[i]] or, when idy is NULL, contiguously starting at y[starty*bs].
   idx must be non-NULL; a NULL idx means the caller should have used UnpackAndOp instead.
 */
#define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt startx,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata) \
  {                                                                                                          \
    const Type     *x = (const Type*)xdata;                                                                  \
    Type           *y = (Type*)ydata;                                                                        \
    PetscInt       i,j,k,bs = link->bs;                                                                      \
    const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
    const PetscInt MBS = M*BS;                                                                               \
    PetscFunctionBegin;                                                                                      \
    if (!idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use UnpackAndOp instead of ScatterAndOp");\
    if (!idy) { /* Destination indices are contiguous */                                                     \
      y += starty*bs;                                                                                        \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++)                                                                               \
            OpApply(Op,y[i*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]);                                                \
    } else {                                                                                                 \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++)                                                                               \
            OpApply(Op,y[idy[i]*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]);                                           \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
253 
/* DEF_FetchAndOpLocal - macro defining a routine for fetch-and-op between local root and leaf arrays

   For each i: leafupdate (y2) receives the current root value x[rootindices[i]..], then the
   root entry is updated by applying Op with the corresponding leafdata (y) value. Leaf
   entries are addressed by leafindices[i], or contiguously from leafstart when leafindices
   is NULL. rootindices must be non-NULL (otherwise FetchAndOp should have been used).
 */
#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,const PetscInt *rootindices,void *rootdata,PetscInt leafstart,const PetscInt *leafindices,const void *leafdata,void *leafupdate) \
  {                                                                                                          \
    Type           *x = (Type*)rootdata,*y2 = (Type*)leafupdate;                                             \
    const Type     *y = (const Type*)leafdata;                                                               \
    PetscInt       i,j,k,bs = link->bs;                                                                      \
    const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
    const PetscInt MBS = M*BS;                                                                               \
    PetscFunctionBegin;                                                                                      \
    if (!rootindices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use FetchAndOp instead of FetchAndOpLocal");\
    if (!leafindices) { /* Leaf entries are contiguous */                                                    \
      y += leafstart*bs; y2 += leafstart*bs;                                                                 \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) {                                                                             \
            y2[i*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k];                                                 \
            OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[i*MBS+j*BS+k]);                                        \
          }                                                                                                  \
    } else {                                                                                                 \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) {                                                                             \
            y2[leafindices[i]*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k];                                    \
            OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[leafindices[i]*MBS+j*BS+k]);                           \
          }                                                                                                  \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
282 
283 
/* Pack, Unpack/Fetch ops.

   DEF_Pack - instantiate the Pack, UnpackAndInsert and ScatterAndInsert routines for a
   given (Type,BS,EQ), plus a PackInit_Pack routine that installs them as the host-side
   (h_) function pointers of a PetscSFLink. */
#define DEF_Pack(Type,BS,EQ)                                                      \
  DEF_PackFunc(Type,BS,EQ)                                                        \
  DEF_UnpackFunc(Type,BS,EQ)                                                      \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
    link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
    link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
    link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
  }
294 
/* Add, Mult ops.

   DEF_Add - instantiate UnpackAndAdd/Mult, FetchAndAdd, ScatterAndAdd/Mult and
   FetchAndAddLocal for (Type,BS,EQ), plus a PackInit_Add routine installing them on
   a PetscSFLink. */
#define DEF_Add(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
  }
311 
/* Max, Min ops.

   DEF_Cmp - instantiate UnpackAndMax/Min and ScatterAndMax/Min for (Type,BS,EQ),
   plus a PackInit_Compare routine installing them on a PetscSFLink. */
#define DEF_Cmp(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
    link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
    link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
    link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
    link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
  }
324 
/* Logical ops.
  The operator passed to OP_LXOR should be empty but is ||. It is not used. Put here to
  avoid the compilation warning "empty macro arguments are undefined in ISO C90".

  DEF_Log - instantiate UnpackAndLAND/LOR/LXOR and ScatterAndLAND/LOR/LXOR for
  (Type,BS,EQ), plus a PackInit_Logical routine installing them on a PetscSFLink.
 */
#define DEF_Log(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
    link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
    link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
    link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
    link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
    link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
    link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
  }
344 
/* Bitwise ops.

   DEF_Bit - instantiate UnpackAndBAND/BOR/BXOR and ScatterAndBAND/BOR/BXOR for
   (Type,BS,EQ), plus a PackInit_Bitwise routine installing them on a PetscSFLink. */
#define DEF_Bit(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
    link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
    link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
    link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
    link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
    link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
    link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
  }
361 
/* Maxloc, Minloc ops (MPI_MAXLOC/MPI_MINLOC semantics on pair types).
  The UnpackAndMax/Min and ScatterAndMax/Min routines generated here use OP_XLOC, which
  compares the .u members and keeps the smaller .i on ties; hence they are installed
  into the h_*AndMaxloc/h_*AndMinloc fields of the link.
 */
#define DEF_Xloc(Type,BS,EQ)                                                      \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
  }
374 
/* Integer types support the full set of ops: pack/unpack, add, compare, logical and bitwise */
#define DEF_IntegerType(Type,BS,EQ)                                               \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Add(Type,BS,EQ)                                                             \
  DEF_Cmp(Type,BS,EQ)                                                             \
  DEF_Log(Type,BS,EQ)                                                             \
  DEF_Bit(Type,BS,EQ)                                                             \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
  }
388 
/* Real types support pack/unpack, add and compare ops (no logical/bitwise ops) */
#define DEF_RealType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Add(Type,BS,EQ)                                                             \
  DEF_Cmp(Type,BS,EQ)                                                             \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
  }
398 
#if defined(PETSC_HAVE_COMPLEX)
/* Complex types support pack/unpack and add ops only (no compare: complex numbers are not ordered) */
#define DEF_ComplexType(Type,BS,EQ)                                               \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Add(Type,BS,EQ)                                                             \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
  }
#endif
408 
/* Dumb types (unknown basic type, treated as raw bytes/ints): only pack/unpack is supported, no reductions */
#define DEF_DumbType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
  }
414 
/* Pair types, used for MPI_MAXLOC/MPI_MINLOC: support pack/unpack and the Xloc ops */
#define DEF_PairType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Xloc(Type,BS,EQ)                                                            \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
  }
423 
/* Instantiate routines for PetscInt with block sizes 1,2,4,8; EQ=1 means unit is exactly BS basic types, EQ=0 means a multiple */
DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif
443 
/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)
464 
/* Instantiate routines for PetscReal (and PetscComplex when available) with block sizes 1,2,4,8 */
DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif
484 
#define PairType(Type1,Type2) Type1##_##Type2
/* u is the value compared by OP_XLOC; i is the accompanying index. Layouts correspond to MPI_2INT and MPIU_2INT. */
typedef struct {int u; int i;}           PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)
490 
/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)
508 
509 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
510 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
511 {
512   int ierr;
513   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
514   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
515   return MPI_SUCCESS;
516 }
517 #endif
518 
519 /*
   The routine creates a communication link for the given operation. It first looks up its link cache. If
521    there is a free & suitable one, it uses it. Otherwise it creates a new one.
522 
   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even if we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
527 
528    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
529 
   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.
531 
532    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
533    need pack/unpack data.
534 */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode    ierr;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink       *p,link;
  PetscSFDirection  direction;
  MPI_Request       *reqs = NULL;
  PetscBool         match,rootdirect[2],leafdirect[2];
  PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdata as the pack/unpack buffers with the given sf, sfop and op?
     Direct use requires contiguous indices; for unpacking it additionally requires op == MPIU_REPLACE
     (otherwise the incoming data must be combined with existing values via a separate buffer). */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  if (use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests matches the current data.
         If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
      */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  /* No cached link matched the unit; create and set up a new one */
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* 8 request sets per side: 2 directions x 2 memory types x 2 (data passed directly to MPI or not) */
  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:
  if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) { /* Likewise, use leafdata directly as leafbuf */
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

  /* Allocate buffers on host for buffering data on device in case we do not use_gpu_aware_mpi */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if(!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi  = rootdirect_mpi;
  link->leafdirect_mpi  = leafdirect_mpi;
  link->rootmtype       = rootmtype;
  link->leafmtype       = leafmtype;
  link->rootmtype_mpi   = rootmtype_mpi;
  link->leafmtype_mpi   = leafmtype_mpi;

  /* Push the link onto the in-use list; PetscSFLinkGetInUse finds it there by (unit,rootdata,leafdata) */
  link->next            = bas->inuse;
  bas->inuse            = link;
  *mylink               = link;
  PetscFunctionReturn(0);
}
685 
/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
   Any of the output arguments rootbuf/leafbuf/rootreqs/leafreqs may be NULL when the caller does not need it.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode       ierr;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt       *rootoffset,*leafoffset;
  PetscMPIInt          n;
  MPI_Aint             disp;
  MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype         unit = link->unit;
  const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      /* i iterates over ranks [ndrootranks,nrootranks); j indexes the corresponding requests from 0.
         NOTE(review): the first ndrootranks ranks are skipped here — presumably they need no MPI; confirm with PetscSFGetRootInfo_Basic. */
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; /* byte offset of rank i's chunk in the remote rootbuf */
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; /* byte offset of rank i's chunk in the remote leafbuf */
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}
747 
748 
749 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
750 {
751   PetscErrorCode    ierr;
752   PetscSFLink       link,*p;
753   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
754 
755   PetscFunctionBegin;
756   /* Look for types in cache */
757   for (p=&bas->inuse; (link=*p); p=&link->next) {
758     PetscBool match;
759     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
760     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
761       switch (cmode) {
762       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
763       case PETSC_USE_POINTER: break;
764       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
765       }
766       *mylink = link;
767       PetscFunctionReturn(0);
768     }
769   }
770   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
771   PetscFunctionReturn(0);
772 }
773 
774 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
775 {
776   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
777 
778   PetscFunctionBegin;
779   (*link)->rootdata = NULL;
780   (*link)->leafdata = NULL;
781   (*link)->next     = bas->avail;
782   bas->avail        = *link;
783   *link             = NULL;
784   PetscFunctionReturn(0);
785 }
786 
/* Destroy all links chained in 'avail': free derived MPI datatypes, persistent requests,
   and host/device pack buffers, then the link structs themselves. 'avail' is set to NULL. */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode    ierr;
  PetscSFLink       link = *avail,next;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* Must match the count allocated in PetscSFLinkCreate */

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} /* Only duplicated datatypes are freed */
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
#if defined(PETSC_HAVE_CUDA)
      ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      /* The stream is destroyed on the first loop iteration and then set NULL so it is not destroyed twice */
      if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
#endif
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}
817 
#if defined(PETSC_USE_DEBUG)
/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode ierr;
  PetscSFLink    link;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscBool      match;

  PetscFunctionBegin;
  /* When both rootdata and leafdata are NULL, this process does not participate in the
     communication, so any potential overlapping is harmless and we skip the check. */
  if (!rootdata && !leafdata) PetscFunctionReturn(0);
  /* Scan the in-use links and error out on a match (same unit, rootdata and leafdata) */
  for (link=bas->inuse; link; link=link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
  }
  PetscFunctionReturn(0);
}
#endif
840 
841 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
842 {
843   PetscErrorCode ierr;
844   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
845   PetscBool      is2Int,is2PetscInt;
846   PetscMPIInt    ni,na,nd,combiner;
847 #if defined(PETSC_HAVE_COMPLEX)
848   PetscInt       nPetscComplex=0;
849 #endif
850 
851   PetscFunctionBegin;
852   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
853   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
854   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
855   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
856   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
857   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
858 #if defined(PETSC_HAVE_COMPLEX)
859   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
860 #endif
861   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
862   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
864   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
865   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
866   link->bs = 1; /* default */
867 
868   if (is2Int) {
869     PackInit_PairType_int_int_1_1(link);
870     link->bs        = 1;
871     link->unitbytes = 2*sizeof(int);
872     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
873     link->basicunit = MPI_2INT;
874     link->unit      = MPI_2INT;
875   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
876     PackInit_PairType_PetscInt_PetscInt_1_1(link);
877     link->bs        = 1;
878     link->unitbytes = 2*sizeof(PetscInt);
879     link->basicunit = MPIU_2INT;
880     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
881     link->unit      = MPIU_2INT;
882   } else if (nPetscReal) {
883     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
884     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
885     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
886     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
887     link->bs        = nPetscReal;
888     link->unitbytes = nPetscReal*sizeof(PetscReal);
889     link->basicunit = MPIU_REAL;
890     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
891   } else if (nPetscInt) {
892     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
893     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
894     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
895     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
896     link->bs        = nPetscInt;
897     link->unitbytes = nPetscInt*sizeof(PetscInt);
898     link->basicunit = MPIU_INT;
899     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
900 #if defined(PETSC_USE_64BIT_INDICES)
901   } else if (nInt) {
902     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
903     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
904     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
905     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
906     link->bs        = nInt;
907     link->unitbytes = nInt*sizeof(int);
908     link->basicunit = MPI_INT;
909     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
910 #endif
911   } else if (nSignedChar) {
912     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
913     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
914     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
915     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
916     link->bs        = nSignedChar;
917     link->unitbytes = nSignedChar*sizeof(SignedChar);
918     link->basicunit = MPI_SIGNED_CHAR;
919     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
920   }  else if (nUnsignedChar) {
921     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
922     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
923     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
924     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
925     link->bs        = nUnsignedChar;
926     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
927     link->basicunit = MPI_UNSIGNED_CHAR;
928     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
929 #if defined(PETSC_HAVE_COMPLEX)
930   } else if (nPetscComplex) {
931     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
932     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
933     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
934     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
935     link->bs        = nPetscComplex;
936     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
937     link->basicunit = MPIU_COMPLEX;
938     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
939 #endif
940   } else {
941     MPI_Aint lb,nbyte;
942     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
943     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
944     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
945       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
946       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
947       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
948       link->bs        = nbyte;
949       link->unitbytes = nbyte;
950       link->basicunit = MPI_BYTE;
951     } else {
952       nInt = nbyte / sizeof(int);
953       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
954       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
955       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
956       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
957       link->bs        = nInt;
958       link->unitbytes = nbyte;
959       link->basicunit = MPI_INT;
960     }
961     if (link->isbuiltin) link->unit = unit;
962   }
963 
964   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
965   PetscFunctionReturn(0);
966 }
967 
968 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*))
969 {
970   PetscFunctionBegin;
971   *UnpackAndOp = NULL;
972   if (mtype == PETSC_MEMTYPE_HOST) {
973     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
974     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
975     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
976     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
977     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
978     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
979     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
980     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
981     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
982     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
983     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
984     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
985     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
986   }
987 #if defined(PETSC_HAVE_CUDA)
988   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
989     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
990     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
991     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
992     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
993     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
994     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
995     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
996     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
997     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
998     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
999     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
1000     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
1001     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
1002   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1003     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
1004     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
1005     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
1006     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
1007     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
1008     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
1009     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
1010     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
1011     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
1012     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
1013     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
1014     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
1015     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
1016   }
1017 #endif
1018   PetscFunctionReturn(0);
1019 }
1020 
1021 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*))
1022 {
1023   PetscFunctionBegin;
1024   *ScatterAndOp = NULL;
1025   if (mtype == PETSC_MEMTYPE_HOST) {
1026     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
1027     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
1028     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
1029     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
1030     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
1031     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
1032     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
1033     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
1034     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
1035     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
1036     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
1037     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
1038     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
1039   }
1040 #if defined(PETSC_HAVE_CUDA)
1041   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
1042     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
1043     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
1044     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
1045     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1046     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1047     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1048     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1049     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1050     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1051     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1052     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1053     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1054     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1055   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1056     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1057     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1058     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1059     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1060     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1061     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1062     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1063     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1064     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1065     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1066     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1067     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1068     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1069   }
1070 #endif
1071   PetscFunctionReturn(0);
1072 }
1073 
1074 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*))
1075 {
1076   PetscFunctionBegin;
1077   *FetchAndOp = NULL;
1078   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1079   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1080 #if defined(PETSC_HAVE_CUDA)
1081   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1082   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1083 #endif
1084   PetscFunctionReturn(0);
1085 }
1086 
1087 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*))
1088 {
1089   PetscFunctionBegin;
1090   *FetchAndOpLocal = NULL;
1091   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1092   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1093 #if defined(PETSC_HAVE_CUDA)
1094   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1095   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1096 #endif
1097   PetscFunctionReturn(0);
1098 }
1099 
1100 /*=============================================================================
1101               A set of helper routines for Pack/Unpack/Scatter on GPUs
1102  ============================================================================*/
1103 #if defined(PETSC_HAVE_CUDA)
1104 /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
1105    make sure the data is ready for packing.
1106  */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  /* If root/leafdata is promised to be on the default stream, it is already ordered with our work; no sync needed */
  if (sf->use_default_stream) PetscFunctionReturn(0);
  /* Otherwise we do not know which stream produced the device data, so sync the whole device before packing */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
    cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
1116 
1117 /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
{
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Do nothing if MPI is stream-aware (it orders itself with our stream), the rootbuf is not on device,
     or there is nothing to send to remote ranks */
  if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
  /* Sync link->stream if: we launched a packing kernel on it, or we async-copied rootdata from device to
     host (non-GPU-aware MPI), or no cudaDeviceSynchronize was done earlier because the default stream was assumed */
  if (!link->rootdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi || sf->use_default_stream) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  /* Leaf-side twin of PetscSFLinkSyncStreamAfterPackRootData; see the comments there */
  if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
  if (!link->leafdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi || sf->use_default_stream) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
1141 
1142 /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
1143    does not know which stream the callers will use.
1144 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
{
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Do nothing if we are expected to put rootdata on the default stream, or rootdata is not on device */
  if (sf->use_default_stream || link->rootmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0);
  /* If we have something from local, then we called a scatter kernel (on link->stream), so we must sync it;
     if we have something from remote and we called an unpack kernel, then we must also sync it.
   */
  if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  /* Leaf-side twin of PetscSFLinkSyncStreamAfterUnpackRootData; see the comments there */
  if (sf->use_default_stream || link->leafmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0);
  if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
1170 
1171 /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
1172    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
1173 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Copy only when the root buffer is on device, MPI sees a different memory type than the buffer's
     (i.e., non-GPU-aware MPI), and there is remote data to communicate */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
    void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
    void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
    size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
    if (device2host) { /* before MPI: stage the device buffer to the host (async on link->stream) */
      cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
    } else { /* after MPI: move the received data back to the device */
      cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1195 
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  /* Leaf-side twin of PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI; see the comments there */
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
    void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
    void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
    size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
    if (device2host) { /* before MPI: stage the device buffer to the host (async on link->stream) */
      cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
    } else { /* after MPI: move the received data back to the device */
      cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1216 #else
1217 
/* Host-only build: the GPU helpers above become no-ops that expand to 0 (success),
   so call sites can still wrap them in ierr = ...;CHKERRQ(ierr); */
#define PetscSFLinkSyncDeviceBeforePackData(a,b)                0
#define PetscSFLinkSyncStreamAfterPackRootData(a,b)             0
#define PetscSFLinkSyncStreamAfterPackLeafData(a,b)             0
#define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)           0
#define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)           0
#define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
#define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1225 
1226 #endif
1227 
/* Log flops of the reduction performed while unpacking root data; no-op for plain inserts or non-scalar units */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
1245 
/* Log flops of the reduction performed while unpacking leaf data; no-op for plain inserts or non-scalar units */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
1261 
1262 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1263   Input Arguments:
1264   +sf      - The StarForest
1265   .link    - The link
1266   .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
1268   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1269   .buf     - A contiguous buffer to unpack from
1270   -op      - Operation after unpack
1271 
1272   Output Arguments:
1273   .data    - The data to unpack to
1274 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
    } else {
      /* Contiguous destination starting at <start>: one MPI_Reduce_local over all <count> units */
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}
1298 
/* Scatter counterpart of PetscSFLinkUnpackDataWithMPIReduceLocal: reduce xdata[idx[i]] into
   ydata at idy[i] (or starty+i when idy is NULL), one unit at a time via MPI_Reduce_local */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    for (i=0; i<count; i++) {
      disp = idy? idy[i] : starty + i;
      ierr = MPI_Reduce_local((const char*)xdata+idx[i]*link->unitbytes,(char*)ydata+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}
1316 
1317 /*=============================================================================
1318               Pack/Unpack/Fetch/Scatter routines
1319  ============================================================================*/
1320 
1321 /* Pack rootdata to rootbuf
1322   Input Arguments:
1323   + sf       - The SF this packing works on.
1324   . link     - It gives the memtype of the roots and also provides root buffer.
1325   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1326   - rootdata - Where to read the roots.
1327 
1328   Notes:
  When rootdata can be directly used as the root buffer, the routine is almost a no-op. After the call, root data is
  in a place the underlying MPI can readily access (whether use_gpu_aware_mpi is set or not)
1331  */
1332 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1333 {
1334   PetscErrorCode   ierr;
1335   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1336   const PetscInt   *rootindices = NULL;
1337   PetscInt         count,start;
1338   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL;
1339   PetscMemType     rootmtype = link->rootmtype;
1340 
1341   PetscFunctionBegin;
1342   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1343   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1344   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1345     ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
1346     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1347     ierr = (*Pack)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1348   }
1349   if (scope == PETSCSF_REMOTE) {
1350     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1351     ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
1352   }
1353   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1354   PetscFunctionReturn(0);
1355 }
1356 
1357 /* Pack leafdata to leafbuf */
1358 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1359 {
1360   PetscErrorCode   ierr;
1361   const PetscInt   *leafindices = NULL;
1362   PetscInt         count,start;
1363   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL;
1364   PetscMemType     leafmtype = link->leafmtype;
1365 
1366   PetscFunctionBegin;
1367   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1368   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1369   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1370     ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr);
1371     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1372     ierr = (*Pack)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1373   }
1374   if (scope == PETSCSF_REMOTE) {
1375     ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1376     ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
1377   }
1378   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1379   PetscFunctionReturn(0);
1380 }
1381 
1382 /* Unpack rootbuf to rootdata */
/* Unpack rootbuf to rootdata, combining with op; falls back to MPI_Reduce_local when no native kernel matches op */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* With non-GPU-aware MPI the received data sits in the host buffer; move it to the device first */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else {
      /* No native kernel for this op: use host indices and reduce unit-by-unit with MPI_Reduce_local */
      ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1410 
1411 /* Unpack leafbuf to leafdata */
/* Unpack leafbuf to leafdata, combining with op; leaf-side twin of PetscSFLinkUnpackRootData */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* With non-GPU-aware MPI the received data sits in the host buffer; move it to the device first */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else {
      /* No native kernel for this op: use host indices and reduce unit-by-unit with MPI_Reduce_local */
      ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1438 
1439 /* FetchAndOp rootdata with rootbuf */
/* FetchAndOp rootdata with rootbuf: combine rootbuf into rootdata with op while writing the old
   root values back into rootbuf (so they can be returned to the leaves) */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode     ierr;
  const PetscInt     *rootindices = NULL;
  PetscInt           count,start;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*) = NULL;
  PetscMemType       rootmtype = link->rootmtype;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* With non-GPU-aware MPI the received data sits in the host buffer; move it to the device first */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* rootbuf now holds the fetched old values; stage it back to the host for MPI if needed, then sync */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1466 
/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL)

   Three execution paths, chosen by memory type and index layout:
   1) rootdata and leafdata live in different memory types (host vs. device): go through
      pack (root side) -> memcpy across memory types -> unpack (leaf side).
   2) Same memory type and root indices are contiguous: the scatter degenerates to an
      unpack of a shifted rootdata pointer into the leaves.
   3) Same memory type, non-contiguous roots: a general root-to-leaf ScatterAndOp, with a
      host-side MPI_Reduce_local fallback when no ScatterAndOp kernel exists for op.
 */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* no local edges, nothing to do */
  if (rootmtype != leafmtype) { /* Uncommon case */
     /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    if (bas->rootcontig[PETSCSF_LOCAL]) { /* If root indices are contiguous, Scatter becomes Unpack */
      ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr);
      rootdata = (const char*)rootdata + bas->rootstart[PETSCSF_LOCAL]*link->unitbytes; /* Make rootdata point to start of the buffer */
      if (UnpackAndOp) {
        ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = (*UnpackAndOp)(link,count,leafstart,leafindices,sf->leafpackopt[PETSCSF_LOCAL],leafdata,rootdata);CHKERRQ(ierr);
      } else {
        /* No unpack kernel for this op: fall back to MPI_Reduce_local, which needs host-side indices */
        ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootdata,op);CHKERRQ(ierr);
      }
    } else { /* ScatterAndOp */
      ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
      if (ScatterAndOp) {
        /* count is the same on both sides; it is (re)fetched with the leaf indices below */
        ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = (*ScatterAndOp)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata);CHKERRQ(ierr);
      } else {
        /* MPI_Reduce_local fallback works on host memory; only the pieces each call uses are requested */
        ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,NULL,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
}
1511 
/* Reduce leafdata to rootdata locally (PETSCSF_LOCAL part only)

   Mirror image of PetscSFLinkBcastAndOpLocal: data flows leaf -> root.
   1) Different memory types: pack leaves -> memcpy across memory types -> unpack into roots.
   2) Same memory type and leaf indices contiguous: scatter degenerates to an unpack of a
      shifted leafdata pointer into the roots.
   3) Same memory type, non-contiguous leaves: general leaf-to-root ScatterAndOp, with a
      host-side MPI_Reduce_local fallback when no kernel exists for op.
 */
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* no local edges, nothing to do */
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    if (sf->leafcontig[PETSCSF_LOCAL]) {
      /* If leaf indices are contiguous, Scatter becomes Unpack */
      ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr);
      leafdata = (const char*)leafdata + sf->leafstart[PETSCSF_LOCAL]*link->unitbytes; /* Make leafdata point to start of the buffer */
      if (UnpackAndOp) {
        ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = (*UnpackAndOp)(link,count,rootstart,rootindices,bas->rootpackopt[PETSCSF_LOCAL],rootdata,leafdata);CHKERRQ(ierr);
      } else {
        /* No unpack kernel for this op: fall back to MPI_Reduce_local, which needs host-side indices */
        ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafdata,op);CHKERRQ(ierr);
      }
    } else {
      ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
      if (ScatterAndOp) {
        /* count comes from the root side; the leaf call only supplies start/indices */
        ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = (*ScatterAndOp)(link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata);CHKERRQ(ierr);
      } else {
        /* MPI_Reduce_local fallback works on host memory */
        ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,NULL,&leafindices);CHKERRQ(ierr);
        ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
}
1557 
1558 /* Fetch rootdata to leafdata and leafupdate locally */
1559 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1560 {
1561   PetscErrorCode       ierr;
1562   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1563   PetscInt             count,rootstart,leafstart;
1564   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1565   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*) = NULL;
1566   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1567 
1568   PetscFunctionBegin;
1569   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1570   if (rootmtype != leafmtype) {
1571    /* The local communication has to go through pack and unpack */
1572     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1573   } else {
1574     ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1575     ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
1576     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1577     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1578   }
1579   PetscFunctionReturn(0);
1580 }
1581 
/*
  Create per-rank pack/unpack optimizations based on index patterns

   Input Parameters:
  +  n       - Number of target ranks
  .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
  -  idx     - [*]   Array storing indices

   Output Parameters:
  +  out     - Pack optimizations. NULL if no optimizations.

   Notes:
   Two patterns are recognized, per rank: (1) long contiguous pieces, packed with multiple
   memcpy's (PETSCSF_PACKOPT_MULTICOPY), and (2) constant-stride indices
   (PETSCSF_PACKOPT_STRIDE). If no rank benefits, *out is NULL and nothing is allocated.
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       i,j,k,n_copies,tot_copies = 0,step;
  PetscBool      strided,optimized = PETSC_FALSE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  if (!n) {
    *out = NULL;
    PetscFunctionReturn(0);
  }

  ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
  ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr);
  ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr);
  /* Make opt->offset[] zero-based. If one calls this routine with non-zero offset[0], one should use packing routine in this way, Pack(count,idx+offset[0],packopt,...) */
  if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];}

  opt->n = n;

  /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with multiple memcpy's ) */
  for (i=0; i<n; i++) { /* for each target processor */
    /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
    n_copies = 1;
    for (j=offset[i]; j<offset[i+1]-1; j++) {
      if (idx[j]+1 != idx[j+1]) n_copies++; /* a break in consecutiveness starts a new piece */
    }
    /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
       then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
     */
    if ((offset[i+1]-offset[i])/n_copies >= 32) {
      opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
      optimized    = PETSC_TRUE;
      tot_copies  += n_copies;
    }
  }

  /* Setup memcpy plan for each contiguous piece */
  k    = 0; /* k-th copy */
  ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
  for (i=0; i<n; i++) { /* for each target processor */
    if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
      /* Second scan of the same index range; must find the same pieces as the counting pass above */
      n_copies           = 1;
      opt->copy_start[k] = offset[i] - offset[0]; /* copy_start[] is zero-based, like opt->offset[] */
      for (j=offset[i]; j<offset[i+1]-1; j++) {
        if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
          n_copies++;
          opt->copy_start[k+1] = j-offset[0]+1;
          opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
          k++;
        }
      }
      /* Set copy length of the last copy for this target (j == offset[i+1]-1 after the loop) */
      opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
      k++;
    }
    /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
    opt->copy_offset[i+1] = k;
  }

  /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
  for (i=0; i<n; i++) { /* for each remote */
    if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
      strided = PETSC_TRUE;
      step    = idx[offset[i]+1] - idx[offset[i]]; /* safe: the rank has >=16 indices, so idx[offset[i]+1] exists */
      for (j=offset[i]; j<offset[i+1]-1; j++) {
        if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
      }
      if (strided) {
        opt->type[i]         = PETSCSF_PACKOPT_STRIDE;
        opt->stride_step[i]  = step;
        opt->stride_n[i]     = offset[i+1] - offset[i];
        optimized            = PETSC_TRUE;
      }
    }
  }
  /* If no rank gets optimized, free arrays to save memory */
  if (!optimized) {
    ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
    ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else *out = opt;
  PetscFunctionReturn(0);
}
1679 
1680 PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out)
1681 {
1682   PetscErrorCode ierr;
1683   PetscSFPackOpt opt = *out;
1684 
1685   PetscFunctionBegin;
1686   if (opt) {
1687     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
1688     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
1689     ierr = PetscFree(opt);CHKERRQ(ierr);
1690     *out = NULL;
1691   }
1692   PetscFunctionReturn(0);
1693 }
1694 
/* Analyze the SF's leaf/root index layout and set up the packing-related fields:
   buffer lengths, start indices, contiguity flags, per-rank pack optimizations, and
   (with CUDA) duplicate-index flags used to choose between atomic and plain unpack kernels. */
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  /* roffset[] partitions the leaf indices: [0, ndranks) are distinguished (self) ranks, the rest are remote */
  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous? */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  /* Same contiguity analysis as the leaf side, now on irootloc[] partitioned by ioffset[] */
  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_CUDA)
    /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
  if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
#endif

  PetscFunctionReturn(0);
}
1757 
1758 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1759 {
1760   PetscErrorCode ierr;
1761   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1762   PetscInt       i;
1763 
1764   PetscFunctionBegin;
1765   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1766     ierr = PetscSFDestroyPackOpt(&sf->leafpackopt[i]);CHKERRQ(ierr);
1767     ierr = PetscSFDestroyPackOpt(&bas->rootpackopt[i]);CHKERRQ(ierr);
1768   }
1769   PetscFunctionReturn(0);
1770 }
1771