xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision cd62000487352d7c447823794a883667dfb859d7)
1 
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6 #include <cuda_runtime.h>
7 #endif
8 /*
9  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
10  * therefore we pack data types manually. This file defines packing routines for the standard data types.
11  */
12 
/* CPPJoin4(a,b,c,d) pastes its four arguments into one identifier a_b_c_d.
   Used below to generate unique routine names such as Pack_PetscInt_1_1. */
#define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
14 
/* Operations working like s += t */
#define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
#define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
#define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR; the op argument is ignored */
#define OP_ASSIGN(op,s,t)   do {(s) = (t);} while(0)               /* plain assignment; the op argument is ignored */
/* Ref MPI MAXLOC/MINLOC: on a tie in the value field u, keep the smaller index i;
   otherwise keep (value,index) pair t when s does not win under op (> for MAXLOC, < for MINLOC) */
#define OP_XLOC(op,s,t) \
  do {                                                 \
    if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
    else if (!((s).u op (t).u)) s = t;                 \
  } while(0)
26 
/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS        Block size for vectorization. It is a factor of bs.
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[].
   .start     If indices are contiguous, it is the first index; otherwise, not used.
   .idx       Indices of entries to be packed. NULL means contiguous indices, that is [start,start+count).
   .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
   .opt       Per-remote pack optimization plan. NULL means no such plan.
   .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
   -packed    Address of the packed data.
 */
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,const void *unpacked,void *packed) \
  {                                                                                                          \
    PetscErrorCode ierr;                                                                                     \
    const Type     *u = (const Type*)unpacked,*u2;                                                           \
    Type           *p = (Type*)packed,*p2;                                                                   \
    PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
    const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) {ierr = PetscArraycpy(p,u+start*bs,MBS*count);CHKERRQ(ierr);}  /* Indices are contiguous */    \
    else if (!opt) { /* No optimizations available */                                                        \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
          for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
    } else {                                                                                                 \
      for (r=0; r<opt->n; r++) {                                                                             \
        p2  = p + opt->offset[r]*MBS;                                                                        \
        if (opt->type[r] == PETSCSF_PACKOPT_NONE) { /* r-th remote has no optimization; gather one by one */ \
          idx2 = idx + opt->offset[r];                                                                       \
          for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++)                                                                           \
                p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k];                                                    \
        } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { /* indices form several contiguous pieces */ \
          for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
            u2   = u + idx[opt->copy_start[i]]*MBS;                                                          \
            l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */                       \
            ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr);                                                     \
            p2  += l;                                                                                        \
          }                                                                                                  \
        } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { /* indices form an arithmetic progression */    \
          u2   = u + idx[opt->offset[r]]*MBS;                                                                \
          step = opt->stride_step[r];                                                                        \
          for (i=0; i<opt->stride_n[r]; i++)                                                                 \
            for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j];                                            \
        } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimization type %D",opt->type[r]);  \
      }                                                                                                      \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
85 
/* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
                    and inserts into a sparse array.

   Arguments:
  +Type       Type of the data
  .BS         Block size for vectorization
  -EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.

  Notes:
   This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
 */
#define DEF_UnpackFunc(Type,BS,EQ)               \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \
  {                                                                                                          \
    PetscErrorCode ierr;                                                                                     \
    Type           *u = (Type*)unpacked,*u2;                                                                 \
    const Type     *p = (const Type*)packed,*p2;                                                             \
    PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
    const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) { /* Indices are contiguous */                                                                 \
      u2 = u + start*bs;                                                                                     \
      if (u2 != p) {ierr = PetscArraycpy(u2,p,count*MBS);CHKERRQ(ierr);} /* skip self-copy */                \
    } else if (!opt) { /* No optimizations available */                                                      \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
    } else {                                                                                                 \
      for (r=0; r<opt->n; r++) {                                                                             \
        p2 = p + opt->offset[r]*MBS;                                                                         \
        if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
          idx2 = idx + opt->offset[r];                                                                       \
          for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++) u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                 \
        } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
          for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
            u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
            l  = opt->copy_length[i]*MBS;                                                                    \
            ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr);                                                     \
            p2 += l;                                                                                         \
          }                                                                                                  \
        } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
          u2   = u + idx[opt->offset[r]]*MBS;                                                                \
          step = opt->stride_step[r];                                                                        \
          for (i=0; i<opt->stride_n[r]; i++)                                                                 \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++) u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                 \
        } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimization type %D",opt->type[r]);  \
      }                                                                                                      \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
140 
/* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert

   Arguments (in macro-parameter order):
  +Type       Type of the data
  .BS         Block size for vectorization
  .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
  .Opname     Name of the Op, such as Add, Mult, LAND, etc.
  .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
  -OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
 */
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \
  {                                                                                                          \
    Type           *u = (Type*)unpacked,*u2;                                                                 \
    const Type     *p = (const Type*)packed,*p2;                                                             \
    PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
    const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) { /* Indices are contiguous */                                                                 \
      u += start*bs;                                                                                         \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++)                                                                               \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
    } else if (!opt) { /* No optimizations available */                                                      \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++)                                                                               \
              OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                              \
    } else {                                                                                                 \
      for (r=0; r<opt->n; r++) {                                                                             \
        p2 = p + opt->offset[r]*MBS;                                                                         \
        if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
          idx2 = idx + opt->offset[r];                                                                       \
          for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++)                                                                           \
                OpApply(Op,u[idx2[i]*MBS+j*BS+k],p2[i*MBS+j*BS+k]);                                          \
        } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
          for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
            u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
            l  = opt->copy_length[i]*MBS;                                                                    \
            for (j=0; j<l; j++) OpApply(Op,u2[j],p2[j]);                                                     \
            p2 += l;                                                                                         \
          }                                                                                                  \
        } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
          u2   = u + idx[opt->offset[r]]*MBS;                                                                \
          step = opt->stride_step[r];                                                                        \
          for (i=0; i<opt->stride_n[r]; i++)                                                                 \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++)                                                                           \
                OpApply(Op,u2[i*step*MBS+j*BS+k],p2[i*MBS+j*BS+k]);                                          \
        } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimization type %D",opt->type[r]);  \
      }                                                                                                      \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
199 
/* DEF_FetchAndOp - macro defining a FetchAndOp routine: for each entry, the old value of the
   unpacked data u is saved, u is updated with Op applied to the packed value, and the old value
   is written back into the packed buffer p (i.e., p fetches the pre-op value of u).
   Arguments are the same as DEF_UnpackAndOp. Note: no per-remote optimization plan is used here. */
#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,void *packed) \
  {                                                                                                          \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,t;                                                \
    PetscInt       i,j,k,bs=link->bs;                                                                        \
    const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) { /* Indices are contiguous */                                                                 \
      u += start*bs;                                                                                         \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) {                                                                             \
            t = u[i*MBS+j*BS+k]; /* fetch old value before applying op */                                    \
            OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
            p[i*MBS+j*BS+k] = t;                                                                             \
          }                                                                                                  \
    } else {                                                                                                 \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) {                                                                             \
              t = u[idx[i]*MBS+j*BS+k];                                                                      \
              OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                              \
              p[i*MBS+j*BS+k] = t;                                                                           \
          }                                                                                                  \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
228 
/* DEF_ScatterAndOp - macro defining a ScatterAndOp routine that applies Op directly from a sparse
   source array x (gathered via idx) into a sparse destination array y (scattered via idy), without
   going through an intermediate packed buffer. NULL idy means contiguous indices [starty,starty+count). */
#define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt startx,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata) \
  {                                                                                                          \
    const Type     *x = (const Type*)xdata;                                                                  \
    Type           *y = (Type*)ydata;                                                                        \
    PetscInt       i,j,k,bs = link->bs;                                                                      \
    const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
    const PetscInt MBS = M*BS;                                                                               \
    PetscFunctionBegin;                                                                                      \
    if (!idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use UnpackAndOp instead of ScatterAndOp");\
    if (!idy) { /* Destination indices are contiguous */                                                     \
      y += starty*bs;                                                                                        \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++)                                                                               \
            OpApply(Op,y[i*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]);                                                \
    } else {                                                                                                 \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++)                                                                               \
            OpApply(Op,y[idy[i]*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]);                                           \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
253 
/* DEF_FetchAndOpLocal - macro defining a FetchAndOpLocal routine for the local (on-process) part:
   for each i, leafupdate receives the old rootdata value, then rootdata is updated with Op applied
   to leafdata. NULL leafindices means contiguous leaf indices [leafstart,leafstart+count). */
#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,const PetscInt *rootindices,void *rootdata,PetscInt leafstart,const PetscInt *leafindices,const void *leafdata,void *leafupdate) \
  {                                                                                                          \
    Type           *x = (Type*)rootdata,*y2 = (Type*)leafupdate;                                             \
    const Type     *y = (const Type*)leafdata;                                                               \
    PetscInt       i,j,k,bs = link->bs;                                                                      \
    const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
    const PetscInt MBS = M*BS;                                                                               \
    PetscFunctionBegin;                                                                                      \
    if (!rootindices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use FetchAndOp instead of FetchAndOpLocal");\
    if (!leafindices) { /* Leaf indices are contiguous */                                                    \
      y += leafstart*bs; y2 += leafstart*bs;                                                                 \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) {                                                                             \
            y2[i*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k]; /* fetch old root value */                      \
            OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[i*MBS+j*BS+k]);                                        \
          }                                                                                                  \
    } else {                                                                                                 \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) {                                                                             \
            y2[leafindices[i]*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k];                                    \
            OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[leafindices[i]*MBS+j*BS+k]);                           \
          }                                                                                                  \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
282 
283 
/* Pack, Unpack/Fetch ops: instantiate Pack/UnpackAndInsert/ScatterAndInsert for a Type and
   define the PackInit_Pack routine that registers them on a PetscSFLink */
#define DEF_Pack(Type,BS,EQ)                                                      \
  DEF_PackFunc(Type,BS,EQ)                                                        \
  DEF_UnpackFunc(Type,BS,EQ)                                                      \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
    link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
    link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
    link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
  }
294 
/* Add, Mult ops: instantiate Add/Mult unpack, fetch and scatter routines for a Type and
   define the PackInit_Add routine that registers them on a PetscSFLink */
#define DEF_Add(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
  }
311 
/* Max, Min ops: instantiate Max/Min unpack and scatter routines for a Type and
   define the PackInit_Compare routine that registers them on a PetscSFLink */
#define DEF_Cmp(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
    link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
    link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
    link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
    link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
  }
324 
/* Logical ops: instantiate LAND/LOR/LXOR unpack and scatter routines for a Type and
   define the PackInit_Logical routine that registers them on a PetscSFLink.

  The operator passed to OP_LXOR should be empty but is ||. It is not used. Put here to avoid
  the compilation warning "empty macro arguments are undefined in ISO C90"
 */
#define DEF_Log(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
    link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
    link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
    link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
    link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
    link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
    link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
  }
344 
/* Bitwise ops: instantiate BAND/BOR/BXOR unpack and scatter routines for a Type and
   define the PackInit_Bitwise routine that registers them on a PetscSFLink */
#define DEF_Bit(Type,BS,EQ)                                                       \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
    link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
    link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
    link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
    link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
    link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
    link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
  }
361 
/* Maxloc, Minloc ops */
/* DEF_Xloc - generate Unpack/Scatter routines implementing MPI_MAXLOC/MPI_MINLOC semantics on a
   pair <Type> (see OP_XLOC above); the generated Max/Min routines are installed under the
   *loc names of the link's host function table */
#define DEF_Xloc(Type,BS,EQ)                                                      \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
  }
374 
/* DEF_IntegerType - full op set for integer types: pack, add-family, compare (min/max),
   logical and bitwise ops; PackInit_IntegerType_* chains all the sub-init routines */
#define DEF_IntegerType(Type,BS,EQ)                                               \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Add(Type,BS,EQ)                                                             \
  DEF_Cmp(Type,BS,EQ)                                                             \
  DEF_Log(Type,BS,EQ)                                                             \
  DEF_Bit(Type,BS,EQ)                                                             \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
  }
388 
/* DEF_RealType - op set for real types: pack, add-family and compare (min/max);
   no logical/bitwise ops, which are undefined for floating point in MPI */
#define DEF_RealType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Add(Type,BS,EQ)                                                             \
  DEF_Cmp(Type,BS,EQ)                                                             \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
  }
398 
#if defined(PETSC_HAVE_COMPLEX)
/* DEF_ComplexType - op set for complex types: pack and add-family only;
   min/max and logical/bitwise ops are undefined for complex numbers */
#define DEF_ComplexType(Type,BS,EQ)                                               \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Add(Type,BS,EQ)                                                             \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
  }
#endif
408 
/* DEF_DumbType - op set for opaque types whose basic type is unknown: pack/copy only,
   since no reduction op can be applied to a raw byte stream */
#define DEF_DumbType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
  }
414 
/* Maxloc, Minloc */
/* DEF_PairType - op set for (value,index) pair types used with MPI_MAXLOC/MPI_MINLOC:
   pack plus the Xloc (maxloc/minloc) routines */
#define DEF_PairType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Xloc(Type,BS,EQ)                                                            \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
  }
423 
/* Instantiate the pack/unpack routines for each supported block size BS.
   EQ=1 variants are specialized for unit == BS basic types; EQ=0 handles any multiple of BS. */
DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif
443 
/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)
464 
/* Real and (optionally) complex instantiations, mirroring the integer ones above */
DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif
484 
/* Pair types matching MPI_2INT/MPIU_2INT layout: u is the value, i the location index */
#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u; int i;}           PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)
490 
/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)
508 
509 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
510 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
511 {
512   int ierr;
513   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
514   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
515   return MPI_SUCCESS;
516 }
517 #endif
518 
519 /*
   This routine creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable one, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
527 
528    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
529 
   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.
531 
532    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
533    need pack/unpack data.
534 */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode    ierr;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink       *p,link;
  PetscSFDirection  direction;
  MPI_Request       *reqs = NULL;
  PetscBool         match,rootdirect[2],leafdirect[2];
  PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdirect with the given sf, sfop and op? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  /* Without gpu-aware MPI, all traffic handed to MPI lives in host memory regardless of where root/leafdata are */
  if (use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests matches with current.
         If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
      */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  /* Cache miss: build a brand-new link */
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  nreqs = (nrootreqs+nleafreqs)*8; /* 8 request sets: 2 directions x 2 memory types x 2 (data passed direct to MPI or not) */
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:
  if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) { /* Same trick on the leaf side: leafdata itself serves as leafbuf */
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

  /* Allocate buffers on host for buffering data on device in case one does not use gpu-aware MPI */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if(!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi  = rootdirect_mpi;
  link->leafdirect_mpi  = leafdirect_mpi;
  link->rootmtype       = rootmtype;
  link->leafmtype       = leafmtype;
  link->rootmtype_mpi   = rootmtype_mpi;
  link->leafmtype_mpi   = leafmtype_mpi;

  /* Push the link onto the in-use list; PetscSFLinkGetInUse finds it there by (unit,rootdata,leafdata) */
  link->next            = bas->inuse;
  bas->inuse            = link;
  *mylink               = link;
  PetscFunctionReturn(0);
}
685 
/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode       ierr;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt       *rootoffset,*leafoffset;
  PetscMPIInt          n;
  MPI_Aint             disp;
  MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype         unit = link->unit;
  const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) { /* Reducing: roots receive from leaves */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { /* Skip the first ndrootranks (onnode/distinguished) ranks, which need no MPI */
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { /* Broadcasting: roots send to leaves */
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) { /* Reducing: leaves send to roots */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { /* Broadcasting: leaves receive from roots */
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  /* Hand back whichever of the four outputs the caller asked for (any may be NULL) */
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}
747 
748 
749 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
750 {
751   PetscErrorCode    ierr;
752   PetscSFLink       link,*p;
753   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
754 
755   PetscFunctionBegin;
756   /* Look for types in cache */
757   for (p=&bas->inuse; (link=*p); p=&link->next) {
758     PetscBool match;
759     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
760     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
761       switch (cmode) {
762       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
763       case PETSC_USE_POINTER: break;
764       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
765       }
766       *mylink = link;
767       PetscFunctionReturn(0);
768     }
769   }
770   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
771   PetscFunctionReturn(0);
772 }
773 
774 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
775 {
776   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
777 
778   PetscFunctionBegin;
779   (*link)->rootdata = NULL;
780   (*link)->leafdata = NULL;
781   (*link)->next     = bas->avail;
782   bas->avail        = *link;
783   *link             = NULL;
784   PetscFunctionReturn(0);
785 }
786 
/* Destroy all links chained in 'avail' */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode    ierr;
  PetscSFLink       link = *avail,next;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* Must match the allocation size in PetscSFLinkCreate */

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    /* Only non-builtin (duplicated) datatypes are owned by the link and need freeing */
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { /* Free both local and remote buffers, on both mtypes */
#if defined(PETSC_HAVE_CUDA)
      ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      /* The stream is per-link (not per-buffer); the guard + NULL assignment makes this run at most once */
      if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
#endif
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}
817 
#if defined(PETSC_USE_DEBUG)
/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode    ierr;
  PetscSFLink       link,*p;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscBool         match;

  PetscFunctionBegin;
  /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
     the potential overlapping since this process does not participate in communication. Overlapping is harmless.
  */
  if (rootdata || leafdata) {
    for (p=&bas->inuse; (link=*p); p=&link->next) { /* An in-use link with the same keys means an unfinished Begin/End pair */
      ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
      if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
    }
  }
  PetscFunctionReturn(0);
}
#endif
840 
841 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
842 {
843   PetscErrorCode ierr;
844   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
845   PetscBool      is2Int,is2PetscInt;
846   PetscMPIInt    ni,na,nd,combiner;
847 #if defined(PETSC_HAVE_COMPLEX)
848   PetscInt       nPetscComplex=0;
849 #endif
850 
851   PetscFunctionBegin;
852   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
853   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
854   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
855   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
856   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
857   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
858 #if defined(PETSC_HAVE_COMPLEX)
859   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
860 #endif
861   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
862   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
864   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
865   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
866   link->bs = 1; /* default */
867 
868   if (is2Int) {
869     PackInit_PairType_int_int_1_1(link);
870     link->bs        = 1;
871     link->unitbytes = 2*sizeof(int);
872     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
873     link->basicunit = MPI_2INT;
874     link->unit      = MPI_2INT;
875   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
876     PackInit_PairType_PetscInt_PetscInt_1_1(link);
877     link->bs        = 1;
878     link->unitbytes = 2*sizeof(PetscInt);
879     link->basicunit = MPIU_2INT;
880     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
881     link->unit      = MPIU_2INT;
882   } else if (nPetscReal) {
883     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
884     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
885     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
886     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
887     link->bs        = nPetscReal;
888     link->unitbytes = nPetscReal*sizeof(PetscReal);
889     link->basicunit = MPIU_REAL;
890     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
891   } else if (nPetscInt) {
892     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
893     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
894     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
895     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
896     link->bs        = nPetscInt;
897     link->unitbytes = nPetscInt*sizeof(PetscInt);
898     link->basicunit = MPIU_INT;
899     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
900 #if defined(PETSC_USE_64BIT_INDICES)
901   } else if (nInt) {
902     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
903     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
904     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
905     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
906     link->bs        = nInt;
907     link->unitbytes = nInt*sizeof(int);
908     link->basicunit = MPI_INT;
909     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
910 #endif
911   } else if (nSignedChar) {
912     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
913     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
914     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
915     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
916     link->bs        = nSignedChar;
917     link->unitbytes = nSignedChar*sizeof(SignedChar);
918     link->basicunit = MPI_SIGNED_CHAR;
919     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
920   }  else if (nUnsignedChar) {
921     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
922     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
923     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
924     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
925     link->bs        = nUnsignedChar;
926     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
927     link->basicunit = MPI_UNSIGNED_CHAR;
928     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
929 #if defined(PETSC_HAVE_COMPLEX)
930   } else if (nPetscComplex) {
931     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
932     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
933     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
934     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
935     link->bs        = nPetscComplex;
936     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
937     link->basicunit = MPIU_COMPLEX;
938     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
939 #endif
940   } else {
941     MPI_Aint lb,nbyte;
942     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
943     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
944     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
945       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
946       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
947       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
948       link->bs        = nbyte;
949       link->unitbytes = nbyte;
950       link->basicunit = MPI_BYTE;
951     } else {
952       nInt = nbyte / sizeof(int);
953       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
954       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
955       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
956       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
957       link->bs        = nInt;
958       link->unitbytes = nbyte;
959       link->basicunit = MPI_INT;
960     }
961     if (link->isbuiltin) link->unit = unit;
962   }
963 
964   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
965   PetscFunctionReturn(0);
966 }
967 
/* Look up in <link> the UnpackAndOp routine matching the given MPI_Op and memory type.

   Host memory selects a h_* routine. With CUDA, device memory selects either the plain d_* routine
   or the atomic da_* routine depending on <atomic>; callers pass atomic=PETSC_TRUE when the unpack may
   update the same destination entry concurrently (e.g., duplicated indices).

   *UnpackAndOp is set to NULL when no matching routine exists (e.g., a user-defined MPI_Op); callers
   are expected to fall back to MPI_Reduce_local in that case.
*/
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*))
{
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    /* Atomic variants used when destination entries may be updated concurrently */
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}
1020 
/* Look up in <link> the ScatterAndOp routine matching the given MPI_Op and memory type.

   Selection mirrors PetscSFLinkGetUnpackAndOp(): host memory yields a h_* routine; with CUDA,
   device memory yields d_* (non-atomic) or da_* (atomic) depending on <atomic>.

   *ScatterAndOp is set to NULL when no matching routine exists; callers then fall back to
   MPI_Reduce_local-based scattering.
*/
PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*))
{
  PetscFunctionBegin;
  *ScatterAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    /* Atomic variants used when destination entries may be updated concurrently */
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}
1073 
1074 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*))
1075 {
1076   PetscFunctionBegin;
1077   *FetchAndOp = NULL;
1078   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1079   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1080 #if defined(PETSC_HAVE_CUDA)
1081   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1082   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1083 #endif
1084   PetscFunctionReturn(0);
1085 }
1086 
1087 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*))
1088 {
1089   PetscFunctionBegin;
1090   *FetchAndOpLocal = NULL;
1091   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1092   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1093 #if defined(PETSC_HAVE_CUDA)
1094   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1095   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1096 #endif
1097   PetscFunctionReturn(0);
1098 }
1099 
1100 /*=============================================================================
1101               A set of helper routines for Pack/Unpack/Scatter on GPUs
1102  ============================================================================*/
1103 #if defined(PETSC_HAVE_CUDA)
1104 /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
1105    make sure the data is ready for packing.
1106  */
/* Synchronize the whole device before packing, so root/leafdata computed on an unknown stream is complete */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  /* When the caller guarantees data is produced on the default stream, SF skips the (expensive) full-device sync */
  if (sf->use_default_stream) PetscFunctionReturn(0);
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
    cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
1116 
1117 /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
/* After packing root data, synchronize link->stream so the remote rootbuf is safe to hand to MPI */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
{
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* rootdata is on device && send non-empty to remote && (we called a packing kernel || we async-copied rootdata from device to host) */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && bas->rootbuflen[PETSCSF_REMOTE] && (!link->rootdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi)) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
/* After packing leaf data, synchronize link->stream so the remote leafbuf is safe to hand to MPI */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  /* Same condition as the root variant: device data, non-empty remote buffer, and either a pack kernel or a D2H copy was issued */
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->leafbuflen[PETSCSF_REMOTE] && (!link->leafdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi)) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
1137 
1138 /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
1139    does not know which stream the callers will use.
1140 */
/* After unpacking into rootdata (local or remote), synchronize link->stream so callers on unknown streams see the result */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
{
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Callers on the default stream are already ordered with our kernels; no sync needed */
  if (sf->use_default_stream) PetscFunctionReturn(0);
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (bas->rootbuflen[PETSCSF_LOCAL] || bas->rootbuflen[PETSCSF_REMOTE])) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
/* After unpacking into leafdata (local or remote), synchronize link->stream so callers on unknown streams see the result */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
{
  PetscFunctionBegin;
  /* Callers on the default stream are already ordered with our kernels; no sync needed */
  if (sf->use_default_stream) PetscFunctionReturn(0);
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (sf->leafbuflen[PETSCSF_LOCAL] || sf->leafbuflen[PETSCSF_REMOTE])) {
    cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
1161 
1162 /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
1163    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
1164 */
/* Stage the remote rootbuf between host and device when MPI cannot access device memory directly.

   device2host = PETSC_TRUE copies device -> host (before MPI sends); PETSC_FALSE copies host -> device
   (after MPI receives). The copy is asynchronous on link->stream; callers sync the stream separately.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* rootmtype_mpi != rootmtype marks the non-GPU-aware-MPI case where a host mirror buffer is in use */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
    void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
    void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
    size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
    if (device2host) {
      cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1186 
/* Leaf-side counterpart of PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI():
   stage the remote leafbuf between host and device when MPI cannot access device memory directly.
   The copy is asynchronous on link->stream; callers sync the stream separately.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  /* leafmtype_mpi != leafmtype marks the non-GPU-aware-MPI case where a host mirror buffer is in use */
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
    void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
    void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
    size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
    if (device2host) {
      cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
    } else {
      cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1207 #else
1208 
1209 #define PetscSFLinkSyncDeviceBeforePackData(a,b)                0
1210 #define PetscSFLinkSyncStreamAfterPackRootData(a,b)             0
1211 #define PetscSFLinkSyncStreamAfterPackLeafData(a,b)             0
1212 #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)           0
1213 #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)           0
1214 #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1215 #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1216 
1217 #endif
1218 
/* Log flops for a reduction into rootdata. Only reductions on PetscScalar units are counted;
   MPIU_REPLACE (plain insert) and non-scalar basic units contribute no flops.
   One flop is counted per scalar in the unpacked buffer; logged as GPU or CPU flops by rootmtype. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;


  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
1236 
/* Log flops for a reduction into leafdata; leaf-side counterpart of PetscSFLinkLogFlopsAfterUnpackRootData(). */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_CUDA)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
1252 
1253 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1254   Input Arguments:
1255   +sf      - The StarForest
1256   .link    - The link
1257   .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
1259   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1260   .buf     - A contiguous buffer to unpack from
1261   -op      - Operation after unpack
1262 
1263   Output Arguments:
1264   .data    - The data to unpack to
1265 */
/* Fallback unpack using MPI_Reduce_local (see the block comment above for argument semantics).
   With indices, each unit is reduced individually; in the contiguous case a single call reduces
   all <count> units at once. <sf> is unused but kept for a uniform call signature. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
    } else {
      /* Contiguous destination [start, start+count): one bulk reduction; count must fit in PetscMPIInt */
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}
1289 
/* Fallback scatter using MPI_Reduce_local: for each i, reduce xdata[idx[i]] into ydata[idy[i]]
   (or ydata[starty+i] when idy is NULL, i.e., contiguous destination).
   NOTE(review): idx is dereferenced unconditionally, unlike the unpack fallback which handles
   indices==NULL; presumably callers always supply idx here -- verify against call sites.
   <sf> is unused but kept for a uniform call signature. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    for (i=0; i<count; i++) {
      disp = idy? idy[i] : starty + i;
      ierr = MPI_Reduce_local((const char*)xdata+idx[i]*link->unitbytes,(char*)ydata+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}
1307 
1308 /*=============================================================================
1309               Pack/Unpack/Fetch/Scatter routines
1310  ============================================================================*/
1311 
1312 /* Pack rootdata to rootbuf
1313   Input Arguments:
1314   + sf       - The SF this packing works on.
1315   . link     - It gives the memtype of the roots and also provides root buffer.
1316   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1317   - rootdata - Where to read the roots.
1318 
1319   Notes:
  When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
  in a place where the underlying MPI is ready to access (whether use_gpu_aware_mpi or not)
1322  */
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  /* For remote scope, rootdata may still be being computed on an unknown stream; sync the device first */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* Stage the packed buffer on the host if MPI is not GPU-aware, then make sure it is ready for MPI */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1347 
1348 /* Pack leafdata to leafbuf */
/* Pack leafdata to leafbuf; leaf-side counterpart of PetscSFLinkPackRootData(), see its notes above */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* Stage the packed buffer on the host if MPI is not GPU-aware, then make sure it is ready for MPI */
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1372 
1373 /* Unpack rootbuf to rootdata */
/* Unpack rootbuf into rootdata, combining entries with <op>.
   Uses a native UnpackAndOp routine when one matches (op, memtype, atomicity); otherwise falls
   back to MPI_Reduce_local on the host. */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* For remote scope, move the received buffer back to the device first if MPI is not GPU-aware */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else {
      /* No native routine for this op: fall back to host-side MPI_Reduce_local with host indices.
         NOTE(review): the fallback reads link->rootbuf[scope][rootmtype]; presumably rootmtype is
         host whenever UnpackAndOp is NULL -- verify for device memory with user-defined ops */
      ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1401 
1402 /* Unpack leafbuf to leafdata */
/* Unpack leafbuf into leafdata, combining entries with <op>; leaf-side counterpart of
   PetscSFLinkUnpackRootData(), with the same host MPI_Reduce_local fallback. */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* For remote scope, move the received buffer back to the device first if MPI is not GPU-aware */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else {
      /* No native routine for this op: fall back to host-side MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1429 
1430 /* FetchAndOp rootdata with rootbuf */
/* Atomically fetch the old rootdata into rootbuf while applying <op> (sum only) to rootdata.
   For remote scope, rootbuf is staged host<->device around the operation if MPI is not GPU-aware,
   since the fetched old values are subsequently sent back to leaves. */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode     ierr;
  const PetscInt     *rootindices = NULL;
  PetscInt           count,start;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*) = NULL;
  PetscMemType       rootmtype = link->rootmtype;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* Bring the received buffer to the device first if MPI is not GPU-aware */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* The buffer now holds fetched old values destined for remote leaves; stage it back for MPI */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1457 
/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL)

   Three paths, chosen by memory types and index layout:
   1) rootmtype != leafmtype: go through pack (root side) -> cross-memtype memcpy -> unpack (leaf side);
   2) root indices contiguous: a Scatter degenerates into an Unpack of a shifted rootdata pointer;
   3) otherwise: use a ScatterAndOp kernel.
   In paths 2/3, when no kernel exists for (mtype,op,unit), fall back to MPI_Reduce_local
   on host indices.
*/
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* No local communication */
  if (rootmtype != leafmtype) { /* Uncommon case */
     /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = PetscMemcpyWithMemType(leafmtype,rootmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    if (bas->rootcontig[PETSCSF_LOCAL]) { /* If root indices are contiguous, Scatter becomes Unpack */
      ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr);
      rootdata = (const char*)rootdata + bas->rootstart[PETSCSF_LOCAL]*link->unitbytes; /* Make rootdata point to start of the buffer */
      if (UnpackAndOp) {
        ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = (*UnpackAndOp)(link,count,leafstart,leafindices,sf->leafpackopt[PETSCSF_LOCAL],leafdata,rootdata);CHKERRQ(ierr);
      } else {
        /* No kernel available: fall back to MPI_Reduce_local, which needs host indices */
        ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootdata,op);CHKERRQ(ierr);
      }
    } else { /* ScatterAndOp */
      ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
      if (ScatterAndOp) {
        ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = (*ScatterAndOp)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata);CHKERRQ(ierr);
      } else {
        /* No kernel available: fall back to MPI_Reduce_local on host indices */
        ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,NULL,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
}
1502 
/* Reduce leafdata to rootdata locally

   Mirror of PetscSFLinkBcastAndOpLocal but in the leaf->root direction:
   1) rootmtype != leafmtype: pack leaves -> cross-memtype memcpy -> unpack into roots;
   2) leaf indices contiguous: Scatter degenerates into an Unpack of a shifted leafdata pointer;
   3) otherwise: ScatterAndOp.
   In paths 2/3, when no kernel exists for (mtype,op,unit), fall back to MPI_Reduce_local
   on host indices.
*/
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* No local communication */
  if (rootmtype != leafmtype) { /* Uncommon case */
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = PetscMemcpyWithMemType(rootmtype,leafmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    if (sf->leafcontig[PETSCSF_LOCAL]) {
      /* If leaf indices are contiguous, Scatter becomes Unpack */
      ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr);
      leafdata = (const char*)leafdata + sf->leafstart[PETSCSF_LOCAL]*link->unitbytes; /* Make leafdata point to start of the buffer */
      if (UnpackAndOp) {
        ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = (*UnpackAndOp)(link,count,rootstart,rootindices,bas->rootpackopt[PETSCSF_LOCAL],rootdata,leafdata);CHKERRQ(ierr);
      } else {
        /* No kernel available: fall back to MPI_Reduce_local, which needs host indices */
        ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafdata,op);CHKERRQ(ierr);
      }
    } else { /* ScatterAndOp */
      ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
      if (ScatterAndOp) {
        ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
        ierr = (*ScatterAndOp)(link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata);CHKERRQ(ierr);
      } else {
        /* No kernel available: fall back to MPI_Reduce_local on host indices */
        ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
        ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,NULL,&leafindices);CHKERRQ(ierr);
        ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
}
1548 
/* Fetch rootdata to leafdata and leafupdate locally

   Local fetch-and-op: for each local (root,leaf) pair, combine leafdata into rootdata with op
   and store the fetched root values into leafupdate, using a single fused kernel.
   The mixed-memtype case (rootdata and leafdata on different sides of CPU/GPU) is not supported
   and raises PETSC_ERR_SUP.
*/
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* No local communication */
  if (rootmtype != leafmtype) {
   /* The local communication has to go through pack and unpack */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1572 
1573 /*
1574   Create per-rank pack/unpack optimizations based on indice patterns
1575 
1576    Input Parameters:
1577   +  n       - Number of target ranks
1578   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1579   -  idx     - [*]   Array storing indices
1580 
1581    Output Parameters:
1582   +  opt     - Pack optimizations. NULL if no optimizations.
1583 */
1584 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1585 {
1586   PetscErrorCode ierr;
1587   PetscInt       i,j,k,n_copies,tot_copies = 0,step;
1588   PetscBool      strided,optimized = PETSC_FALSE;
1589   PetscSFPackOpt opt;
1590 
1591   PetscFunctionBegin;
1592   if (!n) {
1593     *out = NULL;
1594     PetscFunctionReturn(0);
1595   }
1596 
1597   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
1598   ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr);
1599   ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr);
1600   /* Make opt->offset[] zero-based. If one calls this routine with non-zero offset[0], one should use packing routine in this way, Pack(count,idx+offset[0],packopt,...) */
1601   if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];}
1602 
1603   opt->n = n;
1604 
1605   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with multiple memcpy's ) */
1606   for (i=0; i<n; i++) { /* for each target processor */
1607     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
1608     n_copies = 1;
1609     for (j=offset[i]; j<offset[i+1]-1; j++) {
1610       if (idx[j]+1 != idx[j+1]) n_copies++;
1611     }
1612     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
1613        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
1614      */
1615     if ((offset[i+1]-offset[i])/n_copies >= 32) {
1616       opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
1617       optimized    = PETSC_TRUE;
1618       tot_copies  += n_copies;
1619     }
1620   }
1621 
1622   /* Setup memcpy plan for each contiguous piece */
1623   k    = 0; /* k-th copy */
1624   ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
1625   for (i=0; i<n; i++) { /* for each target processor */
1626     if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
1627       n_copies           = 1;
1628       opt->copy_start[k] = offset[i] - offset[0];
1629       for (j=offset[i]; j<offset[i+1]-1; j++) {
1630         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
1631           n_copies++;
1632           opt->copy_start[k+1] = j-offset[0]+1;
1633           opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
1634           k++;
1635         }
1636       }
1637       /* Set copy length of the last copy for this target */
1638       opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
1639       k++;
1640     }
1641     /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
1642     opt->copy_offset[i+1] = k;
1643   }
1644 
1645   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
1646   for (i=0; i<n; i++) { /* for each remote */
1647     if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
1648       strided = PETSC_TRUE;
1649       step    = idx[offset[i]+1] - idx[offset[i]];
1650       for (j=offset[i]; j<offset[i+1]-1; j++) {
1651         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
1652       }
1653       if (strided) {
1654         opt->type[i]         = PETSCSF_PACKOPT_STRIDE;
1655         opt->stride_step[i]  = step;
1656         opt->stride_n[i]     = offset[i+1] - offset[i];
1657         optimized            = PETSC_TRUE;
1658       }
1659     }
1660   }
1661   /* If no rank gets optimized, free arrays to save memory */
1662   if (!optimized) {
1663     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
1664     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
1665     ierr = PetscFree(opt);CHKERRQ(ierr);
1666     *out = NULL;
1667   } else *out = opt;
1668   PetscFunctionReturn(0);
1669 }
1670 
1671 PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out)
1672 {
1673   PetscErrorCode ierr;
1674   PetscSFPackOpt opt = *out;
1675 
1676   PetscFunctionBegin;
1677   if (opt) {
1678     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
1679     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
1680     ierr = PetscFree(opt);CHKERRQ(ierr);
1681     *out = NULL;
1682   }
1683   PetscFunctionReturn(0);
1684 }
1685 
/* Analyze the SF's leaf and root index patterns and set up fields used by pack/unpack routines:
   buffer lengths, contiguity flags, start indices, per-rank pack optimizations and (with CUDA)
   duplicate-index flags. Index [0] is for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE throughout. */
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  /* Leaf buffer lengths: ranks [0,ndranks) are the distinguished (local) ranks, the rest are remote */
  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  /* First leaf index of each scope (only meaningful when the buffer is non-empty) */
  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous? */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  /* Same contiguity analysis as above, on the root side */
  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_CUDA)
    /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
  if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
#endif

  PetscFunctionReturn(0);
}
1748 
1749 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1750 {
1751   PetscErrorCode ierr;
1752   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1753   PetscInt       i;
1754 
1755   PetscFunctionBegin;
1756   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1757     ierr = PetscSFDestroyPackOpt(&sf->leafpackopt[i]);CHKERRQ(ierr);
1758     ierr = PetscSFDestroyPackOpt(&bas->rootpackopt[i]);CHKERRQ(ierr);
1759   }
1760   PetscFunctionReturn(0);
1761 }
1762