xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 7fd2d3dbf8105d3f2e002a5ca11f019cd0ad7420)
1 #include <../src/vec/is/sf/impls/basic/sfpack.h>
2 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
3 
4 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
5 
6 #if defined(PETSC_HAVE_CUDA)
7 #include <cuda_runtime.h>
8 #include <petsccublas.h>
9 #endif
10 /*
11  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
12  * therefore we pack data types manually. This file defines packing routines for the standard data types.
13  */
14 
15 #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
16 
17 /* Operations working like s += t */
18 #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
19 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
20 #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
21 #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
22 /* Ref MPI MAXLOC */
23 #define OP_XLOC(op,s,t) \
24   do {                                       \
25     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
26     else if (!((s).u op (t).u)) s = t;           \
27   } while (0)
28 
29 /* DEF_PackFunc - macro defining a Pack routine
30 
31    Arguments of the macro:
32    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
33    .BS        Block size for vectorization. It is a factor of bsz.
34    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
35 
36    Arguments of the Pack routine:
37    +count     Number of indices in idx[].
38    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
39    .opt       Per-pack optimization plan. NULL means no such plan.
40 .idx       Indices of entries to be packed.
41    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
42 .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
43    -packed    Address of the packed data.
44  */
45 #define DEF_PackFunc(Type,BS,EQ) \
46   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
47   {                                                                                                          \
48     PetscErrorCode ierr;                                                                                     \
49     const Type     *u = (const Type*)unpacked,*u2;                                                           \
50     Type           *p = (Type*)packed,*p2;                                                                   \
51     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
52     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
53     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
54     PetscFunctionBegin;                                                                                      \
55     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
56     else if (opt) { /* has optimizations available */                                                        \
57       p2 = p;                                                                                                \
58       for (r=0; r<opt->n; r++) {                                                                             \
59         u2 = u + opt->start[r]*MBS;                                                                          \
60         X  = opt->X[r];                                                                                      \
61         Y  = opt->Y[r];                                                                                      \
62         for (k=0; k<opt->dz[r]; k++)                                                                         \
63           for (j=0; j<opt->dy[r]; j++) {                                                                     \
64             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
65             p2  += opt->dx[r]*MBS;                                                                           \
66           }                                                                                                  \
67       }                                                                                                      \
68     } else {                                                                                                 \
69       for (i=0; i<count; i++)                                                                                \
70         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
71           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
72             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
73     }                                                                                                        \
74     PetscFunctionReturn(0);                                                                                  \
75   }
76 
77 /* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
78                 and inserts into a sparse array.
79 
80    Arguments:
81   .Type       Type of the data
82   .BS         Block size for vectorization
83   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
84 
85   Notes:
86    This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
87  */
88 #define DEF_UnpackFunc(Type,BS,EQ)               \
89   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
90   {                                                                                                          \
91     PetscErrorCode ierr;                                                                                     \
92     Type           *u = (Type*)unpacked,*u2;                                                                 \
93     const Type     *p = (const Type*)packed;                                                                 \
94     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
95     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
96     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
97     PetscFunctionBegin;                                                                                      \
98     if (!idx) {                                                                                              \
99       u += start*MBS;                                                                                        \
100       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
101     } else if (opt) { /* has optimizations available */                                                      \
102       for (r=0; r<opt->n; r++) {                                                                             \
103         u2 = u + opt->start[r]*MBS;                                                                          \
104         X  = opt->X[r];                                                                                      \
105         Y  = opt->Y[r];                                                                                      \
106         for (k=0; k<opt->dz[r]; k++)                                                                         \
107           for (j=0; j<opt->dy[r]; j++) {                                                                     \
108             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
109             p   += opt->dx[r]*MBS;                                                                           \
110           }                                                                                                  \
111       }                                                                                                      \
112     } else {                                                                                                 \
113       for (i=0; i<count; i++)                                                                                \
114         for (j=0; j<M; j++)                                                                                  \
115           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
116     }                                                                                                        \
117     PetscFunctionReturn(0);                                                                                  \
118   }
119 
120 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
121 
122    Arguments:
123   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
124   .Type       Type of the data
125   .BS         Block size for vectorization
126   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
127   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
128   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
129  */
130 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
131   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
132   {                                                                                                          \
133     Type           *u = (Type*)unpacked,*u2;                                                                 \
134     const Type     *p = (const Type*)packed;                                                                 \
135     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
136     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
137     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
138     PetscFunctionBegin;                                                                                      \
139     if (!idx) {                                                                                              \
140       u += start*MBS;                                                                                        \
141       for (i=0; i<count; i++)                                                                                \
142         for (j=0; j<M; j++)                                                                                  \
143           for (k=0; k<BS; k++)                                                                               \
144             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
145     } else if (opt) { /* idx[] has patterns */                                                               \
146       for (r=0; r<opt->n; r++) {                                                                             \
147         u2 = u + opt->start[r]*MBS;                                                                          \
148         X  = opt->X[r];                                                                                      \
149         Y  = opt->Y[r];                                                                                      \
150         for (k=0; k<opt->dz[r]; k++)                                                                         \
151           for (j=0; j<opt->dy[r]; j++) {                                                                     \
152             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
153             p += opt->dx[r]*MBS;                                                                             \
154           }                                                                                                  \
155       }                                                                                                      \
156     } else {                                                                                                 \
157       for (i=0; i<count; i++)                                                                                \
158         for (j=0; j<M; j++)                                                                                  \
159           for (k=0; k<BS; k++)                                                                               \
160             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
161     }                                                                                                        \
162     PetscFunctionReturn(0);                                                                                  \
163   }
164 
165 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
166   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
167   {                                                                                                          \
168     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
169     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
170     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
171     const PetscInt MBS = M*BS;                                                                               \
172     PetscFunctionBegin;                                                                                      \
173     for (i=0; i<count; i++) {                                                                                \
174       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
175       l = i*MBS;                                                                                             \
176       for (j=0; j<M; j++)                                                                                    \
177         for (k=0; k<BS; k++) {                                                                               \
178           tmp = u[r+j*BS+k];                                                                                 \
179           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
180           p[l+j*BS+k] = tmp;                                                                                 \
181         }                                                                                                    \
182     }                                                                                                        \
183     PetscFunctionReturn(0);                                                                                  \
184   }
185 
186 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
187   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
188   {                                                                                                          \
189     PetscErrorCode ierr;                                                                                     \
190     const Type     *u = (const Type*)src;                                                                    \
191     Type           *v = (Type*)dst;                                                                          \
192     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
193     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
194     const PetscInt MBS = M*BS;                                                                               \
195     PetscFunctionBegin;                                                                                      \
196     if (!srcIdx) { /* src is contiguous */                                                                   \
197       u += srcStart*MBS;                                                                                     \
198       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
199     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
200       u += srcOpt->start[0]*MBS;                                                                             \
201       v += dstStart*MBS;                                                                                     \
202       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
203       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
204         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
205           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
206           v += srcOpt->dx[0]*MBS;                                                                            \
207         }                                                                                                    \
208     } else { /* all other cases */                                                                           \
209       for (i=0; i<count; i++) {                                                                              \
210         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
211         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
212         for (j=0; j<M; j++)                                                                                  \
213           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
214       }                                                                                                      \
215     }                                                                                                        \
216     PetscFunctionReturn(0);                                                                                  \
217   }
218 
219 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
220   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
221   {                                                                                                          \
222     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
223     const Type     *ldata = (const Type*)leafdata;                                                           \
224     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
225     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
226     const PetscInt MBS = M*BS;                                                                               \
227     PetscFunctionBegin;                                                                                      \
228     for (i=0; i<count; i++) {                                                                                \
229       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
230       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
231       for (j=0; j<M; j++)                                                                                    \
232         for (k=0; k<BS; k++) {                                                                               \
233           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
234           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
235         }                                                                                                    \
236     }                                                                                                        \
237     PetscFunctionReturn(0);                                                                                  \
238   }
239 
240 /* Pack, Unpack/Fetch ops */
241 #define DEF_Pack(Type,BS,EQ)                                                      \
242   DEF_PackFunc(Type,BS,EQ)                                                        \
243   DEF_UnpackFunc(Type,BS,EQ)                                                      \
244   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
245   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
246     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
247     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
248     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
249   }
250 
251 /* Add, Mult ops */
252 #define DEF_Add(Type,BS,EQ)                                                       \
253   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
254   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
255   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
256   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
257   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
258   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
259   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
260     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
261     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
262     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
263     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
264     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
265     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
266   }
267 
268 /* Max, Min ops */
269 #define DEF_Cmp(Type,BS,EQ)                                                       \
270   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
271   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
272   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
273   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
274   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
275     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
276     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
277     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
278     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
279   }
280 
281 /* Logical ops.
282   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
283   the compilation warning "empty macro arguments are undefined in ISO C90"
284  */
285 #define DEF_Log(Type,BS,EQ)                                                       \
286   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
287   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
288   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
289   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
290   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
291   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
292   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
293     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
294     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
295     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
296     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
297     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
298     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
299   }
300 
301 /* Bitwise ops */
302 #define DEF_Bit(Type,BS,EQ)                                                       \
303   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
304   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
305   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
306   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
307   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
308   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
309   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
310     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
311     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
312     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
313     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
314     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
315     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
316   }
317 
318 /* Maxloc, Minloc ops */
319 #define DEF_Xloc(Type,BS,EQ)                                                      \
320   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
321   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
322   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
323   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
324   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
325     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
326     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
327     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
328     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
329   }
330 
331 #define DEF_IntegerType(Type,BS,EQ)                                               \
332   DEF_Pack(Type,BS,EQ)                                                            \
333   DEF_Add(Type,BS,EQ)                                                             \
334   DEF_Cmp(Type,BS,EQ)                                                             \
335   DEF_Log(Type,BS,EQ)                                                             \
336   DEF_Bit(Type,BS,EQ)                                                             \
337   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
338     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
339     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
340     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
341     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
342     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
343   }
344 
345 #define DEF_RealType(Type,BS,EQ)                                                  \
346   DEF_Pack(Type,BS,EQ)                                                            \
347   DEF_Add(Type,BS,EQ)                                                             \
348   DEF_Cmp(Type,BS,EQ)                                                             \
349   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
350     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
351     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
352     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
353   }
354 
355 #if defined(PETSC_HAVE_COMPLEX)
356 #define DEF_ComplexType(Type,BS,EQ)                                               \
357   DEF_Pack(Type,BS,EQ)                                                            \
358   DEF_Add(Type,BS,EQ)                                                             \
359   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
360     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
361     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
362   }
363 #endif
364 
365 #define DEF_DumbType(Type,BS,EQ)                                                  \
366   DEF_Pack(Type,BS,EQ)                                                            \
367   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
368     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
369   }
370 
/* DEF_PairType - macro defining a PackInit routine for (value,index) pair types.

   Used for MPI_MAXLOC/MPI_MINLOC reductions: generates Pack plus the Xloc
   (Maxloc/Minloc) operations defined via OP_XLOC above. */
#define DEF_PairType(Type,BS,EQ)                                                  \
  DEF_Pack(Type,BS,EQ)                                                            \
  DEF_Xloc(Type,BS,EQ)                                                            \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
  }
379 
/* ---- Instantiate the pack/unpack routines for concrete types ----
   For each type we instantiate block sizes BS = 1,2,4,8 twice: EQ=1 (unit is
   exactly BS basic elements) and EQ=0 (unit is a multiple n*BS, n>1). The
   dispatcher in PetscSFLinkSetUp_Host tries bigger BS first. */
DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

/* Pair types for MPI_MAXLOC/MPI_MINLOC, matching MPI_2INT and MPIU_2INT layouts */
#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u; int i;}           PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)
464 
465 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
466 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
467 {
468   int ierr;
469   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
470   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
471   return MPI_SUCCESS;
472 }
473 #endif
474 
475 /*
476    The routine Creates a communication link for the given operation. It first looks up its link cache. If
477    there is a free & suitable one, it uses it. Otherwise it creates a new one.
478 
479    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
480    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
481    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
482    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
483 
484    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
485 
486    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
487 
488    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
489    need pack/unpack data.
490 */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode    ierr;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink       *p,link;
  PetscSFDirection  direction;
  MPI_Request       *reqs = NULL;
  PetscBool         match,rootdirect[2],leafdirect[2];
  PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdata as the pack buffers for the given sf, sfop and op?
     "direct" means no separate buffer is needed: for packing it requires contiguous data;
     for unpacking it additionally requires op == MPIU_REPLACE (plain overwrite). */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  /* Without gpu-aware MPI, MPI only ever sees host memory */
  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache; reuse the first one whose MPI unit type matches */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
         If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
      */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  /* No reusable link: build a new one */
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* 8 = 2 directions x 2 memory types x 2 (direct or not); see the triple loop below */
  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:
#if defined(PETSC_HAVE_DEVICE)
  if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}
#endif

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscSFMalloc(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscSFMalloc(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate host buffers to stage device data in case one does not use gpu-aware MPI */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi  = rootdirect_mpi;
  link->leafdirect_mpi  = leafdirect_mpi;
  link->rootmtype       = rootmtype;
  link->leafmtype       = leafmtype;
  link->rootmtype_mpi   = rootmtype_mpi;
  link->leafmtype_mpi   = leafmtype_mpi;

  /* Push onto the in-use list; PetscSFLinkGetInUse finds it there by (unit,rootdata,leafdata) */
  link->next            = bas->inuse;
  bas->inuse            = link;
  *mylink               = link;
  PetscFunctionReturn(0);
}
645 
646 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
647    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
648 */
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode       ierr;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt       *rootoffset,*leafoffset;
  PetscMPIInt          n;
  MPI_Aint             disp;
  MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype         unit = link->unit;
  const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    /* Loops below start at i=nd*ranks, skipping the first nd* entries of the rank
       arrays (presumably the distinguished/local ranks that need no MPI — TODO confirm);
       disp is the byte offset of rank i's chunk within the remote buffer. */
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    /* Same as above, but for the leaf side (sends and receives are mirrored) */
    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  /* Hand out whichever of the four outputs the caller asked for (each may be NULL) */
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}
707 
708 
709 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
710 {
711   PetscErrorCode    ierr;
712   PetscSFLink       link,*p;
713   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
714 
715   PetscFunctionBegin;
716   /* Look for types in cache */
717   for (p=&bas->inuse; (link=*p); p=&link->next) {
718     PetscBool match;
719     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
720     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
721       switch (cmode) {
722       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
723       case PETSC_USE_POINTER: break;
724       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
725       }
726       *mylink = link;
727       PetscFunctionReturn(0);
728     }
729   }
730   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
731   PetscFunctionReturn(0);
732 }
733 
734 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
735 {
736   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
737 
738   PetscFunctionBegin;
739   (*link)->rootdata = NULL;
740   (*link)->leafdata = NULL;
741   (*link)->next     = bas->avail;
742   bas->avail        = *link;
743   *link             = NULL;
744   PetscFunctionReturn(0);
745 }
746 
747 /* Destroy all links chained in 'avail' */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode    ierr;
  PetscSFLink       link = *avail,next;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* Mirrors the allocation size in PetscSFLinkCreate */

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    /* Duplicated (non-builtin) unit types were created by MPI_Type_dup in SetUp_Host */
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    /* Free the lazily-allocated pack buffers for both local and remote scopes */
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
      ierr = PetscSFFree(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscSFFree(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
#endif
    }
#if defined(PETSC_HAVE_DEVICE)
    /* Device-specific teardown: per-link destroy hook, then the stream if one was created */
    if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
  #if defined(PETSC_HAVE_CUDA)
    if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
  #elif defined(PETSC_HAVE_HIP)
    if (link->stream) {hipError_t  cerr = hipStreamDestroy(link->stream);CHKERRQ(cerr); link->stream = NULL;} /* TODO: CHKERRHIP? */
  #endif
#endif
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}
784 
785 /* Error out on unsupported overlapped communications */
786 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
787 {
788   PetscErrorCode    ierr;
789   PetscSFLink       link,*p;
790   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
791   PetscBool         match;
792 
793   PetscFunctionBegin;
794   if (!PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
795   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
796      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
797   */
798   if (rootdata || leafdata) {
799     for (p=&bas->inuse; (link=*p); p=&link->next) {
800       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
801       if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
802     }
803   }
804   PetscFunctionReturn(0);
805 }
806 
/* Inspect the MPI datatype `unit` and install the matching host pack/unpack routines
   on the link. Sets link->bs (basic elements per unit), link->unitbytes, link->basicunit
   (the underlying builtin MPI type), link->isbuiltin, and link->unit (a dup of `unit`
   when it is not builtin). The dispatch tries exact block sizes 8,4,2,1 first (EQ=1),
   then multiples of them (EQ=0). */
PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  /* Each nXxx is the count of basic elements if unit is contiguous in that type, else 0 */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) { /* Only needed when int differs from PetscInt; see DEF_IntegerType(int,...) above */
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  }  else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    /* Unknown basic type: fall back to dumb (copy-only) routines, chunked by int size if possible */
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  /* Non-builtin units are duplicated so the link owns its own handle (freed in PetscSFLinkDestroy) */
  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
933 
/* Look up in the link the UnpackAndOp routine matching the memory type, the MPI_Op and the atomicity requirement.

   On return, *UnpackAndOp is the routine that unpacks a contiguous buffer into root/leaf data while applying <op>,
   or NULL when the link provides no match for <op>; callers such as PetscSFLinkUnpackRootData/LeafData then fall
   back to MPI_Reduce_local. <atomic> is only consulted for device memory; call sites pass rootdups/leafdups for it,
   i.e. atomic kernels are requested when duplicate indices may cause concurrent updates of the same entry.
*/
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
{
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) { /* host routines; <atomic> is not consulted for host memory */
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { /* non-atomic device routines */
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { /* atomic device routines */
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}
986 
/* Look up in the link the ScatterAndOp routine matching the memory type, the MPI_Op and the atomicity requirement.

   A ScatterAndOp routine combines entries of a source array directly into entries of a destination array (both
   sides indexed), applying <op>; PetscSFLinkBcastAndOpLocal/ReduceLocal use it for local communication so no
   intermediate packed buffer is needed. *ScatterAndOp is NULL when no match exists, in which case callers fall
   back to PetscSFLinkScatterDataWithMPIReduceLocal. <atomic> is only consulted for device memory.
*/
PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
{
  PetscFunctionBegin;
  *ScatterAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) { /* host routines; <atomic> is not consulted for host memory */
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { /* non-atomic device routines */
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { /* atomic device routines */
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}
1039 
1040 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1041 {
1042   PetscFunctionBegin;
1043   *FetchAndOp = NULL;
1044   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1045   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1046 #if defined(PETSC_HAVE_DEVICE)
1047   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1048   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1049 #endif
1050   PetscFunctionReturn(0);
1051 }
1052 
1053 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1054 {
1055   PetscFunctionBegin;
1056   *FetchAndOpLocal = NULL;
1057   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1058   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1059 #if defined(PETSC_HAVE_DEVICE)
1060   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1061   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1062 #endif
1063   PetscFunctionReturn(0);
1064 }
1065 
1066 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1067 {
1068   PetscErrorCode ierr;
1069   PetscLogDouble flops;
1070   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1071 
1072 
1073   PetscFunctionBegin;
1074   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1075     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1076 #if defined(PETSC_HAVE_DEVICE)
1077     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1078 #endif
1079     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1080   }
1081   PetscFunctionReturn(0);
1082 }
1083 
/* Log the floating point operations implied by unpacking leaf data with a reduction <op>.
   Counterpart of PetscSFLinkLogFlopsAfterUnpackRootData: one flop per scalar in the leaf buffer of the scope,
   attributed to the GPU when leaves reside in device memory. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
1099 
1100 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1101   Input Arguments:
1102   +sf      - The StarForest
1103   .link    - The link
1104   .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
1106   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1107   .buf     - A contiguous buffer to unpack from
1108   -op      - Operation after unpack
1109 
1110   Output Arguments:
1111   .data    - The data to unpack to
1112 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
    } else {
      /* Contiguous destination starting at <start>: reduce all <count> units in a single call */
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
    }
  }
#else
  /* Without MPI_Reduce_local there is no fallback for ops not covered by the link's UnpackAndOp routines */
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}
1136 
/* Like PetscSFLinkUnpackDataWithMPIReduceLocal, but both sides are indexed: entry i is read from src at
   srcIdx[i] (or srcStart+i when srcIdx is NULL) and reduced with <op> into dst at dstIdx[i] (or dstStart+i
   when dstIdx is NULL). Used when no ScatterAndOp routine matches <op>. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    if (!srcIdx) {
      /* Contiguous source from srcStart: delegate to the unpack flavor with src offset folded into buf */
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
    } else {
      for (i=0; i<count; i++) {
        disp = dstIdx? dstIdx[i] : dstStart + i;
        ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
      }
    }
  }
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
  PetscFunctionReturn(0);
}
1158 
1159 /*=============================================================================
1160               Pack/Unpack/Fetch/Scatter routines
1161  ============================================================================*/
1162 
1163 /* Pack rootdata to rootbuf
1164   Input Arguments:
1165   + sf       - The SF this packing works on.
1166   . link     - It gives the memtype of the roots and also provides root buffer.
1167   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1168   - rootdata - Where to read the roots.
1169 
1170   Notes:
1171   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
  in a place where the underlying MPI can access it (whether use_gpu_aware_mpi is set or not)
1173  */
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} /* NOTE(review): presumably ensures pending device writes to rootdata are complete — confirm */
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* Stage the root buffer on the host when MPI cannot access device memory, then sync the stream so MPI may use it */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1199 
1200 /* Pack leafdata to leafbuf */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} /* NOTE(review): presumably ensures pending device writes to leafdata are complete — confirm */
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* Stage the leaf buffer on the host when MPI cannot access device memory, then sync the stream so MPI may use it */
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1225 
1226 /* Unpack rootbuf to rootdata */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} /* PETSC_FALSE: presumably host->device, opposite of the pack direction — confirm */
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else {
      /* No matching routine for this op (e.g., a user-defined MPI_Op): use host indices and MPI_Reduce_local */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1255 
1256 /* Unpack leafbuf to leafdata */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} /* PETSC_FALSE: presumably host->device, opposite of the pack direction — confirm */
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else {
      /* No matching routine for this op (e.g., a user-defined MPI_Op): use host indices and MPI_Reduce_local */
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1284 
1285 /* FetchAndOp rootdata with rootbuf */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode     ierr;
  const PetscInt     *rootindices = NULL;
  PetscInt           count,start;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
  PetscMemType       rootmtype = link->rootmtype;
  PetscSFPackOpt     opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} /* PETSC_FALSE: presumably host->device, opposite of the pack direction — confirm */
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* Unlike unpack, the buffer is copied back (device2host) afterwards: the fetched values must be sent back to leaves */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1313 
1314 /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* no local (self) edges */
  if (rootmtype != leafmtype) { /* Uncommon case */
     /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) { /* Scatter roots directly into leaves, no intermediate buffer */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
    } else { /* No matching routine for this op: fall back to MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1346 
1347 /* Reduce leafdata to rootdata locally */
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* no local (self) edges */
  if (rootmtype != leafmtype) { /* Uncommon case: roots and leaves on opposite sides of CPU/GPU */
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) { /* Scatter leaves directly into roots; note src/dst argument order is leaf,root here */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
    } else { /* No matching routine for this op: fall back to MPI_Reduce_local with host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1379 
1380 /* Fetch rootdata to leafdata and leafupdate locally */
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* no local (self) edges */
  if (rootmtype != leafmtype) {
   /* The local communication has to go through pack and unpack */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1404 
1405 /*
  Create per-rank pack/unpack optimizations based on index patterns
1407 
1408    Input Parameters:
1409   +  n       - Number of destination ranks
1410   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1411   -  idx     - [*]   Array storing indices
1412 
1413    Output Parameters:
  +  out     - Pack optimizations. NULL if no optimizations.
1415 */
/* Try to recognize each rank's index list as a dx x dy x dz sub-block of a 3D domain with x-size X and y-size Y,
   i.e. idx[...] = start + X*Y*k + X*j + i for 0<=i<dx, 0<=j<dy, 0<=k<dz. When every rank matches such a pattern,
   the parameters are recorded in *out so pack/unpack can use direct strided addressing instead of the index
   array; otherwise *out is set to NULL. */
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  /* One allocation holds all per-rank arrays: [n | offset(n+1) | start(n) | dx(n) | dy(n) | dz(n) | X(n) | Y(n)] = 7n+2 entries */
  ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n   + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  for (r=0; r<n; r++) { /* For each destination rank */
    m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break; /* dx = length of the initial consecutive run */
    }

    dydz = m/dx;
    /* X is the x-size of the embedding domain, inferred from the jump to the second row (idx[p]) */
    X    = dydz > 1 ? (idx[p]-start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension; /* A clean break at a row boundary: dy rows found, move on to the z search */
        }
      }
    }

Z_dimension:
    dz = m/(dx*dy);
    /* Y is the y-size of the embedding domain, inferred from the jump to the second xy-plane (idx[p]) */
    Y  = dz > 1 ? (idx[p]-start)/X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}
1492 
1493 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
1494 {
1495   PetscErrorCode ierr;
1496   PetscSFPackOpt opt = *out;
1497 
1498   PetscFunctionBegin;
1499   if (opt) {
1500     ierr = PetscSFFree(mtype,opt->array);CHKERRQ(ierr);
1501     ierr = PetscFree(opt);CHKERRQ(ierr);
1502     *out = NULL;
1503   }
1504   PetscFunctionReturn(0);
1505 }
1506 
/* Set up fields used for packing/unpacking leaf and root data.

   For both leaves (sf->rmine indexed via sf->roffset) and roots (bas->irootloc indexed
   via bas->ioffset), this routine determines, separately for the local ([0]) and remote
   ([1]) portions:
     - buflen:  number of entries in that portion;
     - start:   the first index of that portion;
     - contig:  whether the indices of that portion are contiguous (best case for pack/unpack);
     - packopt: if not contiguous, a per-rank 3D-pattern optimization plan (may come back NULL);
     - dups:    (device builds only) whether the portion's indices contain duplicates,
                which forces atomic instructions in device unpack kernels.
*/
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  /* roffset[ndranks] splits rmine[] into a self/local part [0,ndranks) and a remote part
     [ndranks,nranks) -- presumably the first ndranks ranks are the distinguished on-node ranks */
  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  /* First leaf index of each portion, only meaningful when the portion is nonempty */
  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  /* Note both calls pass the base sf->rmine since the offset arrays hold absolute offsets into it */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous? */
  /* Same analysis as above, with ioffset/ndiranks/niranks/irootloc playing the roles of
     roffset/ndranks/nranks/rmine */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_DEVICE)
    /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  /* Contiguous indices trivially have no dups, hence the checks are only done otherwise */
  if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
  if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
#endif

  PetscFunctionReturn(0);
}
1569 
1570 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1571 {
1572   PetscErrorCode ierr;
1573   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1574   PetscInt       i;
1575 
1576   PetscFunctionBegin;
1577   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1578     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1579     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1580 #if defined(PETSC_HAVE_DEVICE)
1581     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1582     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1583 #endif
1584   }
1585   PetscFunctionReturn(0);
1586 }
1587