xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 5c23ca1c1eeefe18642a598910eb468c11d8e237)
1 #include "petsc/private/sfimpl.h"
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
6 
7 #if defined(PETSC_HAVE_CUDA)
8 #include <cuda_runtime.h>
9 #include <petsccublas.h>
10 #endif
11 #if defined(PETSC_HAVE_HIP)
12 #include <hip/hip_runtime.h>
13 #include <petschipblas.h>
14 #endif
15 /*
16  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
17  * therefore we pack data types manually. This file defines packing routines for the standard data types.
18  */
19 
/* CPPJoin4(a,b,c,d) expands to a_b_c_d; used to synthesize unique kernel names such as Pack_PetscInt_4_1 */
20 #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
21 
22 /* Operations working like s += t */
23 #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
24 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
25 #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR; op is ignored */
26 #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)               /* plain assignment; op is ignored */
27 /* Ref MPI MAXLOC/MINLOC: s,t are (value u, index i) pairs and op is > (maxloc) or < (minloc).
   Equal values keep the smaller index; otherwise the pair whose value wins under op is kept. */
28 #define OP_XLOC(op,s,t) \
29   do {                                       \
30     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
31     else if (!((s).u op (t).u)) s = t;           \
32   } while (0)
33 
34 /* DEF_PackFunc - macro defining a Pack routine
35 
36    Arguments of the macro:
37    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
38    .BS        Block size for vectorization. It is a factor of bsz.
39    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
40 
41    Arguments of the Pack routine:
42    +count     Number of indices in idx[].
43    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
44    .opt       Per-pack optimization plan. NULL means no such plan.
45    .idx       Indices of entries to be packed.
46    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
47    .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
48    -packed    Address of the packed data.

   Notes: when opt is provided, the entries form opt->n blocks; block r is a dx[r] x dy[r] x dz[r]
   sub-array of a 3D array with leading dimensions X[r], Y[r], starting at entry opt->start[r],
   so each contiguous row of dx[r] entries can be copied with one memcpy.
49  */
50 #define DEF_PackFunc(Type,BS,EQ) \
51   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
52   {                                                                                                          \
53     PetscErrorCode ierr;                                                                                     \
54     const Type     *u = (const Type*)unpacked,*u2;                                                           \
55     Type           *p = (Type*)packed,*p2;                                                                   \
56     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
57     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
58     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
59     PetscFunctionBegin;                                                                                      \
60     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
61     else if (opt) { /* has optimizations available */                                                        \
62       p2 = p;                                                                                                \
63       for (r=0; r<opt->n; r++) {                                                                             \
64         u2 = u + opt->start[r]*MBS;                                                                          \
65         X  = opt->X[r];                                                                                      \
66         Y  = opt->Y[r];                                                                                      \
67         for (k=0; k<opt->dz[r]; k++)                                                                         \
68           for (j=0; j<opt->dy[r]; j++) {                                                                     \
69             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
70             p2  += opt->dx[r]*MBS;                                                                           \
71           }                                                                                                  \
72       }                                                                                                      \
73     } else {                                                                                                 \
74       for (i=0; i<count; i++)                                                                                \
75         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
76           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
77             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
78     }                                                                                                        \
79     PetscFunctionReturn(0);                                                                                  \
80   }
81 
82 /* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
83                 and inserts into a sparse array.
84 
85    Arguments:
86   .Type       Type of the data
87   .BS         Block size for vectorization
88   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
89 
90   Notes:
91    This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
92  */
93 #define DEF_UnpackFunc(Type,BS,EQ)               \
94   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
95   {                                                                                                          \
96     PetscErrorCode ierr;                                                                                     \
97     Type           *u = (Type*)unpacked,*u2;                                                                 \
98     const Type     *p = (const Type*)packed;                                                                 \
99     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
100     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
101     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
102     PetscFunctionBegin;                                                                                      \
103     if (!idx) {                                                                                              \
104       u += start*MBS;                                                                                        \
105       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
106     } else if (opt) { /* has optimizations available */                                                      \
107       for (r=0; r<opt->n; r++) {                                                                             \
108         u2 = u + opt->start[r]*MBS;                                                                          \
109         X  = opt->X[r];                                                                                      \
110         Y  = opt->Y[r];                                                                                      \
111         for (k=0; k<opt->dz[r]; k++)                                                                         \
112           for (j=0; j<opt->dy[r]; j++) {                                                                     \
113             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
114             p   += opt->dx[r]*MBS;                                                                           \
115           }                                                                                                  \
116       }                                                                                                      \
117     } else {                                                                                                 \
118       for (i=0; i<count; i++)                                                                                \
119         for (j=0; j<M; j++)                                                                                  \
120           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
121     }                                                                                                        \
122     PetscFunctionReturn(0);                                                                                  \
123   }
124 
125 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
126 
127    Arguments:
128   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
129   .Type       Type of the data
130   .BS         Block size for vectorization
131   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
132   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
133   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR

   The routine arguments mirror those of the Pack routine (see DEF_PackFunc); here the sparse
   unpacked array is updated in place as unpacked = unpacked Op packed.
134  */
135 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
136   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
137   {                                                                                                          \
138     Type           *u = (Type*)unpacked,*u2;                                                                 \
139     const Type     *p = (const Type*)packed;                                                                 \
140     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
141     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
142     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
143     PetscFunctionBegin;                                                                                      \
144     if (!idx) {                                                                                              \
145       u += start*MBS;                                                                                        \
146       for (i=0; i<count; i++)                                                                                \
147         for (j=0; j<M; j++)                                                                                  \
148           for (k=0; k<BS; k++)                                                                               \
149             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
150     } else if (opt) { /* idx[] has patterns */                                                               \
151       for (r=0; r<opt->n; r++) {                                                                             \
152         u2 = u + opt->start[r]*MBS;                                                                          \
153         X  = opt->X[r];                                                                                      \
154         Y  = opt->Y[r];                                                                                      \
155         for (k=0; k<opt->dz[r]; k++)                                                                         \
156           for (j=0; j<opt->dy[r]; j++) {                                                                     \
157             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
158             p += opt->dx[r]*MBS;                                                                             \
159           }                                                                                                  \
160       }                                                                                                      \
161     } else {                                                                                                 \
162       for (i=0; i<count; i++)                                                                                \
163         for (j=0; j<M; j++)                                                                                  \
164           for (k=0; k<BS; k++)                                                                               \
165             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
166     }                                                                                                        \
167     PetscFunctionReturn(0);                                                                                  \
168   }
169 
/* DEF_FetchAndOp - macro defining a FetchAndOp routine: for each entry, the current value of the
   unpacked array is saved, the array is updated as u = u Op p, and the saved old value is returned
   in packed[] (overwriting the input).
   Note: opt is accepted for a uniform signature but no optimized (3D-block) path is used here. */
170 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
171   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
172   {                                                                                                          \
173     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
174     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
175     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
176     const PetscInt MBS = M*BS;                                                                               \
177     PetscFunctionBegin;                                                                                      \
178     for (i=0; i<count; i++) {                                                                                \
179       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
180       l = i*MBS;                                                                                             \
181       for (j=0; j<M; j++)                                                                                    \
182         for (k=0; k<BS; k++) {                                                                               \
183           tmp = u[r+j*BS+k];                                                                                 \
184           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
185           p[l+j*BS+k] = tmp;                                                                                 \
186         }                                                                                                    \
187     }                                                                                                        \
188     PetscFunctionReturn(0);                                                                                  \
189   }
190 
/* DEF_ScatterAndOp - macro defining a ScatterAndOp routine that combines entries of src directly
   into dst (no intermediate pack buffer), computing dst = dst Op src.
   Three cases:
   (1) src contiguous (srcIdx NULL): reuse the matching UnpackAndOp with src playing the role of
       the packed buffer;
   (2) src described by a 3D-block pattern and dst contiguous: strided loop over the first
       (presumably only -- see srcOpt->...[0]) block of srcOpt;
   (3) general: per-entry indirect addressing through srcIdx/dstIdx (or contiguous starts). */
191 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
192   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
193   {                                                                                                          \
194     PetscErrorCode ierr;                                                                                     \
195     const Type     *u = (const Type*)src;                                                                    \
196     Type           *v = (Type*)dst;                                                                          \
197     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
198     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
199     const PetscInt MBS = M*BS;                                                                               \
200     PetscFunctionBegin;                                                                                      \
201     if (!srcIdx) { /* src is contiguous */                                                                   \
202       u += srcStart*MBS;                                                                                     \
203       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
204     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
205       u += srcOpt->start[0]*MBS;                                                                             \
206       v += dstStart*MBS;                                                                                     \
207       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
208       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
209         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
210           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
211           v += srcOpt->dx[0]*MBS;                                                                            \
212         }                                                                                                    \
213     } else { /* all other cases */                                                                           \
214       for (i=0; i<count; i++) {                                                                              \
215         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
216         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
217         for (j=0; j<M; j++)                                                                                  \
218           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
219       }                                                                                                      \
220     }                                                                                                        \
221     PetscFunctionReturn(0);                                                                                  \
222   }
223 
/* DEF_FetchAndOpLocal - macro defining a routine for local (on-process) fetch-and-op: for each
   entry, leafupdate receives the current root value, then the root is updated with
   rootdata = rootdata Op leafdata.
   Note: rootopt/leafopt are accepted for a uniform signature but unused (no optimized path). */
224 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
225   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
226   {                                                                                                          \
227     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
228     const Type     *ldata = (const Type*)leafdata;                                                           \
229     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
230     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
231     const PetscInt MBS = M*BS;                                                                               \
232     PetscFunctionBegin;                                                                                      \
233     for (i=0; i<count; i++) {                                                                                \
234       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
235       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
236       for (j=0; j<M; j++)                                                                                    \
237         for (k=0; k<BS; k++) {                                                                               \
238           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
239           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
240         }                                                                                                    \
241     }                                                                                                        \
242     PetscFunctionReturn(0);                                                                                  \
243   }
244 
/* Generate the Pack, UnpackAndInsert and ScatterAndInsert kernels for Type and a
   PackInit_Pack_* routine that installs them on a PetscSFLink */
#define DEF_Pack(Type,BS,EQ)                                              \
  DEF_PackFunc(Type,BS,EQ)                                                \
  DEF_UnpackFunc(Type,BS,EQ)                                              \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                         \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link)        \
  {                                                                       \
    link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert,Type,BS,EQ);     \
    link->h_UnpackAndInsert  = CPPJoin4(UnpackAndInsert, Type,BS,EQ);     \
    link->h_Pack             = CPPJoin4(Pack,            Type,BS,EQ);     \
  }
255 
/* Generate the arithmetic (Add, Mult) kernels for Type and a PackInit_Add_* installer */
#define DEF_Add(Type,BS,EQ)                                               \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                        \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                        \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                        \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                        \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                        \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                        \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link)         \
  {                                                                       \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);     \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);     \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);     \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);     \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);     \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);     \
  }
272 
/* Generate the comparison (Max, Min) kernels for Type and a PackInit_Compare_* installer */
#define DEF_Cmp(Type,BS,EQ)                                               \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                   \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                   \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                   \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                   \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link)     \
  {                                                                       \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax,Type,BS,EQ);           \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin,Type,BS,EQ);           \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type,BS,EQ);           \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type,BS,EQ);           \
  }
285 
/* Generate the logical (LAND, LOR, LXOR) kernels for Type and a PackInit_Logical_* installer.
   The operator passed alongside OP_LXOR is unused by that macro; it is given as || (rather
   than left empty) only to avoid the compilation warning
   "empty macro arguments are undefined in ISO C90". */
#define DEF_Log(Type,BS,EQ)                                               \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                          \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                          \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                           \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                          \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                          \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                           \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link)     \
  {                                                                       \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ);         \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type,BS,EQ);         \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);         \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type,BS,EQ);         \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);         \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);         \
  }
305 
/* Generate the bitwise (BAND, BOR, BXOR) kernels for Type and a PackInit_Bitwise_* installer */
#define DEF_Bit(Type,BS,EQ)                                               \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                           \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                           \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                           \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                           \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                           \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                           \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link)     \
  {                                                                       \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ);         \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type,BS,EQ);         \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);         \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type,BS,EQ);         \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);         \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);         \
  }
322 
/* Generate the Maxloc/Minloc kernels for a (value,index) pair Type. The operator (> or <) is
   applied by OP_XLOC to the value field; ties keep the smaller index. The generated kernels are
   named UnpackAndMax_* etc. but installed into the *loc slots of the link. */
#define DEF_Xloc(Type,BS,EQ)                                              \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                              \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                              \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                              \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                              \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link)        \
  {                                                                       \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);        \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);        \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);        \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);        \
  }
335 
/* Integer types get the full set: assignment, arithmetic, comparison, logical and bitwise ops */
#define DEF_IntegerType(Type,BS,EQ)                                          \
  DEF_Pack(Type,BS,EQ)                                                       \
  DEF_Add(Type,BS,EQ)                                                        \
  DEF_Cmp(Type,BS,EQ)                                                        \
  DEF_Log(Type,BS,EQ)                                                        \
  DEF_Bit(Type,BS,EQ)                                                        \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link)    \
  {                                                                          \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                             \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                             \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                             \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                 \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                \
  }
349 
/* Real types get assignment, arithmetic and comparison ops (no logical/bitwise) */
#define DEF_RealType(Type,BS,EQ)                                             \
  DEF_Pack(Type,BS,EQ)                                                       \
  DEF_Add(Type,BS,EQ)                                                        \
  DEF_Cmp(Type,BS,EQ)                                                        \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link)       \
  {                                                                          \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                             \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                 \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                \
  }
359 
#if defined(PETSC_HAVE_COMPLEX)
/* Complex types get assignment and arithmetic ops only (no ordering-based comparisons) */
#define DEF_ComplexType(Type,BS,EQ)                                          \
  DEF_Pack(Type,BS,EQ)                                                       \
  DEF_Add(Type,BS,EQ)                                                        \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link)    \
  {                                                                          \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                 \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                \
  }
#endif
369 
/* Types with no reduction support: only Pack/Insert kernels are generated */
#define DEF_DumbType(Type,BS,EQ)                                             \
  DEF_Pack(Type,BS,EQ)                                                       \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link)       \
  {                                                                          \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                \
  }
375 
/* Pair (value,index) types: Pack/Insert plus the Maxloc/Minloc kernels from DEF_Xloc */
#define DEF_PairType(Type,BS,EQ)                                             \
  DEF_Pack(Type,BS,EQ)                                                       \
  DEF_Xloc(Type,BS,EQ)                                                       \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link)       \
  {                                                                          \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                \
  }
384 
/* Instantiate the pack/unpack routines for each supported basic type, with block sizes
   BS = 1,2,4,8 and both EQ cases (EQ=1: unit is exactly BS basic types; EQ=0: a multiple of BS) */
DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without space that CPPJoin can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

/* Pair types (value u, location i) backing MPI_2INT/MPIU_2INT for MAXLOC/MINLOC */
#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u; int i;}           PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)
469 
#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
/* Fallback for MPI implementations lacking MPI_Type_dup: emulate duplication with a committed
   1-element contiguous type. Returns an MPI error code (MPI_SUCCESS on success). */
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif
479 
480 /*
   This routine creates a communication link for the given operation. It first looks up its link cache. If
482    there is a free & suitable one, it uses it. Otherwise it creates a new one.
483 
484    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we allocate buffers, we only allocate
487    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
488 
489    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
490 
   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.
492 
493    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
494    need pack/unpack data.
495 */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode    ierr;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink       *p,link;
  PetscSFDirection  direction;
  MPI_Request       *reqs = NULL;
  PetscBool         match,rootdirect[2],leafdirect[2];
  PetscMemType      rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 so mtypes can be used as array indices */
  PetscMemType      leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdata as the buffers (i.e., without packing) for the given sf, sfop and op?
     Only possible when the data is contiguous, and for unpacking also when op is MPIU_REPLACE. */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  /* Without gpu-aware MPI, MPI can only see host memory */
  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests matches the current data.
         If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
      */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  /* No cached link matched: create a new one and set up its host pack/unpack routines */
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* 8 sets of requests: 2 directions x 2 memory types x 2 (data direct to MPI or not) */
  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:

#if defined(PETSC_HAVE_DEVICE)
  /* Lazily set up the device pack/unpack routines the first time device memory is involved */
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
    #if defined(PETSC_HAVE_CUDA)
      if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_Cuda(sf,link,unit);CHKERRQ(ierr);}
    #endif
    #if defined(PETSC_HAVE_HIP)
      if (sf->backend == PETSCSF_BACKEND_HIP)   {ierr = PetscSFLinkSetUp_Hip(sf,link,unit);CHKERRQ(ierr);}
    #endif
    #if defined(PETSC_HAVE_KOKKOS)
      if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
    #endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on host for buffering data on device in case one is not using gpu-aware MPI */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi  = rootdirect_mpi;
  link->leafdirect_mpi  = leafdirect_mpi;
  link->rootmtype       = rootmtype;
  link->leafmtype       = leafmtype;
  link->rootmtype_mpi   = rootmtype_mpi;
  link->leafmtype_mpi   = leafmtype_mpi;

  /* Prepend the link to the in-use list and hand it back to the caller */
  link->next            = bas->inuse;
  bas->inuse            = link;
  *mylink               = link;
  PetscFunctionReturn(0);
}
663 
/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them here
   (lazy init; see also the cache lookup in PetscSFLinkCreate, which frees stale persistent requests).
   Any of rootbuf/leafbuf/rootreqs/leafreqs may be NULL if the caller does not need it.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode       ierr;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt       *rootoffset,*leafoffset;
  PetscMPIInt          n;
  MPI_Aint             disp;
  MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype         unit = link->unit;
  const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      /* Loop over the non-distinguished (i.e., remote) root ranks; i starts at ndrootranks and
         j indexes the corresponding request. disp is the byte offset of rank i's data in rootbuf. */
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      /* Same structure as above but for leaf ranks; send/recv roles are swapped per direction */
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}
725 
726 
727 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
728 {
729   PetscErrorCode    ierr;
730   PetscSFLink       link,*p;
731   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
732 
733   PetscFunctionBegin;
734   /* Look for types in cache */
735   for (p=&bas->inuse; (link=*p); p=&link->next) {
736     PetscBool match;
737     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
738     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
739       switch (cmode) {
740       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
741       case PETSC_USE_POINTER: break;
742       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
743       }
744       *mylink = link;
745       PetscFunctionReturn(0);
746     }
747   }
748   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
749 }
750 
751 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
752 {
753   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
754 
755   PetscFunctionBegin;
756   (*link)->rootdata = NULL;
757   (*link)->leafdata = NULL;
758   (*link)->next     = bas->avail;
759   bas->avail        = *link;
760   *link             = NULL;
761   PetscFunctionReturn(0);
762 }
763 
/* Destroy all links chained in 'avail': free persistent MPI requests, the duplicated unit type,
   host and device buffers, and any device-specific resources (streams, backend data). */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode    ierr;
  PetscSFLink       link = *avail,next;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* Same layout as in PetscSFLinkCreate */

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    /* Builtin MPI types must not be freed; only duplicated ones */
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      #if defined(PETSC_HAVE_DEVICE)
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      #endif
    }
  #if defined(PETSC_HAVE_DEVICE)
    if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
   /* TODO: select the backend at runtime instead of compile time */
   #if defined(PETSC_HAVE_CUDA)
    if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
   #elif defined(PETSC_HAVE_HIP)
    if (link->stream) {hipError_t  cerr = hipStreamDestroy(link->stream);CHKERRHIP(cerr); link->stream = NULL;}
   #endif
  #endif
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}
802 
803 /* Error out on unsupported overlapped communications */
804 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
805 {
806   PetscErrorCode    ierr;
807   PetscSFLink       link,*p;
808   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
809   PetscBool         match;
810 
811   PetscFunctionBegin;
812   if (PetscDefined(USE_DEBUG)) {
813     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
814        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
815     */
816     if (rootdata || leafdata) {
817       for (p=&bas->inuse; (link=*p); p=&link->next) {
818         ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
819         if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
820       }
821     }
822   }
823   PetscFunctionReturn(0);
824 }
825 
826 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
827 {
828   PetscFunctionBegin;
829   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
830   PetscFunctionReturn(0);
831 }
832 
833 
/* Inspect the MPI datatype 'unit' and install the matching host pack/unpack routines on the link.
   The type is decomposed as n contiguous copies of a known basic type when possible; the biggest
   block size BS in {8,4,2,1} dividing n is chosen, with EQ=1 when n == BS. Unknown types fall back
   to "dumb" byte/int streams that support pack/unpack only. Also sets bs, unitbytes, basicunit,
   isbuiltin and duplicates 'unit' into link->unit when it is not builtin. */
PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  /* Each n<Type> receives the count when unit is n contiguous <Type>s, otherwise 0 */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    /* Exact block sizes (EQ=1) are tried before multiples (EQ=0), biggest BS first */
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) { /* Only needed when int differs from PetscInt; see the matching guard at the instantiations */
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  }  else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    /* Unknown type: treat it as raw bytes, preferring int-sized chunks when the extent allows */
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  /* Duplicate non-builtin types so the link owns its copy; builtin types were assigned above */
  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}

  link->Memcpy = PetscSFLinkMemcpy_Host;
  PetscFunctionReturn(0);
}
962 
963 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
964 {
965   PetscFunctionBegin;
966   *UnpackAndOp = NULL;
967   if (mtype == PETSC_MEMTYPE_HOST) {
968     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
969     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
970     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
971     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
972     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
973     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
974     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
975     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
976     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
977     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
978     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
979     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
980     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
981   }
982 #if defined(PETSC_HAVE_DEVICE)
983   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
984     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
985     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
986     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
987     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
988     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
989     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
990     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
991     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
992     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
993     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
994     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
995     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
996     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
997   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
998     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
999     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
1000     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
1001     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
1002     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
1003     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
1004     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
1005     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
1006     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
1007     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
1008     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
1009     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
1010     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
1011   }
1012 #endif
1013   PetscFunctionReturn(0);
1014 }
1015 
1016 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
1017 {
1018   PetscFunctionBegin;
1019   *ScatterAndOp = NULL;
1020   if (mtype == PETSC_MEMTYPE_HOST) {
1021     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
1022     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
1023     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
1024     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
1025     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
1026     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
1027     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
1028     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
1029     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
1030     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
1031     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
1032     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
1033     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
1034   }
1035 #if defined(PETSC_HAVE_DEVICE)
1036   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
1037     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
1038     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
1039     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
1040     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1041     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1042     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1043     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1044     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1045     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1046     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1047     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1048     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1049     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1050   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1051     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1052     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1053     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1054     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1055     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1056     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1057     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1058     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1059     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1060     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1061     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1062     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1063     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1064   }
1065 #endif
1066   PetscFunctionReturn(0);
1067 }
1068 
1069 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1070 {
1071   PetscFunctionBegin;
1072   *FetchAndOp = NULL;
1073   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1074   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1075 #if defined(PETSC_HAVE_DEVICE)
1076   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1077   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1078 #endif
1079   PetscFunctionReturn(0);
1080 }
1081 
1082 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1083 {
1084   PetscFunctionBegin;
1085   *FetchAndOpLocal = NULL;
1086   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1087   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1088 #if defined(PETSC_HAVE_DEVICE)
1089   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1090   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1091 #endif
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1096 {
1097   PetscErrorCode ierr;
1098   PetscLogDouble flops;
1099   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1100 
1101 
1102   PetscFunctionBegin;
1103   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1104     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1105 #if defined(PETSC_HAVE_DEVICE)
1106     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1107 #endif
1108     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1109   }
1110   PetscFunctionReturn(0);
1111 }
1112 
1113 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1114 {
1115   PetscLogDouble flops;
1116   PetscErrorCode ierr;
1117 
1118   PetscFunctionBegin;
1119   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1120     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1121 #if defined(PETSC_HAVE_DEVICE)
1122     if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1123 #endif
1124     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1125   }
1126   PetscFunctionReturn(0);
1127 }
1128 
1129 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1130   Input Arguments:
1131   +sf      - The StarForest
1132   .link    - The link
1133   .count   - Number of entries to unpack
  .start   - The first index, significant when indices=NULL
1135   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1136   .buf     - A contiguous buffer to unpack from
1137   -op      - Operation after unpack
1138 
1139   Output Arguments:
1140   .data    - The data to unpack to
1141 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
         basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
      */
      /* Sparse destinations: reduce one unit at a time into data[indices[i]] */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
    } else {
      /* Contiguous destination: a single MPI_Reduce_local into data[start..start+count) */
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr);
    }
  }
  PetscFunctionReturn(0);
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
}
1165 
/* Scatter data from src to dst with MPI_Reduce_local, used when no native ScatterAndOp routine fits <op>.
   srcIdx/dstIdx may be NULL, meaning the corresponding side is contiguous starting at srcStart/dstStart. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    if (!srcIdx) {
      /* Contiguous source: equivalent to unpacking src (offset by srcStart) into dst */
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
    } else {
      for (i=0; i<count; i++) {
        disp = dstIdx? dstIdx[i] : dstStart + i; /* destination entry for the i-th source entry */
        ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
}
1187 
1188 /*=============================================================================
1189               Pack/Unpack/Fetch/Scatter routines
1190  ============================================================================*/
1191 
/* Pack rootdata to rootbuf
  Input Arguments:
  + sf       - The SF this packing works on.
  . link     - It gives the memtype of the roots and also provides root buffer.
  . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
  - rootdata - Where to read the roots.

  Notes:
  When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
  in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
 */
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  /* For remote communication, synchronize the device before packing kernels read rootdata */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* If MPI is not GPU-aware, the packed device buffer must be mirrored to the host before sending */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1228 
/* Pack leafdata to leafbuf; the leaf-side mirror of PetscSFLinkPackRootData */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  /* For remote communication, synchronize the device before packing kernels read leafdata */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* If MPI is not GPU-aware, the packed device buffer must be mirrored to the host before sending */
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1254 
/* Unpack rootbuf to rootdata, combining buffer entries into rootdata with <op> */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* If MPI is not GPU-aware, received data sits in a host buffer and must be moved before unpacking */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    /* bas->rootdups[scope] requests atomic kernels (the <atomic> argument of PetscSFLinkGetUnpackAndOp) */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else {
      /* No native routine for this op: fall back to MPI_Reduce_local, which runs on the host (host indices) */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1284 
/* Unpack leafbuf to leafdata, combining buffer entries into leafdata with <op> */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* If MPI is not GPU-aware, received data sits in a host buffer and must be moved before unpacking */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    /* sf->leafdups[scope] requests atomic kernels (the <atomic> argument of PetscSFLinkGetUnpackAndOp) */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else {
      /* No native routine for this op: fall back to MPI_Reduce_local, which runs on the host (host indices) */
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1313 
/* FetchAndOp rootdata with rootbuf */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode     ierr;
  const PetscInt     *rootindices = NULL;
  PetscInt           count,start;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
  PetscMemType       rootmtype = link->rootmtype;
  PetscSFPackOpt     opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* If MPI is not GPU-aware, received data sits in a host buffer and must be moved first */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* rootbuf is sent back after FetchAndOp; stage it for MPI (PETSC_TRUE = device2host) if not GPU-aware */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1342 
/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* Uncommon case: roots and leaves on different memory types */
     /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    /* Same memtype on both sides: scatter rootdata straight into leafdata, no intermediate buffers */
    ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
    } else {
      /* No native routine for this op: fall back to MPI_Reduce_local on the host (host indices) */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1375 
/* Reduce leafdata to rootdata locally; the reverse direction of PetscSFLinkBcastAndOpLocal */
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* Uncommon case: roots and leaves on different memory types */
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    /* Same memtype on both sides: scatter leafdata straight into rootdata, no intermediate buffers */
    ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
    } else {
      /* No native routine for this op: fall back to MPI_Reduce_local on the host (host indices) */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1408 
/* Fetch rootdata to leafdata and leafupdate locally */
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
   /* The local communication has to go through pack and unpack */
    /* Unlike Bcast/Reduce, no pack-copy-unpack workaround exists here, so mixed memtypes are unsupported */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
    /* bas->rootdups requests the atomic variant when several leaves may update the same root */
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1433 
1434 /*
  Create per-rank pack/unpack optimizations based on index patterns
1436 
1437    Input Parameters:
1438   +  n       - Number of destination ranks
  .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1440   -  idx     - [*]   Array storing indices
1441 
1442    Output Parameters:
  +  out     - Pack optimizations. NULL if the indices are not optimizable.
1444 */
/* Try to factor each rank's index list into a 3D block pattern idx = start + X*Y*k + X*j + i
   (0<=i<dx, 0<=j<dy, 0<=k<dz), so packing can use strided copies instead of a gather list. */
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  /* One backing array laid out as [n | offset(n+1) | start(n) | dx(n) | dy(n) | dz(n) | X(n) | Y(n)] = 7n+2 entries */
  ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n   + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  for (r=0; r<n; r++) { /* For each destination rank */
    m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension: count how many leading indices are consecutive */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }

    dydz = m/dx;
    /* X is the stride between consecutive x-rows; if everything fit in one row, use dx */
    X    = dydz > 1 ? (idx[p]-start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension;
        }
      }
    }

Z_dimension:
    dz = m/(dx*dy);
    /* Y is the number of x-rows between consecutive xy-planes; with a single plane, use dy */
    Y  = dz > 1 ? (idx[p]-start)/X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    /* Rebase offsets to start at 0, sized by the factored block volumes */
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}
1521 
1522 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1523 {
1524   PetscErrorCode ierr;
1525   PetscSFPackOpt opt = *out;
1526 
1527   PetscFunctionBegin;
1528   if (opt) {
1529     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
1530     ierr = PetscFree(opt);CHKERRQ(ierr);
1531     *out = NULL;
1532   }
1533   PetscFunctionReturn(0);
1534 }
1535 
1536 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1537 {
1538   PetscErrorCode ierr;
1539   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1540   PetscInt       i,j;
1541 
1542   PetscFunctionBegin;
1543   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1544   for (i=0; i<2; i++) { /* Set defaults */
1545     sf->leafstart[i]   = 0;
1546     sf->leafcontig[i]  = PETSC_TRUE;
1547     sf->leafdups[i]    = PETSC_FALSE;
1548     bas->rootstart[i]  = 0;
1549     bas->rootcontig[i] = PETSC_TRUE;
1550     bas->rootdups[i]   = PETSC_FALSE;
1551   }
1552 
1553   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1554   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1555 
1556   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1557   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1558 
1559   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1560   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1561     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1562   }
1563   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1564     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1565   }
1566 
1567   /* If not, see if we can have per-rank optimizations by doing index analysis */
1568   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1569   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1570 
1571   /* Are root indices for self and remote contiguous? */
1572   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1573   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1574 
1575   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1576   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1577 
1578   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1579     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1580   }
1581   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1582     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1583   }
1584 
1585   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1586   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1587 
1588 #if defined(PETSC_HAVE_DEVICE)
1589     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1590   if (PetscDefined(HAVE_DEVICE)) {
1591     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1592     if (!sf->leafcontig[0]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1593     if (!sf->leafcontig[1]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1594     if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1595     if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1596   }
1597 #endif
1598 
1599   PetscFunctionReturn(0);
1600 }
1601 
1602 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1603 {
1604   PetscErrorCode ierr;
1605   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1606   PetscInt       i;
1607 
1608   PetscFunctionBegin;
1609   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1610     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1611     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1612 #if defined(PETSC_HAVE_DEVICE)
1613     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1614     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1615 #endif
1616   }
1617   PetscFunctionReturn(0);
1618 }
1619