xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision c2c9d402870eb3e0b9d329459373e3140c32f915)
1 #include "petsc/private/sfimpl.h"
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
6 
7 #if defined(PETSC_HAVE_CUDA)
8 #include <cuda_runtime.h>
9 #include <petsccublas.h>
10 #endif
11 #if defined(PETSC_HAVE_HIP)
12 #include <hip/hip_runtime.h>
13 #include <petschipblas.h>
14 #endif
15 /*
16  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
17  * therefore we pack data types manually. This file defines packing routines for the standard data types.
18  */
19 
/* Paste four tokens into one identifier, e.g., CPPJoin4(Pack,PetscInt,8,1) -> Pack_PetscInt_8_1 */
20 #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
21 
22 /* Operations working like s += t */
23 #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
24 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
25 #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR; op is ignored */
26 #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)               /* plain assignment (INSERT); op is ignored */
27 /* Ref MPI MAXLOC/MINLOC: op is > (maxloc) or < (minloc). Entries are (value .u, index .i) pairs;
   on equal values keep the smaller index, otherwise keep the pair with the winning value. */
28 #define OP_XLOC(op,s,t) \
29   do {                                       \
30     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
31     else if (!((s).u op (t).u)) s = t;           \
32   } while (0)
33 
34 /* DEF_PackFunc - macro defining a Pack routine
35 
36    Arguments of the macro:
37    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
38    .BS        Block size for vectorization. It is a factor of bsz.
39    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
40 
41    Arguments of the Pack routine:
42    +count     Number of indices in idx[].
43    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
44    .opt       Per-pack optimization plan. NULL means no such plan.
45 .idx       Indices of entries to be packed.
46 .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
47 .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
48    -packed    Address of the packed data.
49  */
50 #define DEF_PackFunc(Type,BS,EQ) \
51   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
52   {                                                                                                          \
53     PetscErrorCode ierr;                                                                                     \
54     const Type     *u = (const Type*)unpacked,*u2;                                                           \
55     Type           *p = (Type*)packed,*p2;                                                                   \
56     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
57     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
58     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
59     PetscFunctionBegin;                                                                                      \
60     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
61     else if (opt) { /* has optimizations available */                                                        \
62       p2 = p;                                                                                                \
63       for (r=0; r<opt->n; r++) { /* run r is a dx*dy*dz sub-block of a 3D array with row length X and slice size X*Y */ \
64         u2 = u + opt->start[r]*MBS;                                                                          \
65         X  = opt->X[r];                                                                                      \
66         Y  = opt->Y[r];                                                                                      \
67         for (k=0; k<opt->dz[r]; k++)                                                                         \
68           for (j=0; j<opt->dy[r]; j++) {                                                                     \
69             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); /* x-lines are contiguous: copy a whole line at once */ \
70             p2  += opt->dx[r]*MBS;                                                                           \
71           }                                                                                                  \
72       }                                                                                                      \
73     } else {                                                                                                 \
74       for (i=0; i<count; i++)                                                                                \
75         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
76           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
77             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
78     }                                                                                                        \
79     PetscFunctionReturn(0);                                                                                  \
80   }
81 
82 /* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
83                 and inserts into a sparse array.
84 
85    Arguments:
86   +Type       Type of the data
87   .BS         Block size for vectorization
88   -EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
89 
90   Notes:
91    This macro is not combined with DEF_UnpackAndOp because we want to use memcpy (PetscArraycpy) in this macro.
92  */
93 #define DEF_UnpackFunc(Type,BS,EQ)               \
94   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
95   {                                                                                                          \
96     PetscErrorCode ierr;                                                                                     \
97     Type           *u = (Type*)unpacked,*u2;                                                                 \
98     const Type     *p = (const Type*)packed;                                                                 \
99     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
100     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
101     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
102     PetscFunctionBegin;                                                                                      \
103     if (!idx) {                                                                                              \
104       u += start*MBS;                                                                                        \
105       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} /* skip the copy when src and dst coincide */ \
106     } else if (opt) { /* has optimizations available */                                                      \
107       for (r=0; r<opt->n; r++) { /* reverse of the 3D Pack case: copy x-lines from the buffer back into the 3D array */ \
108         u2 = u + opt->start[r]*MBS;                                                                          \
109         X  = opt->X[r];                                                                                      \
110         Y  = opt->Y[r];                                                                                      \
111         for (k=0; k<opt->dz[r]; k++)                                                                         \
112           for (j=0; j<opt->dy[r]; j++) {                                                                     \
113             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
114             p   += opt->dx[r]*MBS;                                                                           \
115           }                                                                                                  \
116       }                                                                                                      \
117     } else { /* general sparse case: buffer entry i goes to u[idx[i]] */                                     \
118       for (i=0; i<count; i++)                                                                                \
119         for (j=0; j<M; j++)                                                                                  \
120           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
121     }                                                                                                        \
122     PetscFunctionReturn(0);                                                                                  \
123   }
124 
125 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
126 
127    Arguments:
128   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
129   .Type       Type of the data
130   .BS         Block size for vectorization
131   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
132   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
133   -OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR or OP_XLOC
134  */
135 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
136   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
137   {                                                                                                          \
138     Type           *u = (Type*)unpacked,*u2;                                                                 \
139     const Type     *p = (const Type*)packed;                                                                 \
140     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
141     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
142     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
143     PetscFunctionBegin;                                                                                      \
144     if (!idx) { /* destination entries are contiguous starting at <start>; no memcpy here since Op != Insert */ \
145       u += start*MBS;                                                                                        \
146       for (i=0; i<count; i++)                                                                                \
147         for (j=0; j<M; j++)                                                                                  \
148           for (k=0; k<BS; k++)                                                                               \
149             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
150     } else if (opt) { /* idx[] has patterns */                                                               \
151       for (r=0; r<opt->n; r++) { /* apply Op over each dx*dy*dz run, one x-line at a time */                 \
152         u2 = u + opt->start[r]*MBS;                                                                          \
153         X  = opt->X[r];                                                                                      \
154         Y  = opt->Y[r];                                                                                      \
155         for (k=0; k<opt->dz[r]; k++)                                                                         \
156           for (j=0; j<opt->dy[r]; j++) {                                                                     \
157             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
158             p += opt->dx[r]*MBS;                                                                             \
159           }                                                                                                  \
160       }                                                                                                      \
161     } else { /* general sparse case: combine buffer entry i into u[idx[i]] */                                \
162       for (i=0; i<count; i++)                                                                                \
163         for (j=0; j<M; j++)                                                                                  \
164           for (k=0; k<BS; k++)                                                                               \
165             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
166     }                                                                                                        \
167     PetscFunctionReturn(0);                                                                                  \
168   }
169 
/* DEF_FetchAndOp - macro defining a FetchAndOp routine: for each selected entry, the packed
   buffer receives the entry's old value, and the entry is then combined with the buffer's
   incoming value via Op (a serial fetch-and-op). The optimization plan (opt) is accepted for
   interface uniformity but not exploited here. */
#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
  {                                                                                                          \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,old;                                              \
    PetscInt       e,b,c,uoff,poff,bs=link->bs;                                                              \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* const-folded to 1 when EQ, aiding optimization */              \
    const PetscInt MBS = M*BS;             /* equals bs; a compile-time const when EQ=1 */                   \
    PetscFunctionBegin;                                                                                      \
    for (e=0; e<count; e++) {                                                                                \
      uoff = (idx ? idx[e] : start+e)*MBS; /* where the e-th entry lives in u[] */                           \
      poff = e*MBS;                        /* entries are consecutive in p[] */                              \
      for (b=0; b<M; b++)                                                                                    \
        for (c=0; c<BS; c++) {                                                                               \
          old = u[uoff+b*BS+c];                                                                              \
          OpApply(Op,u[uoff+b*BS+c],p[poff+b*BS+c]);                                                         \
          p[poff+b*BS+c] = old; /* return the pre-update value to the caller */                              \
        }                                                                                                    \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }
190 
/* DEF_ScatterAndOp - macro defining a ScatterAndOp routine that moves entries directly from src
   to dst (no intermediate pack buffer), combining them into dst with Op. Contiguous src and
   single-run 3D src with contiguous dst are special-cased; everything else uses the generic loop. */
191 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
192   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
193   {                                                                                                          \
194     PetscErrorCode ierr;                                                                                     \
195     const Type     *u = (const Type*)src;                                                                    \
196     Type           *v = (Type*)dst;                                                                         \
197     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
198     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
199     const PetscInt MBS = M*BS;                                                                               \
200     PetscFunctionBegin;                                                                                      \
201     if (!srcIdx) { /* src is contiguous; it then looks like a packed buffer, so reuse UnpackAndOp */         \
202       u += srcStart*MBS;                                                                                     \
203       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
204     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
205       u += srcOpt->start[0]*MBS;                                                                             \
206       v += dstStart*MBS;                                                                                     \
207       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
208       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
209         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
210           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
211           v += srcOpt->dx[0]*MBS;                                                                            \
212         }                                                                                                    \
213     } else { /* all other cases: gather from u, combine into v, entry by entry */                            \
214       for (i=0; i<count; i++) {                                                                              \
215         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
216         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
217         for (j=0; j<M; j++)                                                                                  \
218           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
219       }                                                                                                      \
220     }                                                                                                        \
221     PetscFunctionReturn(0);                                                                                  \
222   }
223 
/* DEF_FetchAndOpLocal - macro defining the on-process fetch-and-op: for each (root r, leaf l)
   pair, leafupdate[l] receives the old rootdata[r], then rootdata[r] is combined with leafdata[l]
   via Op. rootopt/leafopt are accepted for interface uniformity but not used here. */
224 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
225   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
226   {                                                                                                          \
227     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
228     const Type     *ldata = (const Type*)leafdata;                                                           \
229     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
230     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
231     const PetscInt MBS = M*BS;                                                                               \
232     PetscFunctionBegin;                                                                                      \
233     for (i=0; i<count; i++) {                                                                                \
234       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
235       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
236       for (j=0; j<M; j++)                                                                                    \
237         for (k=0; k<BS; k++) {                                                                               \
238           lupdate[l+j*BS+k] = rdata[r+j*BS+k]; /* fetch old root value into leafupdate */                    \
239           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
240         }                                                                                                    \
241     }                                                                                                        \
242     PetscFunctionReturn(0);                                                                                  \
243   }
244 
245 /* Pack, Unpack/Fetch ops */
246 #define DEF_Pack(Type,BS,EQ)                                                      \
247   DEF_PackFunc(Type,BS,EQ)                                                        \
248   DEF_UnpackFunc(Type,BS,EQ)                                                      \
249   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
250   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { /* wire the generated routines to the link's host (h_) function pointers */ \
251     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
252     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
253     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
254   }
255 
256 /* Add, Mult ops */
257 #define DEF_Add(Type,BS,EQ)                                                       \
258   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
259   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
260   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
261   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
262   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
263   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
264   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { /* install arithmetic (sum/product) host routines */ \
265     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
266     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
267     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
268     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
269     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
270     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
271   }
272 
273 /* Max, Min ops */
274 #define DEF_Cmp(Type,BS,EQ)                                                       \
275   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
276   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
277   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
278   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
279   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { /* install max/min host routines */ \
280     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
281     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
282     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
283     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
284   }
285 
286 /* Logical ops.
287   OP_LXOR ignores its op argument, so it would ideally be passed empty; we pass || instead
288   to avoid the compilation warning "empty macro arguments are undefined in ISO C90".
289  */
290 #define DEF_Log(Type,BS,EQ)                                                       \
291   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
292   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
293   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
294   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
295   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
296   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
297   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { /* install logical AND/OR/XOR host routines */ \
298     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
299     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
300     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
301     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
302     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
303     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
304   }
305 
306 /* Bitwise ops */
307 #define DEF_Bit(Type,BS,EQ)                                                       \
308   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
309   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
310   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
311   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
312   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
313   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
314   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { /* install bitwise AND/OR/XOR host routines */ \
315     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
316     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
317     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
318     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
319     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
320     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
321   }
322 
323 /* Maxloc, Minloc ops */
/* Type must be a pair struct with value field .u and index field .i (see OP_XLOC); the generated
   Max/Min routines implement maxloc/minloc and are installed under the *loc names below. */
324 #define DEF_Xloc(Type,BS,EQ)                                                      \
325   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
326   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
327   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
328   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
329   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
330     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
331     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
332     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
333     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
334   }
335 
/* Integer types get the full set of reductions: insert, add/mult, max/min, logical and bitwise */
336 #define DEF_IntegerType(Type,BS,EQ)                                               \
337   DEF_Pack(Type,BS,EQ)                                                            \
338   DEF_Add(Type,BS,EQ)                                                             \
339   DEF_Cmp(Type,BS,EQ)                                                             \
340   DEF_Log(Type,BS,EQ)                                                             \
341   DEF_Bit(Type,BS,EQ)                                                             \
342   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
343     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
344     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
345     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
346     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
347     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
348   }
349 
/* Real types get insert, add/mult and max/min, but no logical/bitwise ops */
350 #define DEF_RealType(Type,BS,EQ)                                                  \
351   DEF_Pack(Type,BS,EQ)                                                            \
352   DEF_Add(Type,BS,EQ)                                                             \
353   DEF_Cmp(Type,BS,EQ)                                                             \
354   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
355     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
356     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
357     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
358   }
359 
360 #if defined(PETSC_HAVE_COMPLEX)
/* Complex types get insert and add/mult only; no max/min since complex numbers are not ordered */
361 #define DEF_ComplexType(Type,BS,EQ)                                               \
362   DEF_Pack(Type,BS,EQ)                                                            \
363   DEF_Add(Type,BS,EQ)                                                             \
364   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
365     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
366     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
367   }
368 #endif
369 
/* "Dumb" types: only Pack/UnpackAndInsert/ScatterAndInsert are defined; no reduction ops */
370 #define DEF_DumbType(Type,BS,EQ)                                                  \
371   DEF_Pack(Type,BS,EQ)                                                            \
372   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
373     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
374   }
375 
376 /* Maxloc, Minloc */
/* Pair (value,index) types: insert plus maxloc/minloc (see DEF_Xloc for the required .u/.i fields) */
377 #define DEF_PairType(Type,BS,EQ)                                                  \
378   DEF_Pack(Type,BS,EQ)                                                            \
379   DEF_Xloc(Type,BS,EQ)                                                            \
380   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
381     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
382     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
383   }
384 
/* Instantiate integer kernels at block sizes 1,2,4,8, each in an exact (EQ=1) and
   a multiple-of (EQ=0) variant. PetscSFLinkSetUp_Host tries bigger BS first. */
DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif
404 
405 /* The typedefs are used to get a typename without space that CPPJoin can handle */
406 typedef signed char SignedChar;
407 DEF_IntegerType(SignedChar,1,1)
408 DEF_IntegerType(SignedChar,2,1)
409 DEF_IntegerType(SignedChar,4,1)
410 DEF_IntegerType(SignedChar,8,1)
411 DEF_IntegerType(SignedChar,1,0)
412 DEF_IntegerType(SignedChar,2,0)
413 DEF_IntegerType(SignedChar,4,0)
414 DEF_IntegerType(SignedChar,8,0)
415 
416 typedef unsigned char UnsignedChar;
417 DEF_IntegerType(UnsignedChar,1,1)
418 DEF_IntegerType(UnsignedChar,2,1)
419 DEF_IntegerType(UnsignedChar,4,1)
420 DEF_IntegerType(UnsignedChar,8,1)
421 DEF_IntegerType(UnsignedChar,1,0)
422 DEF_IntegerType(UnsignedChar,2,0)
423 DEF_IntegerType(UnsignedChar,4,0)
424 DEF_IntegerType(UnsignedChar,8,0)
425 
/* Real and (optionally) complex kernels at block sizes 1,2,4,8 with EQ=1/0 variants */
DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif
445 
/* (value,index) pair structs for MPI_MAXLOC/MPI_MINLOC, matching MPI_2INT and MPIU_2INT */
#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int u; int i;}           PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)
469 
470 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
471 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
472 {
473   int ierr;
474   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
475   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
476   return MPI_SUCCESS;
477 }
478 #endif
479 
480 /*
   The routine creates a communication link for the given operation. It first looks up its link cache. If
482    there is a free & suitable one, it uses it. Otherwise it creates a new one.
483 
484    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we allocate buffers, we only allocate
487    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
488 
489    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
490 
   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.
492 
493    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
494    need pack/unpack data.
495 */
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode    ierr;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink       *p,link;
  PetscSFDirection  direction;
  MPI_Request       *reqs = NULL;
  PetscBool         match,rootdirect[2],leafdirect[2];
  PetscMemType      rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1*/
  PetscMemType      leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdirect with the given sf, sfop and op?
     "Direct" means the contiguous user array doubles as the MPI buffer, skipping pack/unpack. */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache; any link whose unit matches can be reused */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests matches with current.
         If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
      */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  /* Cache miss: build a new link for this unit */
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* 8 = 2 directions x 2 mtypes x 2 direct/indirect; requests for all combinations share one array */
  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:

#if defined(PETSC_HAVE_DEVICE)
  /* Lazily set up the device side the first time device memory is involved */
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
    #if defined(PETSC_HAVE_CUDA)
      if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_Cuda(sf,link,unit);CHKERRQ(ierr);}
    #endif
    #if defined(PETSC_HAVE_HIP)
      if (sf->backend == PETSCSF_BACKEND_HIP)   {ierr = PetscSFLinkSetUp_Hip(sf,link,unit);CHKERRQ(ierr);}
    #endif
    #if defined(PETSC_HAVE_KOKKOS)
      if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
    #endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on host for buffering data on device in case one does not use_gpu_aware_mpi */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi  = rootdirect_mpi;
  link->leafdirect_mpi  = leafdirect_mpi;
  link->rootmtype       = rootmtype;
  link->leafmtype       = leafmtype;
  link->rootmtype_mpi   = rootmtype_mpi;
  link->leafmtype_mpi   = leafmtype_mpi;

  link->next            = bas->inuse;
  bas->inuse            = link;
  *mylink               = link;
  PetscFunctionReturn(0);
}
663 
/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the sf uses persistent requests and the requests have not been initialized, then initialize them.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode       ierr;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt       *rootoffset,*leafoffset;
  PetscMPIInt          n;
  MPI_Aint             disp;
  MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype         unit = link->unit;
  const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
  if (sf->persistent) {
    /* Roots receive in LEAF2ROOT and send in ROOT2LEAF; only the non-distinguished
       (i.e., remote) ranks [ndrootranks,nrootranks) get requests. */
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; /* byte offset of rank i's chunk within the remote buffer */
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }

    /* Leaves do the mirror image: send in LEAF2ROOT, receive in ROOT2LEAF */
    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}
725 
726 
727 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
728 {
729   PetscErrorCode    ierr;
730   PetscSFLink       link,*p;
731   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
732 
733   PetscFunctionBegin;
734   /* Look for types in cache */
735   for (p=&bas->inuse; (link=*p); p=&link->next) {
736     PetscBool match;
737     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
738     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
739       switch (cmode) {
740       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
741       case PETSC_USE_POINTER: break;
742       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
743       }
744       *mylink = link;
745       PetscFunctionReturn(0);
746     }
747   }
748   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
749 }
750 
751 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
752 {
753   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
754 
755   PetscFunctionBegin;
756   (*link)->rootdata = NULL;
757   (*link)->leafdata = NULL;
758   (*link)->next     = bas->avail;
759   bas->avail        = *link;
760   *link             = NULL;
761   PetscFunctionReturn(0);
762 }
763 
/* Destroy all links chained in 'avail': free duplicated MPI types, persistent requests,
   host/device buffers, and any device-side resources, then free the links themselves. */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode    ierr;
  PetscSFLink       link = *avail,next;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* 8 = 2 directions x 2 mtypes x 2 direct/indirect, matching PetscSFLinkCreate */

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    /* Only duplicated (non-builtin) datatypes were created by us and must be freed */
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      #if defined(PETSC_HAVE_DEVICE)
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      #endif
    }
  #if defined(PETSC_HAVE_DEVICE)
    /* Backend-specific teardown installed by PetscSFLinkSetUp_{Cuda,Hip,Kokkos} */
    if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
   /*TODO:  Make runtime */
   #if defined(PETSC_HAVE_CUDA)
    if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
   #elif defined(PETSC_HAVE_HIP)
    if (link->stream) {hipError_t  cerr = hipStreamDestroy(link->stream);CHKERRHIP(cerr); link->stream = NULL;}
   #endif
  #endif
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}
802 
803 /* Error out on unsupported overlapped communications */
804 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
805 {
806   PetscErrorCode    ierr;
807   PetscSFLink       link,*p;
808   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
809   PetscBool         match;
810 
811   PetscFunctionBegin;
812   if (PetscDefined(USE_DEBUG)) {
813     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
814        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
815     */
816     if (rootdata || leafdata) {
817       for (p=&bas->inuse; (link=*p); p=&link->next) {
818         ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
819         if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
820       }
821     }
822   }
823   PetscFunctionReturn(0);
824 }
825 
826 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
827 {
828   PetscFunctionBegin;
829   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
830   PetscFunctionReturn(0);
831 }
832 
833 
/* Decompose 'unit' into a known basic type and a block size, then install the matching
   host pack/unpack kernels on the link. Sets link->bs, unitbytes, basicunit, isbuiltin,
   unit (dup'ed if not builtin) and the host Memcpy callback. */
PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  /* Each nXxx below is the count of basic elements if 'unit' is contiguous in that type, else 0 */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    /* Prefer the largest exact block size; fall back to the largest divisor variant */
    if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) { /* Only reachable when PetscInt != int, otherwise nPetscInt already matched */
    if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  }  else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    /* Unknown type: treat it as a stream of ints if possible, otherwise of bytes (no reductions) */
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
      if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  /* Keep our own reference for non-builtin types so the caller may free 'unit' */
  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}

  link->Memcpy = PetscSFLinkMemcpy_Host;
  PetscFunctionReturn(0);
}
962 
963 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
964 {
965   PetscFunctionBegin;
966   *UnpackAndOp = NULL;
967   if (mtype == PETSC_MEMTYPE_HOST) {
968     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
969     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
970     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
971     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
972     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
973     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
974     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
975     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
976     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
977     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
978     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
979     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
980     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
981   }
982 #if defined(PETSC_HAVE_DEVICE)
983   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
984     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
985     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
986     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
987     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
988     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
989     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
990     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
991     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
992     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
993     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
994     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
995     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
996     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
997   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
998     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
999     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
1000     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
1001     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
1002     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
1003     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
1004     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
1005     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
1006     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
1007     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
1008     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
1009     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
1010     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
1011   }
1012 #endif
1013   PetscFunctionReturn(0);
1014 }
1015 
1016 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
1017 {
1018   PetscFunctionBegin;
1019   *ScatterAndOp = NULL;
1020   if (mtype == PETSC_MEMTYPE_HOST) {
1021     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
1022     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
1023     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
1024     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
1025     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
1026     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
1027     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
1028     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
1029     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
1030     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
1031     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
1032     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
1033     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
1034   }
1035 #if defined(PETSC_HAVE_DEVICE)
1036   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
1037     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
1038     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
1039     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
1040     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1041     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1042     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1043     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1044     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1045     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1046     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1047     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1048     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1049     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1050   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1051     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1052     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1053     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1054     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1055     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1056     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1057     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1058     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1059     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1060     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1061     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1062     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1063     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1064   }
1065 #endif
1066   PetscFunctionReturn(0);
1067 }
1068 
1069 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1070 {
1071   PetscFunctionBegin;
1072   *FetchAndOp = NULL;
1073   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1074   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1075 #if defined(PETSC_HAVE_DEVICE)
1076   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1077   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1078 #endif
1079   PetscFunctionReturn(0);
1080 }
1081 
1082 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1083 {
1084   PetscFunctionBegin;
1085   *FetchAndOpLocal = NULL;
1086   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1087   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1088 #if defined(PETSC_HAVE_DEVICE)
1089   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1090   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1091 #endif
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1096 {
1097   PetscErrorCode ierr;
1098   PetscLogDouble flops;
1099   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1100 
1101 
1102   PetscFunctionBegin;
1103   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1104     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1105 #if defined(PETSC_HAVE_DEVICE)
1106     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1107 #endif
1108     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1109   }
1110   PetscFunctionReturn(0);
1111 }
1112 
/* Log flops performed while unpacking the leaf buffer into leafdata.
   Only reductions on PetscScalar units count: one flop per scalar in the buffer. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    /* Attribute the flops to the GPU when the leaves live in device memory */
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
1128 
1129 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1130   Input Arguments:
1131   +sf      - The StarForest
1132   .link    - The link
1133   .count   - Number of entries to unpack
1134   .start   - The first index, significent when indices=NULL
1135   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1136   .buf     - A contiguous buffer to unpack from
1137   -op      - Operation after unpack
1138 
1139   Output Arguments:
1140   .data    - The data to unpack to
1141 */
1142 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1143 {
1144   PetscFunctionBegin;
1145 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1146   {
1147     PetscErrorCode ierr;
1148     PetscInt       i;
1149     PetscMPIInt    n;
1150     if (indices) {
1151       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
1152          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
1153       */
1154       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
1155     } else {
1156       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
1157       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr);
1158     }
1159   }
1160   PetscFunctionReturn(0);
1161 #else
1162   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1163 #endif
1164 }
1165 
1166 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1167 {
1168   PetscFunctionBegin;
1169 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1170   {
1171     PetscErrorCode ierr;
1172     PetscInt       i,disp;
1173     if (!srcIdx) {
1174       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
1175     } else {
1176       for (i=0; i<count; i++) {
1177         disp = dstIdx? dstIdx[i] : dstStart + i;
1178         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
1179       }
1180     }
1181   }
1182   PetscFunctionReturn(0);
1183 #else
1184   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1185 #endif
1186 }
1187 
1188 /*=============================================================================
1189               Pack/Unpack/Fetch/Scatter routines
1190  ============================================================================*/
1191 
/* Pack rootdata to rootbuf
  Input Arguments:
  + sf       - The SF this packing works on.
  . link     - It gives the memtype of the roots and also provides root buffer.
  . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
  - rootdata - Where to read the roots.

  Notes:
  When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
  in a place where the underlying MPI can readily access it (whether use_gpu_aware_mpi is set or not).
 */
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;
  PetscSFPackOpt   opt = NULL;

  /* TODO(SEK): problems were reported in this routine -- needs investigation */
  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  /* For remote communication, synchronize the device before reading rootdata */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* When MPI is not GPU-aware, stage the packed buffer on the host before sending */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1229 
/* Pack leafdata to leafbuf; leaf-side counterpart of PetscSFLinkPackRootData() */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  /* For remote communication, synchronize the device before reading leafdata */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* When MPI is not GPU-aware, stage the packed buffer on the host before sending */
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1255 
/* Unpack rootbuf to rootdata, combining buffer entries into rootdata with <op>.
   Uses a specialized UnpackAndOp kernel when one exists; otherwise falls back to
   MPI_Reduce_local with host-resident indices. */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *rootindices = NULL;
  PetscInt         count,start;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType     rootmtype = link->rootmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* When MPI was not GPU-aware, the received data was staged on the host; move it back first */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    /* bas->rootdups[scope] selects the atomic flavor -- presumably set when duplicate root indices exist; TODO confirm */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else {
      /* Fallback path: MPI_Reduce_local requires host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1285 
/* Unpack leafbuf to leafdata, combining buffer entries into leafdata with <op>.
   Leaf-side counterpart of PetscSFLinkUnpackRootData(). */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode   ierr;
  const PetscInt   *leafindices = NULL;
  PetscInt         count,start;
  PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType     leafmtype = link->leafmtype;
  PetscSFPackOpt   opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* When MPI was not GPU-aware, the received data was staged on the host; move it back first */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    /* sf->leafdups[scope] selects the atomic flavor -- presumably set when duplicate leaf indices exist; TODO confirm */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else {
      /* Fallback path: MPI_Reduce_local requires host indices */
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1314 
/* FetchAndOp rootdata with rootbuf: update rootdata with <op> while writing the
   old root values back into rootbuf (which is then sent back to leaves). */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode     ierr;
  const PetscInt     *rootindices = NULL;
  PetscInt           count,start;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
  PetscMemType       rootmtype = link->rootmtype;
  PetscSFPackOpt     opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  /* When MPI was not GPU-aware, the received data was staged on the host; move it back first */
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) {
    /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    /* The buffer now holds the fetched old values; stage it back to the host if MPI is not GPU-aware */
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1343 
/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* Nothing to communicate locally */
  if (rootmtype != leafmtype) { /* Uncommon case: roots and leaves on different sides of CPU/GPU */
     /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    /* Copy the packed root buffer across the memory-type boundary into the leaf buffer */
    ierr = (*link->Memcpy)(link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    /* Same memory type: scatter directly from rootdata to leafdata when a kernel exists */
    ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
    } else {
      /* Fallback: MPI_Reduce_local requires host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1376 
/* Reduce leafdata to rootdata locally (mirror of PetscSFLinkBcastAndOpLocal with roles swapped) */
PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* Nothing to communicate locally */
  if (rootmtype != leafmtype) { /* Uncommon case: roots and leaves on different sides of CPU/GPU */
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    /* Copy the packed leaf buffer across the memory-type boundary into the root buffer */
    ierr = (*link->Memcpy)(link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    /* Same memory type: scatter directly from leafdata to rootdata when a kernel exists */
    ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
    } else {
      /* Fallback: MPI_Reduce_local requires host indices */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
1409 
/* Fetch rootdata to leafdata and leafupdate locally: leafupdate receives the old root
   values while rootdata is updated with leafdata under <op> (local edges only) */
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode       ierr;
  const PetscInt       *rootindices = NULL,*leafindices = NULL;
  PetscInt             count,rootstart,leafstart;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt       leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); /* Nothing to communicate locally */
  if (rootmtype != leafmtype) {
   /* Unlike Bcast/Reduce, there is no pack-and-copy fallback implemented for this case */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
  } else {
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
    /* bas->rootdups selects the atomic flavor -- presumably needed when roots are duplicated; TODO confirm */
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1434 
/*
  Create per-rank pack/unpack optimizations based on index patterns

   Input Parameters:
  +  n       - Number of destination ranks
  .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
  -  idx     - [*]   Array storing indices

   Output Parameters:
  +  out     - Pack optimizations. NULL if no optimizations.

   Notes:
   For each rank the routine tries to factor its index list into a dense 3D sub-block of
   sizes dx*dy*dz embedded in a larger (X,Y,*) box starting at <start>; if any rank's
   indices do not fit that pattern the whole optimization is abandoned.
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  /* All per-rank fields live in one contiguous array: [n | offset(n+1) | start,dx,dy,dz,X,Y (n each)] */
  ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n   + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  for (r=0; r<n; r++) { /* For each destination rank */
    m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in X dimension: count how many leading indices are consecutive */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }

    dydz = m/dx;
    /* The row stride X is the jump from <start> to the first index of the second row */
    X    = dydz > 1 ? (idx[p]-start) : dx;
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension; /* A clean break at a row boundary means we hit the next z-plane */
        }
      }
    }

Z_dimension:
    dz = m/(dx*dy);
    /* The plane height Y is derived from the jump to the first index of the second plane */
    Y  = dz > 1 ? (idx[p]-start)/X : dy;
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
    for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    /* Compact offsets of the recognized blocks, starting at 0 regardless of offset[0] */
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}
1522 
1523 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1524 {
1525   PetscErrorCode ierr;
1526   PetscSFPackOpt opt = *out;
1527 
1528   PetscFunctionBegin;
1529   if (opt) {
1530     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
1531     ierr = PetscFree(opt);CHKERRQ(ierr);
1532     *out = NULL;
1533   }
1534   PetscFunctionReturn(0);
1535 }
1536 
/* Analyze the SF's leaf (sf->rmine) and root (bas->irootloc) index arrays and precompute
   pack/unpack metadata: buffer lengths, start indices, contiguity flags, per-rank pack
   optimizations, and (on device builds) duplicate-index flags used to pick atomic vs.
   plain unpack kernels. */
PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  /* Leaf buffer sizes: the first ndranks entries of roffset cover self (distinguished)
     ranks, the rest cover remote ranks */
  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

  /* First leaf index of each part, used below as the base of the contiguity test */
  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous? (same analysis as above, on the root side) */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }

  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_DEVICE)
  /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
  /* NOTE(review): the dup check is skipped for multi-SFs (sf->multi == sf) — presumably they are handled elsewhere; confirm before relying on leafdups/rootdups for them */
  if (PetscDefined(HAVE_DEVICE)) {
    PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
    if (!sf->leafcontig[0]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
    if (!sf->leafcontig[1]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
    if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
    if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
  }
#endif

  PetscFunctionReturn(0);
}
1602 
1603 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1604 {
1605   PetscErrorCode ierr;
1606   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1607   PetscInt       i;
1608 
1609   PetscFunctionBegin;
1610   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1611     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1612     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1613 #if defined(PETSC_HAVE_DEVICE)
1614     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1615     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1616 #endif
1617   }
1618   PetscFunctionReturn(0);
1619 }
1620