xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 93520041193d7b76aece9b436de8c2aa14914128)
1 #include "petsc/private/sfimpl.h"
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
6 
7 /*
8  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
9  * therefore we pack data types manually. This file defines packing routines for the standard data types.
10  */
11 
12 #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
13 
14 /* Operations working like s += t */
15 #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
16 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
17 #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
18 #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
19 /* Ref MPI MAXLOC */
20 #define OP_XLOC(op,s,t) \
21   do {                                       \
22     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
23     else if (!((s).u op (t).u)) s = t;           \
24   } while (0)
25 
26 /* DEF_PackFunc - macro defining a Pack routine
27 
28    Arguments of the macro:
29    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30    .BS        Block size for vectorization. It is a factor of bsz.
31    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
32 
33    Arguments of the Pack routine:
34    +count     Number of indices in idx[].
35    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
36    .opt       Per-pack optimization plan. NULL means no such plan.
37    .idx       Indices of entries to packed.
38    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
40    -packed    Address of the packed data.
41  */
42 #define DEF_PackFunc(Type,BS,EQ) \
43   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
44   {                                                                                                          \
45     PetscErrorCode ierr;                                                                                     \
46     const Type     *u = (const Type*)unpacked,*u2;                                                           \
47     Type           *p = (Type*)packed,*p2;                                                                   \
48     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
49     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
50     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
51     PetscFunctionBegin;                                                                                      \
52     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
53     else if (opt) { /* has optimizations available */                                                        \
54       p2 = p;                                                                                                \
55       for (r=0; r<opt->n; r++) {                                                                             \
56         u2 = u + opt->start[r]*MBS;                                                                          \
57         X  = opt->X[r];                                                                                      \
58         Y  = opt->Y[r];                                                                                      \
59         for (k=0; k<opt->dz[r]; k++)                                                                         \
60           for (j=0; j<opt->dy[r]; j++) {                                                                     \
61             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
62             p2  += opt->dx[r]*MBS;                                                                           \
63           }                                                                                                  \
64       }                                                                                                      \
65     } else {                                                                                                 \
66       for (i=0; i<count; i++)                                                                                \
67         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
68           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
69             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
70     }                                                                                                        \
71     PetscFunctionReturn(0);                                                                                  \
72   }
73 
74 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
75                 and inserts into a sparse array.
76 
77    Arguments:
78   .Type       Type of the data
79   .BS         Block size for vectorization
80   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
81 
82   Notes:
83    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
84  */
85 #define DEF_UnpackFunc(Type,BS,EQ)               \
86   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
87   {                                                                                                          \
88     PetscErrorCode ierr;                                                                                     \
89     Type           *u = (Type*)unpacked,*u2;                                                                 \
90     const Type     *p = (const Type*)packed;                                                                 \
91     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
92     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
93     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
94     PetscFunctionBegin;                                                                                      \
95     if (!idx) {                                                                                              \
96       u += start*MBS;                                                                                        \
97       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
98     } else if (opt) { /* has optimizations available */                                                      \
99       for (r=0; r<opt->n; r++) {                                                                             \
100         u2 = u + opt->start[r]*MBS;                                                                          \
101         X  = opt->X[r];                                                                                      \
102         Y  = opt->Y[r];                                                                                      \
103         for (k=0; k<opt->dz[r]; k++)                                                                         \
104           for (j=0; j<opt->dy[r]; j++) {                                                                     \
105             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
106             p   += opt->dx[r]*MBS;                                                                           \
107           }                                                                                                  \
108       }                                                                                                      \
109     } else {                                                                                                 \
110       for (i=0; i<count; i++)                                                                                \
111         for (j=0; j<M; j++)                                                                                  \
112           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
113     }                                                                                                        \
114     PetscFunctionReturn(0);                                                                                  \
115   }
116 
117 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
118 
119    Arguments:
120   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
121   .Type       Type of the data
122   .BS         Block size for vectorization
123   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
124   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
125   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
126  */
127 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
128   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
129   {                                                                                                          \
130     Type           *u = (Type*)unpacked,*u2;                                                                 \
131     const Type     *p = (const Type*)packed;                                                                 \
132     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
133     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
134     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
135     PetscFunctionBegin;                                                                                      \
136     if (!idx) {                                                                                              \
137       u += start*MBS;                                                                                        \
138       for (i=0; i<count; i++)                                                                                \
139         for (j=0; j<M; j++)                                                                                  \
140           for (k=0; k<BS; k++)                                                                               \
141             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
142     } else if (opt) { /* idx[] has patterns */                                                               \
143       for (r=0; r<opt->n; r++) {                                                                             \
144         u2 = u + opt->start[r]*MBS;                                                                          \
145         X  = opt->X[r];                                                                                      \
146         Y  = opt->Y[r];                                                                                      \
147         for (k=0; k<opt->dz[r]; k++)                                                                         \
148           for (j=0; j<opt->dy[r]; j++) {                                                                     \
149             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
150             p += opt->dx[r]*MBS;                                                                             \
151           }                                                                                                  \
152       }                                                                                                      \
153     } else {                                                                                                 \
154       for (i=0; i<count; i++)                                                                                \
155         for (j=0; j<M; j++)                                                                                  \
156           for (k=0; k<BS; k++)                                                                               \
157             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
158     }                                                                                                        \
159     PetscFunctionReturn(0);                                                                                  \
160   }
161 
162 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
163   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
164   {                                                                                                          \
165     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
166     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
167     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
168     const PetscInt MBS = M*BS;                                                                               \
169     PetscFunctionBegin;                                                                                      \
170     for (i=0; i<count; i++) {                                                                                \
171       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
172       l = i*MBS;                                                                                             \
173       for (j=0; j<M; j++)                                                                                    \
174         for (k=0; k<BS; k++) {                                                                               \
175           tmp = u[r+j*BS+k];                                                                                 \
176           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
177           p[l+j*BS+k] = tmp;                                                                                 \
178         }                                                                                                    \
179     }                                                                                                        \
180     PetscFunctionReturn(0);                                                                                  \
181   }
182 
183 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
184   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
185   {                                                                                                          \
186     PetscErrorCode ierr;                                                                                     \
187     const Type     *u = (const Type*)src;                                                                    \
188     Type           *v = (Type*)dst;                                                                          \
189     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
190     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
191     const PetscInt MBS = M*BS;                                                                               \
192     PetscFunctionBegin;                                                                                      \
193     if (!srcIdx) { /* src is contiguous */                                                                   \
194       u += srcStart*MBS;                                                                                     \
195       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
196     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
197       u += srcOpt->start[0]*MBS;                                                                             \
198       v += dstStart*MBS;                                                                                     \
199       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
200       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
201         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
202           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
203           v += srcOpt->dx[0]*MBS;                                                                            \
204         }                                                                                                    \
205     } else { /* all other cases */                                                                           \
206       for (i=0; i<count; i++) {                                                                              \
207         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
208         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
209         for (j=0; j<M; j++)                                                                                  \
210           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
211       }                                                                                                      \
212     }                                                                                                        \
213     PetscFunctionReturn(0);                                                                                  \
214   }
215 
216 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
217   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
218   {                                                                                                          \
219     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
220     const Type     *ldata = (const Type*)leafdata;                                                           \
221     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
222     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
223     const PetscInt MBS = M*BS;                                                                               \
224     PetscFunctionBegin;                                                                                      \
225     for (i=0; i<count; i++) {                                                                                \
226       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
227       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
228       for (j=0; j<M; j++)                                                                                    \
229         for (k=0; k<BS; k++) {                                                                               \
230           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
231           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
232         }                                                                                                    \
233     }                                                                                                        \
234     PetscFunctionReturn(0);                                                                                  \
235   }
236 
237 /* Pack, Unpack/Fetch ops */
238 #define DEF_Pack(Type,BS,EQ)                                                      \
239   DEF_PackFunc(Type,BS,EQ)                                                        \
240   DEF_UnpackFunc(Type,BS,EQ)                                                      \
241   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
242   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
243     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
244     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
245     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
246   }
247 
248 /* Add, Mult ops */
249 #define DEF_Add(Type,BS,EQ)                                                       \
250   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
251   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
252   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
253   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
254   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
255   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
256   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
257     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
258     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
259     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
260     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
261     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
262     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
263   }
264 
265 /* Max, Min ops */
266 #define DEF_Cmp(Type,BS,EQ)                                                       \
267   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
268   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
269   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
270   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
271   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
272     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
273     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
274     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
275     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
276   }
277 
278 /* Logical ops.
279   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
280   the compilation warning "empty macro arguments are undefined in ISO C90"
281  */
282 #define DEF_Log(Type,BS,EQ)                                                       \
283   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
284   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
285   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
286   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
287   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
288   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
289   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
290     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
291     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
292     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
293     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
294     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
295     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
296   }
297 
298 /* Bitwise ops */
299 #define DEF_Bit(Type,BS,EQ)                                                       \
300   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
301   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
302   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
303   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
304   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
305   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
306   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
307     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
308     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
309     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
310     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
311     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
312     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
313   }
314 
315 /* Maxloc, Minloc ops */
316 #define DEF_Xloc(Type,BS,EQ)                                                      \
317   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
318   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
319   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
320   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
321   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
322     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
323     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
324     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
325     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
326   }
327 
328 #define DEF_IntegerType(Type,BS,EQ)                                               \
329   DEF_Pack(Type,BS,EQ)                                                            \
330   DEF_Add(Type,BS,EQ)                                                             \
331   DEF_Cmp(Type,BS,EQ)                                                             \
332   DEF_Log(Type,BS,EQ)                                                             \
333   DEF_Bit(Type,BS,EQ)                                                             \
334   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
335     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
336     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
337     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
338     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
339     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
340   }
341 
342 #define DEF_RealType(Type,BS,EQ)                                                  \
343   DEF_Pack(Type,BS,EQ)                                                            \
344   DEF_Add(Type,BS,EQ)                                                             \
345   DEF_Cmp(Type,BS,EQ)                                                             \
346   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
347     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
348     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
349     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
350   }
351 
352 #if defined(PETSC_HAVE_COMPLEX)
353 #define DEF_ComplexType(Type,BS,EQ)                                               \
354   DEF_Pack(Type,BS,EQ)                                                            \
355   DEF_Add(Type,BS,EQ)                                                             \
356   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
357     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
358     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
359   }
360 #endif
361 
362 #define DEF_DumbType(Type,BS,EQ)                                                  \
363   DEF_Pack(Type,BS,EQ)                                                            \
364   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
365     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
366   }
367 
368 /* Maxloc, Minloc */
369 #define DEF_PairType(Type,BS,EQ)                                                  \
370   DEF_Pack(Type,BS,EQ)                                                            \
371   DEF_Xloc(Type,BS,EQ)                                                            \
372   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
373     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
374     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
375   }
376 
377 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
378 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
379 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
380 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
381 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
382 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
383 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
384 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
385 
386 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
387 DEF_IntegerType(int,1,1)
388 DEF_IntegerType(int,2,1)
389 DEF_IntegerType(int,4,1)
390 DEF_IntegerType(int,8,1)
391 DEF_IntegerType(int,1,0)
392 DEF_IntegerType(int,2,0)
393 DEF_IntegerType(int,4,0)
394 DEF_IntegerType(int,8,0)
395 #endif
396 
397 /* The typedefs are used to get a typename without space that CPPJoin can handle */
398 typedef signed char SignedChar;
399 DEF_IntegerType(SignedChar,1,1)
400 DEF_IntegerType(SignedChar,2,1)
401 DEF_IntegerType(SignedChar,4,1)
402 DEF_IntegerType(SignedChar,8,1)
403 DEF_IntegerType(SignedChar,1,0)
404 DEF_IntegerType(SignedChar,2,0)
405 DEF_IntegerType(SignedChar,4,0)
406 DEF_IntegerType(SignedChar,8,0)
407 
408 typedef unsigned char UnsignedChar;
409 DEF_IntegerType(UnsignedChar,1,1)
410 DEF_IntegerType(UnsignedChar,2,1)
411 DEF_IntegerType(UnsignedChar,4,1)
412 DEF_IntegerType(UnsignedChar,8,1)
413 DEF_IntegerType(UnsignedChar,1,0)
414 DEF_IntegerType(UnsignedChar,2,0)
415 DEF_IntegerType(UnsignedChar,4,0)
416 DEF_IntegerType(UnsignedChar,8,0)
417 
418 DEF_RealType(PetscReal,1,1)
419 DEF_RealType(PetscReal,2,1)
420 DEF_RealType(PetscReal,4,1)
421 DEF_RealType(PetscReal,8,1)
422 DEF_RealType(PetscReal,1,0)
423 DEF_RealType(PetscReal,2,0)
424 DEF_RealType(PetscReal,4,0)
425 DEF_RealType(PetscReal,8,0)
426 
427 #if defined(PETSC_HAVE_COMPLEX)
428 DEF_ComplexType(PetscComplex,1,1)
429 DEF_ComplexType(PetscComplex,2,1)
430 DEF_ComplexType(PetscComplex,4,1)
431 DEF_ComplexType(PetscComplex,8,1)
432 DEF_ComplexType(PetscComplex,1,0)
433 DEF_ComplexType(PetscComplex,2,0)
434 DEF_ComplexType(PetscComplex,4,0)
435 DEF_ComplexType(PetscComplex,8,0)
436 #endif
437 
438 #define PairType(Type1,Type2) Type1##_##Type2
439 typedef struct {int u; int i;}           PairType(int,int);
440 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
441 DEF_PairType(PairType(int,int),1,1)
442 DEF_PairType(PairType(PetscInt,PetscInt),1,1)
443 
444 /* If we don't know the basic type, we treat it as a stream of chars or ints */
445 DEF_DumbType(char,1,1)
446 DEF_DumbType(char,2,1)
447 DEF_DumbType(char,4,1)
448 DEF_DumbType(char,1,0)
449 DEF_DumbType(char,2,0)
450 DEF_DumbType(char,4,0)
451 
452 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
453 DEF_DumbType(DumbInt,1,1)
454 DEF_DumbType(DumbInt,2,1)
455 DEF_DumbType(DumbInt,4,1)
456 DEF_DumbType(DumbInt,8,1)
457 DEF_DumbType(DumbInt,1,0)
458 DEF_DumbType(DumbInt,2,0)
459 DEF_DumbType(DumbInt,4,0)
460 DEF_DumbType(DumbInt,8,0)
461 
462 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
463 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
464 {
465   int ierr;
466   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
467   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
468   return MPI_SUCCESS;
469 }
470 #endif
471 
472 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
473 {
474   PetscErrorCode    ierr;
475   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
476   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
477 
478   PetscFunctionBegin;
479   /* Destroy device-specific fields */
480   if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);}
481 
482   /* Destroy host related fields */
483   if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
484   if (!link->use_nvshmem) {
485     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
486       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
487     }
488     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
489     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
490       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
491       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
492     }
493   }
494   ierr = PetscFree(link);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
499 {
500   PetscErrorCode    ierr;
501 
502   PetscFunctionBegin;
503   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
504  #if defined(PETSC_HAVE_NVSHMEM)
505   {
506     PetscBool use_nvshmem;
507     ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr);
508     if (use_nvshmem) {
509       ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
510       PetscFunctionReturn(0);
511     }
512   }
513  #endif
514   ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
515   PetscFunctionReturn(0);
516 }
517 
518 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
519    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
520 */
521 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
522 {
523   PetscErrorCode       ierr;
524   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
525   PetscInt             i,j,cnt,nrootranks,ndrootranks,nleafranks,ndleafranks;
526   const PetscInt       *rootoffset,*leafoffset;
527   MPI_Aint             disp;
528   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
529   MPI_Datatype         unit = link->unit;
530   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
531   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
532 
533   PetscFunctionBegin;
534   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
535   if (sf->persistent) {
536     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
537       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
538       if (direction == PETSCSF_LEAF2ROOT) {
539         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
540           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
541           cnt  = rootoffset[i+1]-rootoffset[i];
542           ierr = MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
543         }
544       } else { /* PETSCSF_ROOT2LEAF */
545         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
546           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
547           cnt  = rootoffset[i+1]-rootoffset[i];
548           ierr = MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
549         }
550       }
551       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
552     }
553 
554     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
555       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
556       if (direction == PETSCSF_LEAF2ROOT) {
557         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
558           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
559           cnt  = leafoffset[i+1]-leafoffset[i];
560           ierr = MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
561         }
562       } else { /* PETSCSF_ROOT2LEAF */
563         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
564           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
565           cnt  = leafoffset[i+1]-leafoffset[i];
566           ierr = MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
567         }
568       }
569       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
570     }
571   }
572   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
573   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
574   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
575   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
576   PetscFunctionReturn(0);
577 }
578 
579 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
580 {
581   PetscErrorCode    ierr;
582   PetscSFLink       link,*p;
583   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
584 
585   PetscFunctionBegin;
586   /* Look for types in cache */
587   for (p=&bas->inuse; (link=*p); p=&link->next) {
588     PetscBool match;
589     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
590     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
591       switch (cmode) {
592       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
593       case PETSC_USE_POINTER: break;
594       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
595       }
596       *mylink = link;
597       PetscFunctionReturn(0);
598     }
599   }
600   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
601 }
602 
603 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
604 {
605   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
606   PetscSFLink       link = *mylink;
607 
608   PetscFunctionBegin;
609   link->rootdata = NULL;
610   link->leafdata = NULL;
611   link->next     = bas->avail;
612   bas->avail     = link;
613   *mylink        = NULL;
614   PetscFunctionReturn(0);
615 }
616 
617 /* Error out on unsupported overlapped communications */
618 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
619 {
620   PetscErrorCode    ierr;
621   PetscSFLink       link,*p;
622   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
623   PetscBool         match;
624 
625   PetscFunctionBegin;
626   if (PetscDefined(USE_DEBUG)) {
627     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
628        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
629     */
630     if (rootdata || leafdata) {
631       for (p=&bas->inuse; (link=*p); p=&link->next) {
632         ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
633         if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
634       }
635     }
636   }
637   PetscFunctionReturn(0);
638 }
639 
640 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
641 {
642   PetscFunctionBegin;
643   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
644   PetscFunctionReturn(0);
645 }
646 
647 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
648 {
649   PetscErrorCode ierr;
650   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
651   PetscBool      is2Int,is2PetscInt;
652   PetscMPIInt    ni,na,nd,combiner;
653 #if defined(PETSC_HAVE_COMPLEX)
654   PetscInt       nPetscComplex=0;
655 #endif
656 
657   PetscFunctionBegin;
658   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
659   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
660   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
661   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
662   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
663   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
664 #if defined(PETSC_HAVE_COMPLEX)
665   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
666 #endif
667   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
668   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
669   /* TODO: shaell we also handle Fortran MPI_2REAL? */
670   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
671   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
672   link->bs = 1; /* default */
673 
674   if (is2Int) {
675     PackInit_PairType_int_int_1_1(link);
676     link->bs        = 1;
677     link->unitbytes = 2*sizeof(int);
678     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
679     link->basicunit = MPI_2INT;
680     link->unit      = MPI_2INT;
681   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
682     PackInit_PairType_PetscInt_PetscInt_1_1(link);
683     link->bs        = 1;
684     link->unitbytes = 2*sizeof(PetscInt);
685     link->basicunit = MPIU_2INT;
686     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
687     link->unit      = MPIU_2INT;
688   } else if (nPetscReal) {
689     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
690     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
691     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
692     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
693     link->bs        = nPetscReal;
694     link->unitbytes = nPetscReal*sizeof(PetscReal);
695     link->basicunit = MPIU_REAL;
696     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
697   } else if (nPetscInt) {
698     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
699     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
700     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
701     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
702     link->bs        = nPetscInt;
703     link->unitbytes = nPetscInt*sizeof(PetscInt);
704     link->basicunit = MPIU_INT;
705     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
706 #if defined(PETSC_USE_64BIT_INDICES)
707   } else if (nInt) {
708     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
709     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
710     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
711     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
712     link->bs        = nInt;
713     link->unitbytes = nInt*sizeof(int);
714     link->basicunit = MPI_INT;
715     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
716 #endif
717   } else if (nSignedChar) {
718     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
719     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
720     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
721     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
722     link->bs        = nSignedChar;
723     link->unitbytes = nSignedChar*sizeof(SignedChar);
724     link->basicunit = MPI_SIGNED_CHAR;
725     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
726   }  else if (nUnsignedChar) {
727     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
728     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
729     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
730     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
731     link->bs        = nUnsignedChar;
732     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
733     link->basicunit = MPI_UNSIGNED_CHAR;
734     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
735 #if defined(PETSC_HAVE_COMPLEX)
736   } else if (nPetscComplex) {
737     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
738     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
739     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
740     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
741     link->bs        = nPetscComplex;
742     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
743     link->basicunit = MPIU_COMPLEX;
744     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
745 #endif
746   } else {
747     MPI_Aint lb,nbyte;
748     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
749     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb);
750     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
751       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
752       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
753       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
754       link->bs        = nbyte;
755       link->unitbytes = nbyte;
756       link->basicunit = MPI_BYTE;
757     } else {
758       nInt = nbyte / sizeof(int);
759       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
760       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
761       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
762       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
763       link->bs        = nInt;
764       link->unitbytes = nbyte;
765       link->basicunit = MPI_INT;
766     }
767     if (link->isbuiltin) link->unit = unit;
768   }
769 
770   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}
771 
772   link->Memcpy = PetscSFLinkMemcpy_Host;
773   PetscFunctionReturn(0);
774 }
775 
776 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
777 {
778   PetscFunctionBegin;
779   *UnpackAndOp = NULL;
780   if (PetscMemTypeHost(mtype)) {
781     if      (op == MPI_REPLACE)               *UnpackAndOp = link->h_UnpackAndInsert;
782     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
783     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
784     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
785     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
786     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
787     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
788     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
789     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
790     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
791     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
792     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
793     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
794   }
795 #if defined(PETSC_HAVE_DEVICE)
796   else if (PetscMemTypeDevice(mtype) && !atomic) {
797     if      (op == MPI_REPLACE)               *UnpackAndOp = link->d_UnpackAndInsert;
798     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
799     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
800     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
801     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
802     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
803     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
804     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
805     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
806     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
807     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
808     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
809     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
810   } else if (PetscMemTypeDevice(mtype) && atomic) {
811     if      (op == MPI_REPLACE)               *UnpackAndOp = link->da_UnpackAndInsert;
812     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
813     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
814     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
815     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
816     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
817     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
818     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
819     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
820     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
821     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
822     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
823     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
824   }
825 #endif
826   PetscFunctionReturn(0);
827 }
828 
829 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
830 {
831   PetscFunctionBegin;
832   *ScatterAndOp = NULL;
833   if (PetscMemTypeHost(mtype)) {
834     if      (op == MPI_REPLACE)               *ScatterAndOp = link->h_ScatterAndInsert;
835     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
836     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
837     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
838     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
839     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
840     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
841     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
842     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
843     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
844     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
845     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
846     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
847   }
848 #if defined(PETSC_HAVE_DEVICE)
849   else if (PetscMemTypeDevice(mtype) && !atomic) {
850     if      (op == MPI_REPLACE)               *ScatterAndOp = link->d_ScatterAndInsert;
851     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
852     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
853     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
854     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
855     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
856     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
857     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
858     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
859     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
860     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
861     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
862     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
863   } else if (PetscMemTypeDevice(mtype) && atomic) {
864     if      (op == MPI_REPLACE)               *ScatterAndOp = link->da_ScatterAndInsert;
865     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
866     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
867     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
868     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
869     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
870     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
871     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
872     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
873     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
874     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
875     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
876     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
877   }
878 #endif
879   PetscFunctionReturn(0);
880 }
881 
882 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
883 {
884   PetscFunctionBegin;
885   *FetchAndOp = NULL;
886   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
887   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
888 #if defined(PETSC_HAVE_DEVICE)
889   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
890   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOp = link->da_FetchAndAdd;
891 #endif
892   PetscFunctionReturn(0);
893 }
894 
895 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
896 {
897   PetscFunctionBegin;
898   *FetchAndOpLocal = NULL;
899   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
900   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
901 #if defined(PETSC_HAVE_DEVICE)
902   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
903   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
904 #endif
905   PetscFunctionReturn(0);
906 }
907 
908 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
909 {
910   PetscErrorCode ierr;
911   PetscLogDouble flops;
912   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
913 
914   PetscFunctionBegin;
915   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
916     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
917 #if defined(PETSC_HAVE_DEVICE)
918     if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
919 #endif
920     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
921   }
922   PetscFunctionReturn(0);
923 }
924 
925 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
926 {
927   PetscLogDouble flops;
928   PetscErrorCode ierr;
929 
930   PetscFunctionBegin;
931   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
932     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
933 #if defined(PETSC_HAVE_DEVICE)
934     if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
935 #endif
936     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
937   }
938   PetscFunctionReturn(0);
939 }
940 
941 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
942   Input Parameters:
943   +sf      - The StarForest
944   .link    - The link
945   .count   - Number of entries to unpack
946   .start   - The first index, significent when indices=NULL
947   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
948   .buf     - A contiguous buffer to unpack from
949   -op      - Operation after unpack
950 
951   Output Parameters:
952   .data    - The data to unpack to
953 */
954 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
955 {
956   PetscFunctionBegin;
957 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
958   {
959     PetscErrorCode ierr;
960     PetscInt       i;
961     if (indices) {
962       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
963          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
964       */
965       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
966     } else {
967       ierr = MPIU_Reduce_local(buf,(char*)data+start*link->unitbytes,count,link->unit,op);CHKERRMPI(ierr);
968     }
969   }
970   PetscFunctionReturn(0);
971 #else
972   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
973 #endif
974 }
975 
976 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
977 {
978   PetscFunctionBegin;
979 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
980   {
981     PetscErrorCode ierr;
982     PetscInt       i,disp;
983     if (!srcIdx) {
984       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
985     } else {
986       for (i=0; i<count; i++) {
987         disp = dstIdx? dstIdx[i] : dstStart + i;
988         ierr = MPIU_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
989       }
990     }
991   }
992   PetscFunctionReturn(0);
993 #else
994   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
995 #endif
996 }
997 
998 /*=============================================================================
999               Pack/Unpack/Fetch/Scatter routines
1000  ============================================================================*/
1001 
1002 /* Pack rootdata to rootbuf
1003   Input Parameters:
1004   + sf       - The SF this packing works on.
1005   . link     - It gives the memtype of the roots and also provides root buffer.
1006   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1007   - rootdata - Where to read the roots.
1008 
1009   Notes:
1010   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1011   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
1012  */
1013 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1014 {
1015   PetscErrorCode   ierr;
1016   const PetscInt   *rootindices = NULL;
1017   PetscInt         count,start;
1018   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1019   PetscMemType     rootmtype = link->rootmtype;
1020   PetscSFPackOpt   opt = NULL;
1021 
1022   PetscFunctionBegin;
1023   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1024   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1025     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1026     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1027     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1028   }
1029   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1030   PetscFunctionReturn(0);
1031 }
1032 
1033 /* Pack leafdata to leafbuf */
1034 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1035 {
1036   PetscErrorCode   ierr;
1037   const PetscInt   *leafindices = NULL;
1038   PetscInt         count,start;
1039   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1040   PetscMemType     leafmtype = link->leafmtype;
1041   PetscSFPackOpt   opt = NULL;
1042 
1043   PetscFunctionBegin;
1044   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1045   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1046     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1047     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1048     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1049   }
1050   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1051   PetscFunctionReturn(0);
1052 }
1053 
1054 /* Pack rootdata to rootbuf, which are in the same memory space */
1055 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1056 {
1057   PetscErrorCode   ierr;
1058   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1059 
1060   PetscFunctionBegin;
1061   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
1062     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1063     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */
1064   }
1065   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1066   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);}
1067   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1068   PetscFunctionReturn(0);
1069 }
1070 /* Pack leafdata to leafbuf, which are in the same memory space */
1071 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1072 {
1073   PetscErrorCode   ierr;
1074 
1075   PetscFunctionBegin;
1076   if (scope == PETSCSF_REMOTE) {
1077     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1078     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1079   }
1080   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1081   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);}
1082   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1083   PetscFunctionReturn(0);
1084 }
1085 
1086 PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1087 {
1088   PetscErrorCode   ierr;
1089   const PetscInt   *rootindices = NULL;
1090   PetscInt         count,start;
1091   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1092   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1093   PetscMemType     rootmtype = link->rootmtype;
1094   PetscSFPackOpt   opt = NULL;
1095 
1096   PetscFunctionBegin;
1097   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1098     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1099     if (UnpackAndOp) {
1100       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1101       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1102     } else {
1103       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1104       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1105     }
1106   }
1107   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1108   PetscFunctionReturn(0);
1109 }
1110 
1111 PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1112 {
1113   PetscErrorCode   ierr;
1114   const PetscInt   *leafindices = NULL;
1115   PetscInt         count,start;
1116   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1117   PetscMemType     leafmtype = link->leafmtype;
1118   PetscSFPackOpt   opt = NULL;
1119 
1120   PetscFunctionBegin;
1121   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1122     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1123     if (UnpackAndOp) {
1124       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1125       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1126     } else {
1127       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1128       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1129     }
1130   }
1131   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1132   PetscFunctionReturn(0);
1133 }
1134 /* Unpack rootbuf to rootdata, which are in the same memory space */
1135 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1136 {
1137   PetscErrorCode   ierr;
1138   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1139 
1140   PetscFunctionBegin;
1141   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1142   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);}
1143   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1144   if (scope == PETSCSF_REMOTE) {
1145     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1146     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1147   }
1148   PetscFunctionReturn(0);
1149 }
1150 
1151 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1152 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1153 {
1154   PetscErrorCode   ierr;
1155 
1156   PetscFunctionBegin;
1157   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1158   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);CHKERRQ(ierr);}
1159   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1160   if (scope == PETSCSF_REMOTE) {
1161     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1162     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1163   }
1164   PetscFunctionReturn(0);
1165 }
1166 
1167 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
1168 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1169 {
1170   PetscErrorCode     ierr;
1171   const PetscInt     *rootindices = NULL;
1172   PetscInt           count,start;
1173   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1174   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1175   PetscMemType       rootmtype = link->rootmtype;
1176   PetscSFPackOpt     opt = NULL;
1177 
1178   PetscFunctionBegin;
1179   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1180   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1181     /* Do FetchAndOp on rootdata with rootbuf */
1182     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr);
1183     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1184     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr);
1185   }
1186   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr);
1187   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1188   PetscFunctionReturn(0);
1189 }
1190 
1191 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1192 {
1193   PetscErrorCode       ierr;
1194   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1195   PetscInt             count,rootstart,leafstart;
1196   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1197   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1198   PetscMemType         rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1199   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1200   PetscInt             buflen = sf->leafbuflen[PETSCSF_LOCAL];
1201   char                 *srcbuf = NULL,*dstbuf = NULL;
1202   PetscBool            dstdups;
1203 
1204   PetscFunctionBegin;
1205   if (!buflen) PetscFunctionReturn(0);
1206   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
1207     if (direction == PETSCSF_ROOT2LEAF) {
1208       ierr     = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
1209       srcmtype = rootmtype;
1210       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1211       dstmtype = leafmtype;
1212       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1213     } else {
1214       ierr     = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
1215       srcmtype = leafmtype;
1216       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1217       dstmtype = rootmtype;
1218       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1219     }
1220     ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr);
1221     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
1222     if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);}
1223     if (direction == PETSCSF_ROOT2LEAF) {
1224       ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1225     } else {
1226       ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1227     }
1228   } else {
1229     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1230     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1231     ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr);
1232     if (ScatterAndOp) {
1233       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1234       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1235       if (direction == PETSCSF_ROOT2LEAF) {
1236         ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1237       } else {
1238         ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
1239       }
1240     } else {
1241       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1242       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1243       if (direction == PETSCSF_ROOT2LEAF) {
1244         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1245       } else {
1246         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1247       }
1248     }
1249   }
1250   PetscFunctionReturn(0);
1251 }
1252 
1253 /* Fetch rootdata to leafdata and leafupdate locally */
1254 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1255 {
1256   PetscErrorCode       ierr;
1257   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1258   PetscInt             count,rootstart,leafstart;
1259   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1260   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1261   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1262   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1263 
1264   PetscFunctionBegin;
1265   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1266   if (rootmtype != leafmtype) {
1267     /* The local communication has to go through pack and unpack */
1268     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1269   } else {
1270     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1271     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1272     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1273     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1274   }
1275   PetscFunctionReturn(0);
1276 }
1277 
1278 /*
1279   Create per-rank pack/unpack optimizations based on indice patterns
1280 
1281    Input Parameters:
1282   +  n       - Number of destination ranks
1283   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1284   -  idx     - [*]   Array storing indices
1285 
1286    Output Parameters:
1287   +  opt     - Pack optimizations. NULL if no optimizations.
1288 */
1289 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1290 {
1291   PetscErrorCode ierr;
1292   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1293   PetscBool      optimizable = PETSC_TRUE;
1294   PetscSFPackOpt opt;
1295 
1296   PetscFunctionBegin;
1297   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1298   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1299   opt->n      = opt->array[0] = n;
1300   opt->offset = opt->array + 1;
1301   opt->start  = opt->array + n   + 2;
1302   opt->dx     = opt->array + 2*n + 2;
1303   opt->dy     = opt->array + 3*n + 2;
1304   opt->dz     = opt->array + 4*n + 2;
1305   opt->X      = opt->array + 5*n + 2;
1306   opt->Y      = opt->array + 6*n + 2;
1307 
1308   for (r=0; r<n; r++) { /* For each destination rank */
1309     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1310     p     = offset[r];
1311     start = idx[p]; /* First index for this rank */
1312     p++;
1313 
1314     /* Search in X dimension */
1315     for (dx=1; dx<m; dx++,p++) {
1316       if (start+dx != idx[p]) break;
1317     }
1318 
1319     dydz = m/dx;
1320     X    = dydz > 1 ? (idx[p]-start) : dx;
1321     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1322     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1323     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1324       for (i=0; i<dx; i++,p++) {
1325         if (start+X*dy+i != idx[p]) {
1326           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1327           else goto Z_dimension;
1328         }
1329       }
1330     }
1331 
1332 Z_dimension:
1333     dz = m/(dx*dy);
1334     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1335     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1336     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1337     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1338       for (j=0; j<dy; j++) {
1339         for (i=0; i<dx; i++,p++) {
1340           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1341         }
1342       }
1343     }
1344     opt->start[r] = start;
1345     opt->dx[r]    = dx;
1346     opt->dy[r]    = dy;
1347     opt->dz[r]    = dz;
1348     opt->X[r]     = X;
1349     opt->Y[r]     = Y;
1350   }
1351 
1352 finish:
1353   /* If not optimizable, free arrays to save memory */
1354   if (!n || !optimizable) {
1355     ierr = PetscFree(opt->array);CHKERRQ(ierr);
1356     ierr = PetscFree(opt);CHKERRQ(ierr);
1357     *out = NULL;
1358   } else {
1359     opt->offset[0] = 0;
1360     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1361     *out = opt;
1362   }
1363   PetscFunctionReturn(0);
1364 }
1365 
1366 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1367 {
1368   PetscErrorCode ierr;
1369   PetscSFPackOpt opt = *out;
1370 
1371   PetscFunctionBegin;
1372   if (opt) {
1373     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
1374     ierr = PetscFree(opt);CHKERRQ(ierr);
1375     *out = NULL;
1376   }
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1381 {
1382   PetscErrorCode ierr;
1383   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1384   PetscInt       i,j;
1385 
1386   PetscFunctionBegin;
1387   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1388   for (i=0; i<2; i++) { /* Set defaults */
1389     sf->leafstart[i]   = 0;
1390     sf->leafcontig[i]  = PETSC_TRUE;
1391     sf->leafdups[i]    = PETSC_FALSE;
1392     bas->rootstart[i]  = 0;
1393     bas->rootcontig[i] = PETSC_TRUE;
1394     bas->rootdups[i]   = PETSC_FALSE;
1395   }
1396 
1397   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1398   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1399 
1400   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1401   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1402 
1403   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1404   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1405     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1406   }
1407   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1408     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1409   }
1410 
1411   /* If not, see if we can have per-rank optimizations by doing index analysis */
1412   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1413   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1414 
1415   /* Are root indices for self and remote contiguous? */
1416   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1417   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1418 
1419   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1420   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1421 
1422   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1423     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1424   }
1425   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1426     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1427   }
1428 
1429   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1430   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1431 
1432  #if defined(PETSC_HAVE_DEVICE)
1433     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1434   if (PetscDefined(HAVE_DEVICE)) {
1435     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1436     if (!sf->leafcontig[0]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1437     if (!sf->leafcontig[1]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1438     if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1439     if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1440   }
1441 #endif
1442   PetscFunctionReturn(0);
1443 }
1444 
1445 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1446 {
1447   PetscErrorCode ierr;
1448   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1449   PetscInt       i;
1450 
1451   PetscFunctionBegin;
1452   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1453     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1454     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1455    #if defined(PETSC_HAVE_DEVICE)
1456     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1457     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1458    #endif
1459   }
1460   PetscFunctionReturn(0);
1461 }
1462