xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 2fa40bb9206b96114faa7cb222621ec184d31cd2)
1 #include "petsc/private/sfimpl.h"
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
6 
7 /*
8  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
9  * therefore we pack data types manually. This file defines packing routines for the standard data types.
10  */
11 
12 #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
13 
14 /* Operations working like s += t */
15 #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
16 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
17 #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
18 #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
19 /* Ref MPI MAXLOC */
20 #define OP_XLOC(op,s,t) \
21   do {                                       \
22     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
23     else if (!((s).u op (t).u)) s = t;           \
24   } while (0)
25 
26 /* DEF_PackFunc - macro defining a Pack routine
27 
28    Arguments of the macro:
29    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30    .BS        Block size for vectorization. It is a factor of bsz.
31    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
32 
33    Arguments of the Pack routine:
34    +count     Number of indices in idx[].
35    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
36    .opt       Per-pack optimization plan. NULL means no such plan.
37    .idx       Indices of entries to packed.
38    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
40    -packed    Address of the packed data.
41  */
42 #define DEF_PackFunc(Type,BS,EQ) \
43   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
44   {                                                                                                          \
45     PetscErrorCode ierr;                                                                                     \
46     const Type     *u = (const Type*)unpacked,*u2;                                                           \
47     Type           *p = (Type*)packed,*p2;                                                                   \
48     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
49     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
50     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
51     PetscFunctionBegin;                                                                                      \
52     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
53     else if (opt) { /* has optimizations available */                                                        \
54       p2 = p;                                                                                                \
55       for (r=0; r<opt->n; r++) {                                                                             \
56         u2 = u + opt->start[r]*MBS;                                                                          \
57         X  = opt->X[r];                                                                                      \
58         Y  = opt->Y[r];                                                                                      \
59         for (k=0; k<opt->dz[r]; k++)                                                                         \
60           for (j=0; j<opt->dy[r]; j++) {                                                                     \
61             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
62             p2  += opt->dx[r]*MBS;                                                                           \
63           }                                                                                                  \
64       }                                                                                                      \
65     } else {                                                                                                 \
66       for (i=0; i<count; i++)                                                                                \
67         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
68           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
69             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
70     }                                                                                                        \
71     PetscFunctionReturn(0);                                                                                  \
72   }
73 
74 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
75                 and inserts into a sparse array.
76 
77    Arguments:
78   .Type       Type of the data
79   .BS         Block size for vectorization
80   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
81 
82   Notes:
83    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
84  */
85 #define DEF_UnpackFunc(Type,BS,EQ)               \
86   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
87   {                                                                                                          \
88     PetscErrorCode ierr;                                                                                     \
89     Type           *u = (Type*)unpacked,*u2;                                                                 \
90     const Type     *p = (const Type*)packed;                                                                 \
91     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
92     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
93     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
94     PetscFunctionBegin;                                                                                      \
95     if (!idx) {                                                                                              \
96       u += start*MBS;                                                                                        \
97       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
98     } else if (opt) { /* has optimizations available */                                                      \
99       for (r=0; r<opt->n; r++) {                                                                             \
100         u2 = u + opt->start[r]*MBS;                                                                          \
101         X  = opt->X[r];                                                                                      \
102         Y  = opt->Y[r];                                                                                      \
103         for (k=0; k<opt->dz[r]; k++)                                                                         \
104           for (j=0; j<opt->dy[r]; j++) {                                                                     \
105             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
106             p   += opt->dx[r]*MBS;                                                                           \
107           }                                                                                                  \
108       }                                                                                                      \
109     } else {                                                                                                 \
110       for (i=0; i<count; i++)                                                                                \
111         for (j=0; j<M; j++)                                                                                  \
112           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
113     }                                                                                                        \
114     PetscFunctionReturn(0);                                                                                  \
115   }
116 
117 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
118 
119    Arguments:
120   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
121   .Type       Type of the data
122   .BS         Block size for vectorization
123   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
124   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
125   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
126  */
127 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
128   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
129   {                                                                                                          \
130     Type           *u = (Type*)unpacked,*u2;                                                                 \
131     const Type     *p = (const Type*)packed;                                                                 \
132     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
133     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
134     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
135     PetscFunctionBegin;                                                                                      \
136     if (!idx) {                                                                                              \
137       u += start*MBS;                                                                                        \
138       for (i=0; i<count; i++)                                                                                \
139         for (j=0; j<M; j++)                                                                                  \
140           for (k=0; k<BS; k++)                                                                               \
141             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
142     } else if (opt) { /* idx[] has patterns */                                                               \
143       for (r=0; r<opt->n; r++) {                                                                             \
144         u2 = u + opt->start[r]*MBS;                                                                          \
145         X  = opt->X[r];                                                                                      \
146         Y  = opt->Y[r];                                                                                      \
147         for (k=0; k<opt->dz[r]; k++)                                                                         \
148           for (j=0; j<opt->dy[r]; j++) {                                                                     \
149             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
150             p += opt->dx[r]*MBS;                                                                             \
151           }                                                                                                  \
152       }                                                                                                      \
153     } else {                                                                                                 \
154       for (i=0; i<count; i++)                                                                                \
155         for (j=0; j<M; j++)                                                                                  \
156           for (k=0; k<BS; k++)                                                                               \
157             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
158     }                                                                                                        \
159     PetscFunctionReturn(0);                                                                                  \
160   }
161 
162 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
163   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
164   {                                                                                                          \
165     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
166     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
167     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
168     const PetscInt MBS = M*BS;                                                                               \
169     PetscFunctionBegin;                                                                                      \
170     for (i=0; i<count; i++) {                                                                                \
171       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
172       l = i*MBS;                                                                                             \
173       for (j=0; j<M; j++)                                                                                    \
174         for (k=0; k<BS; k++) {                                                                               \
175           tmp = u[r+j*BS+k];                                                                                 \
176           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
177           p[l+j*BS+k] = tmp;                                                                                 \
178         }                                                                                                    \
179     }                                                                                                        \
180     PetscFunctionReturn(0);                                                                                  \
181   }
182 
183 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
184   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
185   {                                                                                                          \
186     PetscErrorCode ierr;                                                                                     \
187     const Type     *u = (const Type*)src;                                                                    \
188     Type           *v = (Type*)dst;                                                                          \
189     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
190     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
191     const PetscInt MBS = M*BS;                                                                               \
192     PetscFunctionBegin;                                                                                      \
193     if (!srcIdx) { /* src is contiguous */                                                                   \
194       u += srcStart*MBS;                                                                                     \
195       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
196     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
197       u += srcOpt->start[0]*MBS;                                                                             \
198       v += dstStart*MBS;                                                                                     \
199       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
200       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
201         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
202           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
203           v += srcOpt->dx[0]*MBS;                                                                            \
204         }                                                                                                    \
205     } else { /* all other cases */                                                                           \
206       for (i=0; i<count; i++) {                                                                              \
207         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
208         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
209         for (j=0; j<M; j++)                                                                                  \
210           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
211       }                                                                                                      \
212     }                                                                                                        \
213     PetscFunctionReturn(0);                                                                                  \
214   }
215 
216 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
217   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
218   {                                                                                                          \
219     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
220     const Type     *ldata = (const Type*)leafdata;                                                           \
221     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
222     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
223     const PetscInt MBS = M*BS;                                                                               \
224     PetscFunctionBegin;                                                                                      \
225     for (i=0; i<count; i++) {                                                                                \
226       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
227       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
228       for (j=0; j<M; j++)                                                                                    \
229         for (k=0; k<BS; k++) {                                                                               \
230           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
231           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
232         }                                                                                                    \
233     }                                                                                                        \
234     PetscFunctionReturn(0);                                                                                  \
235   }
236 
237 /* Pack, Unpack/Fetch ops */
238 #define DEF_Pack(Type,BS,EQ)                                                      \
239   DEF_PackFunc(Type,BS,EQ)                                                        \
240   DEF_UnpackFunc(Type,BS,EQ)                                                      \
241   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
242   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
243     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
244     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
245     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
246   }
247 
248 /* Add, Mult ops */
249 #define DEF_Add(Type,BS,EQ)                                                       \
250   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
251   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
252   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
253   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
254   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
255   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
256   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
257     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
258     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
259     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
260     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
261     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
262     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
263   }
264 
265 /* Max, Min ops */
266 #define DEF_Cmp(Type,BS,EQ)                                                       \
267   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
268   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
269   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
270   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
271   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
272     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
273     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
274     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
275     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
276   }
277 
278 /* Logical ops.
279   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
280   the compilation warning "empty macro arguments are undefined in ISO C90"
281  */
282 #define DEF_Log(Type,BS,EQ)                                                       \
283   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
284   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
285   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
286   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
287   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
288   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
289   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
290     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
291     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
292     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
293     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
294     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
295     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
296   }
297 
298 /* Bitwise ops */
299 #define DEF_Bit(Type,BS,EQ)                                                       \
300   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
301   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
302   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
303   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
304   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
305   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
306   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
307     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
308     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
309     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
310     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
311     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
312     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
313   }
314 
315 /* Maxloc, Minloc ops */
316 #define DEF_Xloc(Type,BS,EQ)                                                      \
317   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
318   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
319   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
320   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
321   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
322     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
323     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
324     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
325     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
326   }
327 
328 #define DEF_IntegerType(Type,BS,EQ)                                               \
329   DEF_Pack(Type,BS,EQ)                                                            \
330   DEF_Add(Type,BS,EQ)                                                             \
331   DEF_Cmp(Type,BS,EQ)                                                             \
332   DEF_Log(Type,BS,EQ)                                                             \
333   DEF_Bit(Type,BS,EQ)                                                             \
334   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
335     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
336     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
337     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
338     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
339     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
340   }
341 
342 #define DEF_RealType(Type,BS,EQ)                                                  \
343   DEF_Pack(Type,BS,EQ)                                                            \
344   DEF_Add(Type,BS,EQ)                                                             \
345   DEF_Cmp(Type,BS,EQ)                                                             \
346   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
347     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
348     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
349     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
350   }
351 
352 #if defined(PETSC_HAVE_COMPLEX)
353 #define DEF_ComplexType(Type,BS,EQ)                                               \
354   DEF_Pack(Type,BS,EQ)                                                            \
355   DEF_Add(Type,BS,EQ)                                                             \
356   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
357     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
358     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
359   }
360 #endif
361 
362 #define DEF_DumbType(Type,BS,EQ)                                                  \
363   DEF_Pack(Type,BS,EQ)                                                            \
364   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
365     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
366   }
367 
368 /* Maxloc, Minloc */
369 #define DEF_PairType(Type,BS,EQ)                                                  \
370   DEF_Pack(Type,BS,EQ)                                                            \
371   DEF_Xloc(Type,BS,EQ)                                                            \
372   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
373     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
374     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
375   }
376 
377 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
378 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
379 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
380 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
381 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
382 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
383 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
384 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
385 
386 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
387 DEF_IntegerType(int,1,1)
388 DEF_IntegerType(int,2,1)
389 DEF_IntegerType(int,4,1)
390 DEF_IntegerType(int,8,1)
391 DEF_IntegerType(int,1,0)
392 DEF_IntegerType(int,2,0)
393 DEF_IntegerType(int,4,0)
394 DEF_IntegerType(int,8,0)
395 #endif
396 
397 /* The typedefs are used to get a typename without space that CPPJoin can handle */
398 typedef signed char SignedChar;
399 DEF_IntegerType(SignedChar,1,1)
400 DEF_IntegerType(SignedChar,2,1)
401 DEF_IntegerType(SignedChar,4,1)
402 DEF_IntegerType(SignedChar,8,1)
403 DEF_IntegerType(SignedChar,1,0)
404 DEF_IntegerType(SignedChar,2,0)
405 DEF_IntegerType(SignedChar,4,0)
406 DEF_IntegerType(SignedChar,8,0)
407 
408 typedef unsigned char UnsignedChar;
409 DEF_IntegerType(UnsignedChar,1,1)
410 DEF_IntegerType(UnsignedChar,2,1)
411 DEF_IntegerType(UnsignedChar,4,1)
412 DEF_IntegerType(UnsignedChar,8,1)
413 DEF_IntegerType(UnsignedChar,1,0)
414 DEF_IntegerType(UnsignedChar,2,0)
415 DEF_IntegerType(UnsignedChar,4,0)
416 DEF_IntegerType(UnsignedChar,8,0)
417 
418 DEF_RealType(PetscReal,1,1)
419 DEF_RealType(PetscReal,2,1)
420 DEF_RealType(PetscReal,4,1)
421 DEF_RealType(PetscReal,8,1)
422 DEF_RealType(PetscReal,1,0)
423 DEF_RealType(PetscReal,2,0)
424 DEF_RealType(PetscReal,4,0)
425 DEF_RealType(PetscReal,8,0)
426 
427 #if defined(PETSC_HAVE_COMPLEX)
428 DEF_ComplexType(PetscComplex,1,1)
429 DEF_ComplexType(PetscComplex,2,1)
430 DEF_ComplexType(PetscComplex,4,1)
431 DEF_ComplexType(PetscComplex,8,1)
432 DEF_ComplexType(PetscComplex,1,0)
433 DEF_ComplexType(PetscComplex,2,0)
434 DEF_ComplexType(PetscComplex,4,0)
435 DEF_ComplexType(PetscComplex,8,0)
436 #endif
437 
438 #define PairType(Type1,Type2) Type1##_##Type2
439 typedef struct {int u; int i;}           PairType(int,int);
440 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
441 DEF_PairType(PairType(int,int),1,1)
442 DEF_PairType(PairType(PetscInt,PetscInt),1,1)
443 
444 /* If we don't know the basic type, we treat it as a stream of chars or ints */
445 DEF_DumbType(char,1,1)
446 DEF_DumbType(char,2,1)
447 DEF_DumbType(char,4,1)
448 DEF_DumbType(char,1,0)
449 DEF_DumbType(char,2,0)
450 DEF_DumbType(char,4,0)
451 
452 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
453 DEF_DumbType(DumbInt,1,1)
454 DEF_DumbType(DumbInt,2,1)
455 DEF_DumbType(DumbInt,4,1)
456 DEF_DumbType(DumbInt,8,1)
457 DEF_DumbType(DumbInt,1,0)
458 DEF_DumbType(DumbInt,2,0)
459 DEF_DumbType(DumbInt,4,0)
460 DEF_DumbType(DumbInt,8,0)
461 
462 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
463 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
464 {
465   int ierr;
466   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
467   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
468   return MPI_SUCCESS;
469 }
470 #endif
471 
472 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
473 {
474   PetscErrorCode    ierr;
475   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
476   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
477 
478   PetscFunctionBegin;
479   /* Destroy device-specific fields */
480   if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);}
481 
482   /* Destroy host related fields */
483   if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
484   if (!link->use_nvshmem) {
485     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
486       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
487     }
488     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
489     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
490       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
491       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
492     }
493   }
494   ierr = PetscFree(link);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
499 {
500   PetscErrorCode    ierr;
501 
502   PetscFunctionBegin;
503   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
504  #if defined(PETSC_HAVE_NVSHMEM)
505   {
506     PetscBool use_nvshmem;
507     ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr);
508     if (use_nvshmem) {
509       ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
510       PetscFunctionReturn(0);
511     }
512   }
513  #endif
514   ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
515   PetscFunctionReturn(0);
516 }
517 
518 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
519    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
520 */
521 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
522 {
523   PetscErrorCode       ierr;
524   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
525   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
526   const PetscInt       *rootoffset,*leafoffset;
527   PetscMPIInt          n;
528   MPI_Aint             disp;
529   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
530   MPI_Datatype         unit = link->unit;
531   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
532   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
533 
534   PetscFunctionBegin;
535   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
536   if (sf->persistent) {
537     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
538       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
539       if (direction == PETSCSF_LEAF2ROOT) {
540         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
541           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
542           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
543           ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
544         }
545       } else { /* PETSCSF_ROOT2LEAF */
546         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
547           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
548           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
549           ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
550         }
551       }
552       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
553     }
554 
555     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
556       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
557       if (direction == PETSCSF_LEAF2ROOT) {
558         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
559           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
560           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
561           ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
562         }
563       } else { /* PETSCSF_ROOT2LEAF */
564         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
565           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
566           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
567           ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
568         }
569       }
570       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
571     }
572   }
573   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
574   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
575   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
576   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
577   PetscFunctionReturn(0);
578 }
579 
580 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
581 {
582   PetscErrorCode    ierr;
583   PetscSFLink       link,*p;
584   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
585 
586   PetscFunctionBegin;
587   /* Look for types in cache */
588   for (p=&bas->inuse; (link=*p); p=&link->next) {
589     PetscBool match;
590     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
591     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
592       switch (cmode) {
593       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
594       case PETSC_USE_POINTER: break;
595       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
596       }
597       *mylink = link;
598       PetscFunctionReturn(0);
599     }
600   }
601   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
602 }
603 
604 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
605 {
606   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
607   PetscSFLink       link = *mylink;
608 
609   PetscFunctionBegin;
610   link->rootdata = NULL;
611   link->leafdata = NULL;
612   link->next     = bas->avail;
613   bas->avail     = link;
614   *mylink        = NULL;
615   PetscFunctionReturn(0);
616 }
617 
618 /* Error out on unsupported overlapped communications */
619 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
620 {
621   PetscErrorCode    ierr;
622   PetscSFLink       link,*p;
623   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
624   PetscBool         match;
625 
626   PetscFunctionBegin;
627   if (PetscDefined(USE_DEBUG)) {
628     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
629        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
630     */
631     if (rootdata || leafdata) {
632       for (p=&bas->inuse; (link=*p); p=&link->next) {
633         ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
634         if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
635       }
636     }
637   }
638   PetscFunctionReturn(0);
639 }
640 
641 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
642 {
643   PetscFunctionBegin;
644   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
645   PetscFunctionReturn(0);
646 }
647 
648 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
649 {
650   PetscErrorCode ierr;
651   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
652   PetscBool      is2Int,is2PetscInt;
653   PetscMPIInt    ni,na,nd,combiner;
654 #if defined(PETSC_HAVE_COMPLEX)
655   PetscInt       nPetscComplex=0;
656 #endif
657 
658   PetscFunctionBegin;
659   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
660   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
661   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
662   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
663   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
664   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
665 #if defined(PETSC_HAVE_COMPLEX)
666   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
667 #endif
668   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
669   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
670   /* TODO: shaell we also handle Fortran MPI_2REAL? */
671   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
672   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
673   link->bs = 1; /* default */
674 
675   if (is2Int) {
676     PackInit_PairType_int_int_1_1(link);
677     link->bs        = 1;
678     link->unitbytes = 2*sizeof(int);
679     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
680     link->basicunit = MPI_2INT;
681     link->unit      = MPI_2INT;
682   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
683     PackInit_PairType_PetscInt_PetscInt_1_1(link);
684     link->bs        = 1;
685     link->unitbytes = 2*sizeof(PetscInt);
686     link->basicunit = MPIU_2INT;
687     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
688     link->unit      = MPIU_2INT;
689   } else if (nPetscReal) {
690     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
691     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
692     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
693     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
694     link->bs        = nPetscReal;
695     link->unitbytes = nPetscReal*sizeof(PetscReal);
696     link->basicunit = MPIU_REAL;
697     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
698   } else if (nPetscInt) {
699     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
700     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
701     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
702     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
703     link->bs        = nPetscInt;
704     link->unitbytes = nPetscInt*sizeof(PetscInt);
705     link->basicunit = MPIU_INT;
706     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
707 #if defined(PETSC_USE_64BIT_INDICES)
708   } else if (nInt) {
709     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
710     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
711     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
712     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
713     link->bs        = nInt;
714     link->unitbytes = nInt*sizeof(int);
715     link->basicunit = MPI_INT;
716     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
717 #endif
718   } else if (nSignedChar) {
719     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
720     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
721     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
722     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
723     link->bs        = nSignedChar;
724     link->unitbytes = nSignedChar*sizeof(SignedChar);
725     link->basicunit = MPI_SIGNED_CHAR;
726     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
727   }  else if (nUnsignedChar) {
728     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
729     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
730     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
731     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
732     link->bs        = nUnsignedChar;
733     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
734     link->basicunit = MPI_UNSIGNED_CHAR;
735     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
736 #if defined(PETSC_HAVE_COMPLEX)
737   } else if (nPetscComplex) {
738     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
739     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
740     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
741     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
742     link->bs        = nPetscComplex;
743     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
744     link->basicunit = MPIU_COMPLEX;
745     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
746 #endif
747   } else {
748     MPI_Aint lb,nbyte;
749     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
750     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
751     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
752       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
753       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
754       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
755       link->bs        = nbyte;
756       link->unitbytes = nbyte;
757       link->basicunit = MPI_BYTE;
758     } else {
759       nInt = nbyte / sizeof(int);
760       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
761       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
762       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
763       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
764       link->bs        = nInt;
765       link->unitbytes = nbyte;
766       link->basicunit = MPI_INT;
767     }
768     if (link->isbuiltin) link->unit = unit;
769   }
770 
771   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}
772 
773   link->Memcpy = PetscSFLinkMemcpy_Host;
774   PetscFunctionReturn(0);
775 }
776 
777 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
778 {
779   PetscFunctionBegin;
780   *UnpackAndOp = NULL;
781   if (PetscMemTypeHost(mtype)) {
782     if      (op == MPI_REPLACE)               *UnpackAndOp = link->h_UnpackAndInsert;
783     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
784     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
785     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
786     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
787     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
788     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
789     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
790     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
791     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
792     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
793     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
794     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
795   }
796 #if defined(PETSC_HAVE_DEVICE)
797   else if (PetscMemTypeDevice(mtype) && !atomic) {
798     if      (op == MPI_REPLACE)               *UnpackAndOp = link->d_UnpackAndInsert;
799     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
800     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
801     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
802     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
803     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
804     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
805     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
806     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
807     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
808     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
809     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
810     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
811   } else if (PetscMemTypeDevice(mtype) && atomic) {
812     if      (op == MPI_REPLACE)               *UnpackAndOp = link->da_UnpackAndInsert;
813     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
814     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
815     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
816     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
817     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
818     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
819     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
820     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
821     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
822     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
823     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
824     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
825   }
826 #endif
827   PetscFunctionReturn(0);
828 }
829 
830 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
831 {
832   PetscFunctionBegin;
833   *ScatterAndOp = NULL;
834   if (PetscMemTypeHost(mtype)) {
835     if      (op == MPI_REPLACE)               *ScatterAndOp = link->h_ScatterAndInsert;
836     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
837     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
838     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
839     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
840     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
841     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
842     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
843     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
844     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
845     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
846     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
847     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
848   }
849 #if defined(PETSC_HAVE_DEVICE)
850   else if (PetscMemTypeDevice(mtype) && !atomic) {
851     if      (op == MPI_REPLACE)               *ScatterAndOp = link->d_ScatterAndInsert;
852     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
853     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
854     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
855     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
856     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
857     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
858     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
859     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
860     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
861     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
862     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
863     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
864   } else if (PetscMemTypeDevice(mtype) && atomic) {
865     if      (op == MPI_REPLACE)               *ScatterAndOp = link->da_ScatterAndInsert;
866     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
867     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
868     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
869     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
870     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
871     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
872     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
873     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
874     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
875     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
876     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
877     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
878   }
879 #endif
880   PetscFunctionReturn(0);
881 }
882 
883 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
884 {
885   PetscFunctionBegin;
886   *FetchAndOp = NULL;
887   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
888   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
889 #if defined(PETSC_HAVE_DEVICE)
890   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
891   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOp = link->da_FetchAndAdd;
892 #endif
893   PetscFunctionReturn(0);
894 }
895 
896 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
897 {
898   PetscFunctionBegin;
899   *FetchAndOpLocal = NULL;
900   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
901   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
902 #if defined(PETSC_HAVE_DEVICE)
903   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
904   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
905 #endif
906   PetscFunctionReturn(0);
907 }
908 
909 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
910 {
911   PetscErrorCode ierr;
912   PetscLogDouble flops;
913   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
914 
915   PetscFunctionBegin;
916   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
917     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
918 #if defined(PETSC_HAVE_DEVICE)
919     if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
920 #endif
921     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
922   }
923   PetscFunctionReturn(0);
924 }
925 
926 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
927 {
928   PetscLogDouble flops;
929   PetscErrorCode ierr;
930 
931   PetscFunctionBegin;
932   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
933     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
934 #if defined(PETSC_HAVE_DEVICE)
935     if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
936 #endif
937     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
938   }
939   PetscFunctionReturn(0);
940 }
941 
942 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
943   Input Parameters:
944   +sf      - The StarForest
945   .link    - The link
946   .count   - Number of entries to unpack
947   .start   - The first index, significent when indices=NULL
948   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
949   .buf     - A contiguous buffer to unpack from
950   -op      - Operation after unpack
951 
952   Output Parameters:
953   .data    - The data to unpack to
954 */
955 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
956 {
957   PetscFunctionBegin;
958 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
959   {
960     PetscErrorCode ierr;
961     PetscInt       i;
962     PetscMPIInt    n;
963     if (indices) {
964       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
965          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
966       */
967       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
968     } else {
969       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
970       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr);
971     }
972   }
973   PetscFunctionReturn(0);
974 #else
975   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
976 #endif
977 }
978 
979 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
980 {
981   PetscFunctionBegin;
982 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
983   {
984     PetscErrorCode ierr;
985     PetscInt       i,disp;
986     if (!srcIdx) {
987       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
988     } else {
989       for (i=0; i<count; i++) {
990         disp = dstIdx? dstIdx[i] : dstStart + i;
991         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
992       }
993     }
994   }
995   PetscFunctionReturn(0);
996 #else
997   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
998 #endif
999 }
1000 
1001 /*=============================================================================
1002               Pack/Unpack/Fetch/Scatter routines
1003  ============================================================================*/
1004 
1005 /* Pack rootdata to rootbuf
1006   Input Parameters:
1007   + sf       - The SF this packing works on.
1008   . link     - It gives the memtype of the roots and also provides root buffer.
1009   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1010   - rootdata - Where to read the roots.
1011 
1012   Notes:
1013   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1014   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
1015  */
1016 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1017 {
1018   PetscErrorCode   ierr;
1019   const PetscInt   *rootindices = NULL;
1020   PetscInt         count,start;
1021   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1022   PetscMemType     rootmtype = link->rootmtype;
1023   PetscSFPackOpt   opt = NULL;
1024 
1025   PetscFunctionBegin;
1026   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1027   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1028     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1029     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1030     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1031   }
1032   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1033   PetscFunctionReturn(0);
1034 }
1035 
1036 /* Pack leafdata to leafbuf */
1037 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1038 {
1039   PetscErrorCode   ierr;
1040   const PetscInt   *leafindices = NULL;
1041   PetscInt         count,start;
1042   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1043   PetscMemType     leafmtype = link->leafmtype;
1044   PetscSFPackOpt   opt = NULL;
1045 
1046   PetscFunctionBegin;
1047   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1048   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1049     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1050     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1051     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1052   }
1053   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1054   PetscFunctionReturn(0);
1055 }
1056 
1057 /* Pack rootdata to rootbuf, which are in the same memory space */
1058 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1059 {
1060   PetscErrorCode   ierr;
1061   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1062 
1063   PetscFunctionBegin;
1064   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
1065     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1066     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */
1067   }
1068   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1069   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);}
1070   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1071   PetscFunctionReturn(0);
1072 }
1073 /* Pack leafdata to leafbuf, which are in the same memory space */
1074 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1075 {
1076   PetscErrorCode   ierr;
1077 
1078   PetscFunctionBegin;
1079   if (scope == PETSCSF_REMOTE) {
1080     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1081     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1082   }
1083   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1084   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);}
1085   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1086   PetscFunctionReturn(0);
1087 }
1088 
1089 PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1090 {
1091   PetscErrorCode   ierr;
1092   const PetscInt   *rootindices = NULL;
1093   PetscInt         count,start;
1094   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1095   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1096   PetscMemType     rootmtype = link->rootmtype;
1097   PetscSFPackOpt   opt = NULL;
1098 
1099   PetscFunctionBegin;
1100   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1101     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1102     if (UnpackAndOp) {
1103       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1104       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1105     } else {
1106       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1107       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1108     }
1109   }
1110   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1111   PetscFunctionReturn(0);
1112 }
1113 
1114 PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1115 {
1116   PetscErrorCode   ierr;
1117   const PetscInt   *leafindices = NULL;
1118   PetscInt         count,start;
1119   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1120   PetscMemType     leafmtype = link->leafmtype;
1121   PetscSFPackOpt   opt = NULL;
1122 
1123   PetscFunctionBegin;
1124   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1125     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1126     if (UnpackAndOp) {
1127       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1128       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1129     } else {
1130       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1131       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1132     }
1133   }
1134   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1135   PetscFunctionReturn(0);
1136 }
1137 /* Unpack rootbuf to rootdata, which are in the same memory space */
1138 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1139 {
1140   PetscErrorCode   ierr;
1141   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1142 
1143   PetscFunctionBegin;
1144   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1145   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);}
1146   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1147   if (scope == PETSCSF_REMOTE) {
1148     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1149     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1150   }
1151   PetscFunctionReturn(0);
1152 }
1153 
1154 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1155 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1156 {
1157   PetscErrorCode   ierr;
1158 
1159   PetscFunctionBegin;
1160   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1161   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);CHKERRQ(ierr);}
1162   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1163   if (scope == PETSCSF_REMOTE) {
1164     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1165     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1166   }
1167   PetscFunctionReturn(0);
1168 }
1169 
1170 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
1171 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1172 {
1173   PetscErrorCode     ierr;
1174   const PetscInt     *rootindices = NULL;
1175   PetscInt           count,start;
1176   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1177   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1178   PetscMemType       rootmtype = link->rootmtype;
1179   PetscSFPackOpt     opt = NULL;
1180 
1181   PetscFunctionBegin;
1182   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1183   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1184     /* Do FetchAndOp on rootdata with rootbuf */
1185     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr);
1186     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1187     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr);
1188   }
1189   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr);
1190   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1191   PetscFunctionReturn(0);
1192 }
1193 
1194 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1195 {
1196   PetscErrorCode       ierr;
1197   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1198   PetscInt             count,rootstart,leafstart;
1199   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1200   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1201   PetscMemType         rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1202   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1203   PetscInt             buflen = sf->leafbuflen[PETSCSF_LOCAL];
1204   char                 *srcbuf = NULL,*dstbuf = NULL;
1205   PetscBool            dstdups;
1206 
1207   PetscFunctionBegin;
1208   if (!buflen) PetscFunctionReturn(0);
1209   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
1210     if (direction == PETSCSF_ROOT2LEAF) {
1211       ierr     = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
1212       srcmtype = rootmtype;
1213       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1214       dstmtype = leafmtype;
1215       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1216     } else {
1217       ierr     = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
1218       srcmtype = leafmtype;
1219       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1220       dstmtype = rootmtype;
1221       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1222     }
1223     ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr);
1224     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
1225     if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);}
1226     if (direction == PETSCSF_ROOT2LEAF) {
1227       ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1228     } else {
1229       ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1230     }
1231   } else {
1232     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1233     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1234     ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr);
1235     if (ScatterAndOp) {
1236       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1237       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1238       if (direction == PETSCSF_ROOT2LEAF) {
1239         ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1240       } else {
1241         ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
1242       }
1243     } else {
1244       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1245       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1246       if (direction == PETSCSF_ROOT2LEAF) {
1247         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1248       } else {
1249         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1250       }
1251     }
1252   }
1253   PetscFunctionReturn(0);
1254 }
1255 
1256 /* Fetch rootdata to leafdata and leafupdate locally */
1257 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1258 {
1259   PetscErrorCode       ierr;
1260   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1261   PetscInt             count,rootstart,leafstart;
1262   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1263   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1264   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1265   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1266 
1267   PetscFunctionBegin;
1268   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1269   if (rootmtype != leafmtype) {
1270     /* The local communication has to go through pack and unpack */
1271     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1272   } else {
1273     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1274     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1275     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1276     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1277   }
1278   PetscFunctionReturn(0);
1279 }
1280 
1281 /*
1282   Create per-rank pack/unpack optimizations based on indice patterns
1283 
1284    Input Parameters:
1285   +  n       - Number of destination ranks
1286   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1287   -  idx     - [*]   Array storing indices
1288 
1289    Output Parameters:
1290   +  opt     - Pack optimizations. NULL if no optimizations.
1291 */
1292 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1293 {
1294   PetscErrorCode ierr;
1295   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1296   PetscBool      optimizable = PETSC_TRUE;
1297   PetscSFPackOpt opt;
1298 
1299   PetscFunctionBegin;
1300   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1301   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1302   opt->n      = opt->array[0] = n;
1303   opt->offset = opt->array + 1;
1304   opt->start  = opt->array + n   + 2;
1305   opt->dx     = opt->array + 2*n + 2;
1306   opt->dy     = opt->array + 3*n + 2;
1307   opt->dz     = opt->array + 4*n + 2;
1308   opt->X      = opt->array + 5*n + 2;
1309   opt->Y      = opt->array + 6*n + 2;
1310 
1311   for (r=0; r<n; r++) { /* For each destination rank */
1312     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1313     p     = offset[r];
1314     start = idx[p]; /* First index for this rank */
1315     p++;
1316 
1317     /* Search in X dimension */
1318     for (dx=1; dx<m; dx++,p++) {
1319       if (start+dx != idx[p]) break;
1320     }
1321 
1322     dydz = m/dx;
1323     X    = dydz > 1 ? (idx[p]-start) : dx;
1324     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1325     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1326     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1327       for (i=0; i<dx; i++,p++) {
1328         if (start+X*dy+i != idx[p]) {
1329           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1330           else goto Z_dimension;
1331         }
1332       }
1333     }
1334 
1335 Z_dimension:
1336     dz = m/(dx*dy);
1337     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1338     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1339     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1340     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1341       for (j=0; j<dy; j++) {
1342         for (i=0; i<dx; i++,p++) {
1343           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1344         }
1345       }
1346     }
1347     opt->start[r] = start;
1348     opt->dx[r]    = dx;
1349     opt->dy[r]    = dy;
1350     opt->dz[r]    = dz;
1351     opt->X[r]     = X;
1352     opt->Y[r]     = Y;
1353   }
1354 
1355 finish:
1356   /* If not optimizable, free arrays to save memory */
1357   if (!n || !optimizable) {
1358     ierr = PetscFree(opt->array);CHKERRQ(ierr);
1359     ierr = PetscFree(opt);CHKERRQ(ierr);
1360     *out = NULL;
1361   } else {
1362     opt->offset[0] = 0;
1363     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1364     *out = opt;
1365   }
1366   PetscFunctionReturn(0);
1367 }
1368 
1369 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1370 {
1371   PetscErrorCode ierr;
1372   PetscSFPackOpt opt = *out;
1373 
1374   PetscFunctionBegin;
1375   if (opt) {
1376     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
1377     ierr = PetscFree(opt);CHKERRQ(ierr);
1378     *out = NULL;
1379   }
1380   PetscFunctionReturn(0);
1381 }
1382 
1383 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1384 {
1385   PetscErrorCode ierr;
1386   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1387   PetscInt       i,j;
1388 
1389   PetscFunctionBegin;
1390   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1391   for (i=0; i<2; i++) { /* Set defaults */
1392     sf->leafstart[i]   = 0;
1393     sf->leafcontig[i]  = PETSC_TRUE;
1394     sf->leafdups[i]    = PETSC_FALSE;
1395     bas->rootstart[i]  = 0;
1396     bas->rootcontig[i] = PETSC_TRUE;
1397     bas->rootdups[i]   = PETSC_FALSE;
1398   }
1399 
1400   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1401   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1402 
1403   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1404   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1405 
1406   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1407   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1408     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1409   }
1410   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1411     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1412   }
1413 
1414   /* If not, see if we can have per-rank optimizations by doing index analysis */
1415   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1416   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1417 
1418   /* Are root indices for self and remote contiguous? */
1419   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1420   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1421 
1422   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1423   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1424 
1425   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1426     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1427   }
1428   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1429     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1430   }
1431 
1432   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1433   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1434 
1435  #if defined(PETSC_HAVE_DEVICE)
1436     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1437   if (PetscDefined(HAVE_DEVICE)) {
1438     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1439     if (!sf->leafcontig[0]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1440     if (!sf->leafcontig[1]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1441     if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1442     if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1443   }
1444 #endif
1445   PetscFunctionReturn(0);
1446 }
1447 
1448 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1449 {
1450   PetscErrorCode ierr;
1451   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1452   PetscInt       i;
1453 
1454   PetscFunctionBegin;
1455   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1456     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1457     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1458    #if defined(PETSC_HAVE_DEVICE)
1459     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1460     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1461    #endif
1462   }
1463   PetscFunctionReturn(0);
1464 }
1465