xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision ae8b063e4f7e8ac6d2de6909aa6d539b126b38d9)
1 #include "petsc/private/sfimpl.h"
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
6 
7 /*
8  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
9  * therefore we pack data types manually. This file defines packing routines for the standard data types.
10  */
11 
12 #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
13 
14 /* Operations working like s += t */
15 #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
16 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
17 #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
18 #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
19 /* Ref MPI MAXLOC */
20 #define OP_XLOC(op,s,t) \
21   do {                                       \
22     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
23     else if (!((s).u op (t).u)) s = t;           \
24   } while (0)
25 
26 /* DEF_PackFunc - macro defining a Pack routine
27 
28    Arguments of the macro:
29    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30    .BS        Block size for vectorization. It is a factor of bsz.
31    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
32 
33    Arguments of the Pack routine:
34    +count     Number of indices in idx[].
35    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
36    .opt       Per-pack optimization plan. NULL means no such plan.
37    .idx       Indices of entries to packed.
38    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
40    -packed    Address of the packed data.
41  */
42 #define DEF_PackFunc(Type,BS,EQ) \
43   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
44   {                                                                                                          \
45     PetscErrorCode ierr;                                                                                     \
46     const Type     *u = (const Type*)unpacked,*u2;                                                           \
47     Type           *p = (Type*)packed,*p2;                                                                   \
48     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
49     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
50     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
51     PetscFunctionBegin;                                                                                      \
52     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
53     else if (opt) { /* has optimizations available */                                                        \
54       p2 = p;                                                                                                \
55       for (r=0; r<opt->n; r++) {                                                                             \
56         u2 = u + opt->start[r]*MBS;                                                                          \
57         X  = opt->X[r];                                                                                      \
58         Y  = opt->Y[r];                                                                                      \
59         for (k=0; k<opt->dz[r]; k++)                                                                         \
60           for (j=0; j<opt->dy[r]; j++) {                                                                     \
61             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
62             p2  += opt->dx[r]*MBS;                                                                           \
63           }                                                                                                  \
64       }                                                                                                      \
65     } else {                                                                                                 \
66       for (i=0; i<count; i++)                                                                                \
67         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
68           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
69             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
70     }                                                                                                        \
71     PetscFunctionReturn(0);                                                                                  \
72   }
73 
74 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
75                 and inserts into a sparse array.
76 
77    Arguments:
78   .Type       Type of the data
79   .BS         Block size for vectorization
80   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
81 
82   Notes:
83    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
84  */
85 #define DEF_UnpackFunc(Type,BS,EQ)               \
86   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
87   {                                                                                                          \
88     PetscErrorCode ierr;                                                                                     \
89     Type           *u = (Type*)unpacked,*u2;                                                                 \
90     const Type     *p = (const Type*)packed;                                                                 \
91     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
92     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
93     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
94     PetscFunctionBegin;                                                                                      \
95     if (!idx) {                                                                                              \
96       u += start*MBS;                                                                                        \
97       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
98     } else if (opt) { /* has optimizations available */                                                      \
99       for (r=0; r<opt->n; r++) {                                                                             \
100         u2 = u + opt->start[r]*MBS;                                                                          \
101         X  = opt->X[r];                                                                                      \
102         Y  = opt->Y[r];                                                                                      \
103         for (k=0; k<opt->dz[r]; k++)                                                                         \
104           for (j=0; j<opt->dy[r]; j++) {                                                                     \
105             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
106             p   += opt->dx[r]*MBS;                                                                           \
107           }                                                                                                  \
108       }                                                                                                      \
109     } else {                                                                                                 \
110       for (i=0; i<count; i++)                                                                                \
111         for (j=0; j<M; j++)                                                                                  \
112           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
113     }                                                                                                        \
114     PetscFunctionReturn(0);                                                                                  \
115   }
116 
117 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
118 
119    Arguments:
120   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
121   .Type       Type of the data
122   .BS         Block size for vectorization
123   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
124   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
125   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
126  */
127 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
128   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
129   {                                                                                                          \
130     Type           *u = (Type*)unpacked,*u2;                                                                 \
131     const Type     *p = (const Type*)packed;                                                                 \
132     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
133     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
134     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
135     PetscFunctionBegin;                                                                                      \
136     if (!idx) {                                                                                              \
137       u += start*MBS;                                                                                        \
138       for (i=0; i<count; i++)                                                                                \
139         for (j=0; j<M; j++)                                                                                  \
140           for (k=0; k<BS; k++)                                                                               \
141             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
142     } else if (opt) { /* idx[] has patterns */                                                               \
143       for (r=0; r<opt->n; r++) {                                                                             \
144         u2 = u + opt->start[r]*MBS;                                                                          \
145         X  = opt->X[r];                                                                                      \
146         Y  = opt->Y[r];                                                                                      \
147         for (k=0; k<opt->dz[r]; k++)                                                                         \
148           for (j=0; j<opt->dy[r]; j++) {                                                                     \
149             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
150             p += opt->dx[r]*MBS;                                                                             \
151           }                                                                                                  \
152       }                                                                                                      \
153     } else {                                                                                                 \
154       for (i=0; i<count; i++)                                                                                \
155         for (j=0; j<M; j++)                                                                                  \
156           for (k=0; k<BS; k++)                                                                               \
157             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
158     }                                                                                                        \
159     PetscFunctionReturn(0);                                                                                  \
160   }
161 
162 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
163   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
164   {                                                                                                          \
165     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
166     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
167     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
168     const PetscInt MBS = M*BS;                                                                               \
169     PetscFunctionBegin;                                                                                      \
170     for (i=0; i<count; i++) {                                                                                \
171       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
172       l = i*MBS;                                                                                             \
173       for (j=0; j<M; j++)                                                                                    \
174         for (k=0; k<BS; k++) {                                                                               \
175           tmp = u[r+j*BS+k];                                                                                 \
176           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
177           p[l+j*BS+k] = tmp;                                                                                 \
178         }                                                                                                    \
179     }                                                                                                        \
180     PetscFunctionReturn(0);                                                                                  \
181   }
182 
183 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
184   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
185   {                                                                                                          \
186     PetscErrorCode ierr;                                                                                     \
187     const Type     *u = (const Type*)src;                                                                    \
188     Type           *v = (Type*)dst;                                                                          \
189     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
190     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
191     const PetscInt MBS = M*BS;                                                                               \
192     PetscFunctionBegin;                                                                                      \
193     if (!srcIdx) { /* src is contiguous */                                                                   \
194       u += srcStart*MBS;                                                                                     \
195       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
196     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
197       u += srcOpt->start[0]*MBS;                                                                             \
198       v += dstStart*MBS;                                                                                     \
199       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
200       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
201         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
202           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
203           v += srcOpt->dx[0]*MBS;                                                                            \
204         }                                                                                                    \
205     } else { /* all other cases */                                                                           \
206       for (i=0; i<count; i++) {                                                                              \
207         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
208         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
209         for (j=0; j<M; j++)                                                                                  \
210           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
211       }                                                                                                      \
212     }                                                                                                        \
213     PetscFunctionReturn(0);                                                                                  \
214   }
215 
216 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
217   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
218   {                                                                                                          \
219     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
220     const Type     *ldata = (const Type*)leafdata;                                                           \
221     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
222     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
223     const PetscInt MBS = M*BS;                                                                               \
224     PetscFunctionBegin;                                                                                      \
225     for (i=0; i<count; i++) {                                                                                \
226       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
227       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
228       for (j=0; j<M; j++)                                                                                    \
229         for (k=0; k<BS; k++) {                                                                               \
230           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
231           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
232         }                                                                                                    \
233     }                                                                                                        \
234     PetscFunctionReturn(0);                                                                                  \
235   }
236 
237 /* Pack, Unpack/Fetch ops */
238 #define DEF_Pack(Type,BS,EQ)                                                      \
239   DEF_PackFunc(Type,BS,EQ)                                                        \
240   DEF_UnpackFunc(Type,BS,EQ)                                                      \
241   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
242   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
243     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
244     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
245     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
246   }
247 
248 /* Add, Mult ops */
249 #define DEF_Add(Type,BS,EQ)                                                       \
250   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
251   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
252   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
253   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
254   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
255   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
256   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
257     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
258     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
259     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
260     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
261     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
262     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
263   }
264 
265 /* Max, Min ops */
266 #define DEF_Cmp(Type,BS,EQ)                                                       \
267   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
268   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
269   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
270   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
271   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
272     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
273     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
274     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
275     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
276   }
277 
278 /* Logical ops.
279   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
280   the compilation warning "empty macro arguments are undefined in ISO C90"
281  */
282 #define DEF_Log(Type,BS,EQ)                                                       \
283   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
284   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
285   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
286   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
287   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
288   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
289   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
290     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
291     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
292     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
293     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
294     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
295     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
296   }
297 
298 /* Bitwise ops */
299 #define DEF_Bit(Type,BS,EQ)                                                       \
300   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
301   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
302   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
303   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
304   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
305   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
306   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
307     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
308     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
309     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
310     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
311     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
312     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
313   }
314 
315 /* Maxloc, Minloc ops */
316 #define DEF_Xloc(Type,BS,EQ)                                                      \
317   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
318   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
319   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
320   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
321   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
322     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
323     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
324     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
325     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
326   }
327 
328 #define DEF_IntegerType(Type,BS,EQ)                                               \
329   DEF_Pack(Type,BS,EQ)                                                            \
330   DEF_Add(Type,BS,EQ)                                                             \
331   DEF_Cmp(Type,BS,EQ)                                                             \
332   DEF_Log(Type,BS,EQ)                                                             \
333   DEF_Bit(Type,BS,EQ)                                                             \
334   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
335     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
336     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
337     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
338     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
339     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
340   }
341 
342 #define DEF_RealType(Type,BS,EQ)                                                  \
343   DEF_Pack(Type,BS,EQ)                                                            \
344   DEF_Add(Type,BS,EQ)                                                             \
345   DEF_Cmp(Type,BS,EQ)                                                             \
346   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
347     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
348     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
349     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
350   }
351 
352 #if defined(PETSC_HAVE_COMPLEX)
353 #define DEF_ComplexType(Type,BS,EQ)                                               \
354   DEF_Pack(Type,BS,EQ)                                                            \
355   DEF_Add(Type,BS,EQ)                                                             \
356   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
357     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
358     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
359   }
360 #endif
361 
362 #define DEF_DumbType(Type,BS,EQ)                                                  \
363   DEF_Pack(Type,BS,EQ)                                                            \
364   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
365     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
366   }
367 
368 /* Maxloc, Minloc */
369 #define DEF_PairType(Type,BS,EQ)                                                  \
370   DEF_Pack(Type,BS,EQ)                                                            \
371   DEF_Xloc(Type,BS,EQ)                                                            \
372   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
373     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
374     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
375   }
376 
377 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
378 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
379 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
380 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
381 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
382 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
383 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
384 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
385 
386 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
387 DEF_IntegerType(int,1,1)
388 DEF_IntegerType(int,2,1)
389 DEF_IntegerType(int,4,1)
390 DEF_IntegerType(int,8,1)
391 DEF_IntegerType(int,1,0)
392 DEF_IntegerType(int,2,0)
393 DEF_IntegerType(int,4,0)
394 DEF_IntegerType(int,8,0)
395 #endif
396 
397 /* The typedefs are used to get a typename without space that CPPJoin can handle */
398 typedef signed char SignedChar;
399 DEF_IntegerType(SignedChar,1,1)
400 DEF_IntegerType(SignedChar,2,1)
401 DEF_IntegerType(SignedChar,4,1)
402 DEF_IntegerType(SignedChar,8,1)
403 DEF_IntegerType(SignedChar,1,0)
404 DEF_IntegerType(SignedChar,2,0)
405 DEF_IntegerType(SignedChar,4,0)
406 DEF_IntegerType(SignedChar,8,0)
407 
408 typedef unsigned char UnsignedChar;
409 DEF_IntegerType(UnsignedChar,1,1)
410 DEF_IntegerType(UnsignedChar,2,1)
411 DEF_IntegerType(UnsignedChar,4,1)
412 DEF_IntegerType(UnsignedChar,8,1)
413 DEF_IntegerType(UnsignedChar,1,0)
414 DEF_IntegerType(UnsignedChar,2,0)
415 DEF_IntegerType(UnsignedChar,4,0)
416 DEF_IntegerType(UnsignedChar,8,0)
417 
418 DEF_RealType(PetscReal,1,1)
419 DEF_RealType(PetscReal,2,1)
420 DEF_RealType(PetscReal,4,1)
421 DEF_RealType(PetscReal,8,1)
422 DEF_RealType(PetscReal,1,0)
423 DEF_RealType(PetscReal,2,0)
424 DEF_RealType(PetscReal,4,0)
425 DEF_RealType(PetscReal,8,0)
426 
427 #if defined(PETSC_HAVE_COMPLEX)
428 DEF_ComplexType(PetscComplex,1,1)
429 DEF_ComplexType(PetscComplex,2,1)
430 DEF_ComplexType(PetscComplex,4,1)
431 DEF_ComplexType(PetscComplex,8,1)
432 DEF_ComplexType(PetscComplex,1,0)
433 DEF_ComplexType(PetscComplex,2,0)
434 DEF_ComplexType(PetscComplex,4,0)
435 DEF_ComplexType(PetscComplex,8,0)
436 #endif
437 
438 #define PairType(Type1,Type2) Type1##_##Type2
439 typedef struct {int u; int i;}           PairType(int,int);
440 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
441 DEF_PairType(PairType(int,int),1,1)
442 DEF_PairType(PairType(PetscInt,PetscInt),1,1)
443 
444 /* If we don't know the basic type, we treat it as a stream of chars or ints */
445 DEF_DumbType(char,1,1)
446 DEF_DumbType(char,2,1)
447 DEF_DumbType(char,4,1)
448 DEF_DumbType(char,1,0)
449 DEF_DumbType(char,2,0)
450 DEF_DumbType(char,4,0)
451 
452 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
453 DEF_DumbType(DumbInt,1,1)
454 DEF_DumbType(DumbInt,2,1)
455 DEF_DumbType(DumbInt,4,1)
456 DEF_DumbType(DumbInt,8,1)
457 DEF_DumbType(DumbInt,1,0)
458 DEF_DumbType(DumbInt,2,0)
459 DEF_DumbType(DumbInt,4,0)
460 DEF_DumbType(DumbInt,8,0)
461 
462 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
463 {
464   PetscErrorCode    ierr;
465   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
466   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
467 
468   PetscFunctionBegin;
469   /* Destroy device-specific fields */
470   if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);}
471 
472   /* Destroy host related fields */
473   if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
474   if (!link->use_nvshmem) {
475     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
476       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
477     }
478     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
479     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
480       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
481       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
482     }
483   }
484   ierr = PetscFree(link);CHKERRQ(ierr);
485   PetscFunctionReturn(0);
486 }
487 
488 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
489 {
490   PetscErrorCode    ierr;
491 
492   PetscFunctionBegin;
493   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
494  #if defined(PETSC_HAVE_NVSHMEM)
495   {
496     PetscBool use_nvshmem;
497     ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr);
498     if (use_nvshmem) {
499       ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
500       PetscFunctionReturn(0);
501     }
502   }
503  #endif
504   ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
505   PetscFunctionReturn(0);
506 }
507 
508 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
509    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
510 */
511 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
512 {
513   PetscErrorCode       ierr;
514   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
515   PetscInt             i,j,cnt,nrootranks,ndrootranks,nleafranks,ndleafranks;
516   const PetscInt       *rootoffset,*leafoffset;
517   MPI_Aint             disp;
518   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
519   MPI_Datatype         unit = link->unit;
520   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
521   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
522 
523   PetscFunctionBegin;
524   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
525   if (sf->persistent) {
526     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
527       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
528       if (direction == PETSCSF_LEAF2ROOT) {
529         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
530           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
531           cnt  = rootoffset[i+1]-rootoffset[i];
532           ierr = MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
533         }
534       } else { /* PETSCSF_ROOT2LEAF */
535         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
536           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
537           cnt  = rootoffset[i+1]-rootoffset[i];
538           ierr = MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
539         }
540       }
541       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
542     }
543 
544     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
545       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
546       if (direction == PETSCSF_LEAF2ROOT) {
547         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
548           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
549           cnt  = leafoffset[i+1]-leafoffset[i];
550           ierr = MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
551         }
552       } else { /* PETSCSF_ROOT2LEAF */
553         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
554           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
555           cnt  = leafoffset[i+1]-leafoffset[i];
556           ierr = MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
557         }
558       }
559       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
560     }
561   }
562   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
563   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
564   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
565   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
566   PetscFunctionReturn(0);
567 }
568 
569 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
570 {
571   PetscErrorCode    ierr;
572   PetscSFLink       link,*p;
573   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
574 
575   PetscFunctionBegin;
576   /* Look for types in cache */
577   for (p=&bas->inuse; (link=*p); p=&link->next) {
578     PetscBool match;
579     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
580     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
581       switch (cmode) {
582       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
583       case PETSC_USE_POINTER: break;
584       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
585       }
586       *mylink = link;
587       PetscFunctionReturn(0);
588     }
589   }
590   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
591 }
592 
593 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
594 {
595   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
596   PetscSFLink       link = *mylink;
597 
598   PetscFunctionBegin;
599   link->rootdata = NULL;
600   link->leafdata = NULL;
601   link->next     = bas->avail;
602   bas->avail     = link;
603   *mylink        = NULL;
604   PetscFunctionReturn(0);
605 }
606 
607 /* Error out on unsupported overlapped communications */
608 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
609 {
610   PetscErrorCode    ierr;
611   PetscSFLink       link,*p;
612   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
613   PetscBool         match;
614 
615   PetscFunctionBegin;
616   if (PetscDefined(USE_DEBUG)) {
617     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
618        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
619     */
620     if (rootdata || leafdata) {
621       for (p=&bas->inuse; (link=*p); p=&link->next) {
622         ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
623         PetscCheckFalse(match && (rootdata == link->rootdata) && (leafdata == link->leafdata),PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
624       }
625     }
626   }
627   PetscFunctionReturn(0);
628 }
629 
630 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
631 {
632   PetscFunctionBegin;
633   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
634   PetscFunctionReturn(0);
635 }
636 
637 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
638 {
639   PetscErrorCode ierr;
640   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
641   PetscBool      is2Int,is2PetscInt;
642   PetscMPIInt    ni,na,nd,combiner;
643 #if defined(PETSC_HAVE_COMPLEX)
644   PetscInt       nPetscComplex=0;
645 #endif
646 
647   PetscFunctionBegin;
648   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
649   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
650   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
651   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
652   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
653   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
654 #if defined(PETSC_HAVE_COMPLEX)
655   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
656 #endif
657   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
658   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
659   /* TODO: shaell we also handle Fortran MPI_2REAL? */
660   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
661   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
662   link->bs = 1; /* default */
663 
664   if (is2Int) {
665     PackInit_PairType_int_int_1_1(link);
666     link->bs        = 1;
667     link->unitbytes = 2*sizeof(int);
668     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
669     link->basicunit = MPI_2INT;
670     link->unit      = MPI_2INT;
671   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
672     PackInit_PairType_PetscInt_PetscInt_1_1(link);
673     link->bs        = 1;
674     link->unitbytes = 2*sizeof(PetscInt);
675     link->basicunit = MPIU_2INT;
676     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
677     link->unit      = MPIU_2INT;
678   } else if (nPetscReal) {
679     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
680     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
681     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
682     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
683     link->bs        = nPetscReal;
684     link->unitbytes = nPetscReal*sizeof(PetscReal);
685     link->basicunit = MPIU_REAL;
686     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
687   } else if (nPetscInt) {
688     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
689     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
690     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
691     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
692     link->bs        = nPetscInt;
693     link->unitbytes = nPetscInt*sizeof(PetscInt);
694     link->basicunit = MPIU_INT;
695     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
696 #if defined(PETSC_USE_64BIT_INDICES)
697   } else if (nInt) {
698     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
699     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
700     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
701     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
702     link->bs        = nInt;
703     link->unitbytes = nInt*sizeof(int);
704     link->basicunit = MPI_INT;
705     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
706 #endif
707   } else if (nSignedChar) {
708     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
709     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
710     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
711     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
712     link->bs        = nSignedChar;
713     link->unitbytes = nSignedChar*sizeof(SignedChar);
714     link->basicunit = MPI_SIGNED_CHAR;
715     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
716   }  else if (nUnsignedChar) {
717     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
718     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
719     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
720     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
721     link->bs        = nUnsignedChar;
722     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
723     link->basicunit = MPI_UNSIGNED_CHAR;
724     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
725 #if defined(PETSC_HAVE_COMPLEX)
726   } else if (nPetscComplex) {
727     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
728     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
729     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
730     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
731     link->bs        = nPetscComplex;
732     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
733     link->basicunit = MPIU_COMPLEX;
734     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
735 #endif
736   } else {
737     MPI_Aint lb,nbyte;
738     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
739     PetscCheckFalse(lb != 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb);
740     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
741       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
742       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
743       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
744       link->bs        = nbyte;
745       link->unitbytes = nbyte;
746       link->basicunit = MPI_BYTE;
747     } else {
748       nInt = nbyte / sizeof(int);
749       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
750       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
751       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
752       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
753       link->bs        = nInt;
754       link->unitbytes = nbyte;
755       link->basicunit = MPI_INT;
756     }
757     if (link->isbuiltin) link->unit = unit;
758   }
759 
760   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}
761 
762   link->Memcpy = PetscSFLinkMemcpy_Host;
763   PetscFunctionReturn(0);
764 }
765 
766 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
767 {
768   PetscFunctionBegin;
769   *UnpackAndOp = NULL;
770   if (PetscMemTypeHost(mtype)) {
771     if      (op == MPI_REPLACE)               *UnpackAndOp = link->h_UnpackAndInsert;
772     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
773     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
774     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
775     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
776     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
777     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
778     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
779     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
780     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
781     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
782     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
783     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
784   }
785 #if defined(PETSC_HAVE_DEVICE)
786   else if (PetscMemTypeDevice(mtype) && !atomic) {
787     if      (op == MPI_REPLACE)               *UnpackAndOp = link->d_UnpackAndInsert;
788     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
789     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
790     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
791     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
792     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
793     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
794     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
795     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
796     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
797     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
798     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
799     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
800   } else if (PetscMemTypeDevice(mtype) && atomic) {
801     if      (op == MPI_REPLACE)               *UnpackAndOp = link->da_UnpackAndInsert;
802     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
803     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
804     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
805     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
806     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
807     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
808     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
809     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
810     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
811     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
812     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
813     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
814   }
815 #endif
816   PetscFunctionReturn(0);
817 }
818 
819 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
820 {
821   PetscFunctionBegin;
822   *ScatterAndOp = NULL;
823   if (PetscMemTypeHost(mtype)) {
824     if      (op == MPI_REPLACE)               *ScatterAndOp = link->h_ScatterAndInsert;
825     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
826     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
827     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
828     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
829     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
830     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
831     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
832     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
833     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
834     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
835     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
836     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
837   }
838 #if defined(PETSC_HAVE_DEVICE)
839   else if (PetscMemTypeDevice(mtype) && !atomic) {
840     if      (op == MPI_REPLACE)               *ScatterAndOp = link->d_ScatterAndInsert;
841     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
842     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
843     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
844     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
845     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
846     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
847     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
848     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
849     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
850     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
851     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
852     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
853   } else if (PetscMemTypeDevice(mtype) && atomic) {
854     if      (op == MPI_REPLACE)               *ScatterAndOp = link->da_ScatterAndInsert;
855     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
856     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
857     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
858     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
859     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
860     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
861     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
862     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
863     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
864     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
865     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
866     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
867   }
868 #endif
869   PetscFunctionReturn(0);
870 }
871 
872 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
873 {
874   PetscFunctionBegin;
875   *FetchAndOp = NULL;
876   PetscCheckFalse(op != MPI_SUM && op != MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
877   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
878 #if defined(PETSC_HAVE_DEVICE)
879   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
880   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOp = link->da_FetchAndAdd;
881 #endif
882   PetscFunctionReturn(0);
883 }
884 
885 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
886 {
887   PetscFunctionBegin;
888   *FetchAndOpLocal = NULL;
889   PetscCheckFalse(op != MPI_SUM && op != MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
890   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
891 #if defined(PETSC_HAVE_DEVICE)
892   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
893   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
894 #endif
895   PetscFunctionReturn(0);
896 }
897 
898 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
899 {
900   PetscErrorCode ierr;
901   PetscLogDouble flops;
902   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
903 
904   PetscFunctionBegin;
905   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
906     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
907 #if defined(PETSC_HAVE_DEVICE)
908     if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
909 #endif
910     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
911   }
912   PetscFunctionReturn(0);
913 }
914 
915 static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
916 {
917   PetscLogDouble flops;
918   PetscErrorCode ierr;
919 
920   PetscFunctionBegin;
921   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
922     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
923 #if defined(PETSC_HAVE_DEVICE)
924     if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
925 #endif
926     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
927   }
928   PetscFunctionReturn(0);
929 }
930 
931 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
932   Input Parameters:
933   +sf      - The StarForest
934   .link    - The link
935   .count   - Number of entries to unpack
936   .start   - The first index, significent when indices=NULL
937   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
938   .buf     - A contiguous buffer to unpack from
939   -op      - Operation after unpack
940 
941   Output Parameters:
942   .data    - The data to unpack to
943 */
944 static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
945 {
946   PetscFunctionBegin;
947 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
948   {
949     PetscErrorCode ierr;
950     PetscInt       i;
951     if (indices) {
952       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
953          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
954       */
955       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
956     } else {
957       ierr = MPIU_Reduce_local(buf,(char*)data+start*link->unitbytes,count,link->unit,op);CHKERRMPI(ierr);
958     }
959   }
960   PetscFunctionReturn(0);
961 #else
962   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
963 #endif
964 }
965 
966 static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
967 {
968   PetscFunctionBegin;
969 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
970   {
971     PetscErrorCode ierr;
972     PetscInt       i,disp;
973     if (!srcIdx) {
974       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
975     } else {
976       for (i=0; i<count; i++) {
977         disp = dstIdx? dstIdx[i] : dstStart + i;
978         ierr = MPIU_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
979       }
980     }
981   }
982   PetscFunctionReturn(0);
983 #else
984   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
985 #endif
986 }
987 
988 /*=============================================================================
989               Pack/Unpack/Fetch/Scatter routines
990  ============================================================================*/
991 
992 /* Pack rootdata to rootbuf
993   Input Parameters:
994   + sf       - The SF this packing works on.
995   . link     - It gives the memtype of the roots and also provides root buffer.
996   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
997   - rootdata - Where to read the roots.
998 
999   Notes:
1000   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1001   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
1002  */
1003 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1004 {
1005   PetscErrorCode   ierr;
1006   const PetscInt   *rootindices = NULL;
1007   PetscInt         count,start;
1008   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1009   PetscMemType     rootmtype = link->rootmtype;
1010   PetscSFPackOpt   opt = NULL;
1011 
1012   PetscFunctionBegin;
1013   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1014   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1015     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1016     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1017     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1018   }
1019   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1020   PetscFunctionReturn(0);
1021 }
1022 
1023 /* Pack leafdata to leafbuf */
1024 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1025 {
1026   PetscErrorCode   ierr;
1027   const PetscInt   *leafindices = NULL;
1028   PetscInt         count,start;
1029   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1030   PetscMemType     leafmtype = link->leafmtype;
1031   PetscSFPackOpt   opt = NULL;
1032 
1033   PetscFunctionBegin;
1034   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1035   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1036     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1037     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1038     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1039   }
1040   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1041   PetscFunctionReturn(0);
1042 }
1043 
1044 /* Pack rootdata to rootbuf, which are in the same memory space */
1045 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1046 {
1047   PetscErrorCode   ierr;
1048   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1049 
1050   PetscFunctionBegin;
1051   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
1052     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1053     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */
1054   }
1055   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1056   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);}
1057   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1058   PetscFunctionReturn(0);
1059 }
1060 /* Pack leafdata to leafbuf, which are in the same memory space */
1061 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1062 {
1063   PetscErrorCode   ierr;
1064 
1065   PetscFunctionBegin;
1066   if (scope == PETSCSF_REMOTE) {
1067     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1068     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1069   }
1070   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1071   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);}
1072   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1073   PetscFunctionReturn(0);
1074 }
1075 
1076 PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1077 {
1078   PetscErrorCode   ierr;
1079   const PetscInt   *rootindices = NULL;
1080   PetscInt         count,start;
1081   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1082   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1083   PetscMemType     rootmtype = link->rootmtype;
1084   PetscSFPackOpt   opt = NULL;
1085 
1086   PetscFunctionBegin;
1087   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1088     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1089     if (UnpackAndOp) {
1090       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1091       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1092     } else {
1093       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1094       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1095     }
1096   }
1097   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1098   PetscFunctionReturn(0);
1099 }
1100 
1101 PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1102 {
1103   PetscErrorCode   ierr;
1104   const PetscInt   *leafindices = NULL;
1105   PetscInt         count,start;
1106   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1107   PetscMemType     leafmtype = link->leafmtype;
1108   PetscSFPackOpt   opt = NULL;
1109 
1110   PetscFunctionBegin;
1111   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1112     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1113     if (UnpackAndOp) {
1114       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1115       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1116     } else {
1117       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1118       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1119     }
1120   }
1121   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1122   PetscFunctionReturn(0);
1123 }
1124 /* Unpack rootbuf to rootdata, which are in the same memory space */
1125 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1126 {
1127   PetscErrorCode   ierr;
1128   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1129 
1130   PetscFunctionBegin;
1131   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1132   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);}
1133   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1134   if (scope == PETSCSF_REMOTE) {
1135     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1136     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1137   }
1138   PetscFunctionReturn(0);
1139 }
1140 
1141 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1142 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1143 {
1144   PetscErrorCode   ierr;
1145 
1146   PetscFunctionBegin;
1147   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1148   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);CHKERRQ(ierr);}
1149   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1150   if (scope == PETSCSF_REMOTE) {
1151     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1152     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1153   }
1154   PetscFunctionReturn(0);
1155 }
1156 
1157 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
1158 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1159 {
1160   PetscErrorCode     ierr;
1161   const PetscInt     *rootindices = NULL;
1162   PetscInt           count,start;
1163   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1164   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1165   PetscMemType       rootmtype = link->rootmtype;
1166   PetscSFPackOpt     opt = NULL;
1167 
1168   PetscFunctionBegin;
1169   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1170   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1171     /* Do FetchAndOp on rootdata with rootbuf */
1172     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr);
1173     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1174     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr);
1175   }
1176   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr);
1177   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1182 {
1183   PetscErrorCode       ierr;
1184   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1185   PetscInt             count,rootstart,leafstart;
1186   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1187   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1188   PetscMemType         rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1189   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1190   PetscInt             buflen = sf->leafbuflen[PETSCSF_LOCAL];
1191   char                 *srcbuf = NULL,*dstbuf = NULL;
1192   PetscBool            dstdups;
1193 
1194   PetscFunctionBegin;
1195   if (!buflen) PetscFunctionReturn(0);
1196   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
1197     if (direction == PETSCSF_ROOT2LEAF) {
1198       ierr     = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
1199       srcmtype = rootmtype;
1200       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1201       dstmtype = leafmtype;
1202       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1203     } else {
1204       ierr     = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
1205       srcmtype = leafmtype;
1206       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1207       dstmtype = rootmtype;
1208       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1209     }
1210     ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr);
1211     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
1212     if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);}
1213     if (direction == PETSCSF_ROOT2LEAF) {
1214       ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1215     } else {
1216       ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1217     }
1218   } else {
1219     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1220     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1221     ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr);
1222     if (ScatterAndOp) {
1223       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1224       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1225       if (direction == PETSCSF_ROOT2LEAF) {
1226         ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1227       } else {
1228         ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
1229       }
1230     } else {
1231       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1232       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1233       if (direction == PETSCSF_ROOT2LEAF) {
1234         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1235       } else {
1236         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1237       }
1238     }
1239   }
1240   PetscFunctionReturn(0);
1241 }
1242 
1243 /* Fetch rootdata to leafdata and leafupdate locally */
1244 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1245 {
1246   PetscErrorCode       ierr;
1247   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1248   PetscInt             count,rootstart,leafstart;
1249   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1250   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1251   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1252   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1253 
1254   PetscFunctionBegin;
1255   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1256   if (rootmtype != leafmtype) {
1257     /* The local communication has to go through pack and unpack */
1258     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1259   } else {
1260     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1261     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1262     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1263     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1264   }
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 /*
1269   Create per-rank pack/unpack optimizations based on indice patterns
1270 
1271    Input Parameters:
1272   +  n       - Number of destination ranks
1273   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1274   -  idx     - [*]   Array storing indices
1275 
1276    Output Parameters:
1277   +  opt     - Pack optimizations. NULL if no optimizations.
1278 */
1279 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1280 {
1281   PetscErrorCode ierr;
1282   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1283   PetscBool      optimizable = PETSC_TRUE;
1284   PetscSFPackOpt opt;
1285 
1286   PetscFunctionBegin;
1287   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1288   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1289   opt->n      = opt->array[0] = n;
1290   opt->offset = opt->array + 1;
1291   opt->start  = opt->array + n   + 2;
1292   opt->dx     = opt->array + 2*n + 2;
1293   opt->dy     = opt->array + 3*n + 2;
1294   opt->dz     = opt->array + 4*n + 2;
1295   opt->X      = opt->array + 5*n + 2;
1296   opt->Y      = opt->array + 6*n + 2;
1297 
1298   for (r=0; r<n; r++) { /* For each destination rank */
1299     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1300     p     = offset[r];
1301     start = idx[p]; /* First index for this rank */
1302     p++;
1303 
1304     /* Search in X dimension */
1305     for (dx=1; dx<m; dx++,p++) {
1306       if (start+dx != idx[p]) break;
1307     }
1308 
1309     dydz = m/dx;
1310     X    = dydz > 1 ? (idx[p]-start) : dx;
1311     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1312     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1313     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1314       for (i=0; i<dx; i++,p++) {
1315         if (start+X*dy+i != idx[p]) {
1316           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1317           else goto Z_dimension;
1318         }
1319       }
1320     }
1321 
1322 Z_dimension:
1323     dz = m/(dx*dy);
1324     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1325     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1326     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1327     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1328       for (j=0; j<dy; j++) {
1329         for (i=0; i<dx; i++,p++) {
1330           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1331         }
1332       }
1333     }
1334     opt->start[r] = start;
1335     opt->dx[r]    = dx;
1336     opt->dy[r]    = dy;
1337     opt->dz[r]    = dz;
1338     opt->X[r]     = X;
1339     opt->Y[r]     = Y;
1340   }
1341 
1342 finish:
1343   /* If not optimizable, free arrays to save memory */
1344   if (!n || !optimizable) {
1345     ierr = PetscFree(opt->array);CHKERRQ(ierr);
1346     ierr = PetscFree(opt);CHKERRQ(ierr);
1347     *out = NULL;
1348   } else {
1349     opt->offset[0] = 0;
1350     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1351     *out = opt;
1352   }
1353   PetscFunctionReturn(0);
1354 }
1355 
1356 static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1357 {
1358   PetscErrorCode ierr;
1359   PetscSFPackOpt opt = *out;
1360 
1361   PetscFunctionBegin;
1362   if (opt) {
1363     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
1364     ierr = PetscFree(opt);CHKERRQ(ierr);
1365     *out = NULL;
1366   }
1367   PetscFunctionReturn(0);
1368 }
1369 
1370 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1371 {
1372   PetscErrorCode ierr;
1373   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1374   PetscInt       i,j;
1375 
1376   PetscFunctionBegin;
1377   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1378   for (i=0; i<2; i++) { /* Set defaults */
1379     sf->leafstart[i]   = 0;
1380     sf->leafcontig[i]  = PETSC_TRUE;
1381     sf->leafdups[i]    = PETSC_FALSE;
1382     bas->rootstart[i]  = 0;
1383     bas->rootcontig[i] = PETSC_TRUE;
1384     bas->rootdups[i]   = PETSC_FALSE;
1385   }
1386 
1387   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1388   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1389 
1390   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1391   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1392 
1393   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1394   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1395     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1396   }
1397   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1398     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1399   }
1400 
1401   /* If not, see if we can have per-rank optimizations by doing index analysis */
1402   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1403   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1404 
1405   /* Are root indices for self and remote contiguous? */
1406   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1407   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1408 
1409   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1410   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1411 
1412   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1413     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1414   }
1415   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1416     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1417   }
1418 
1419   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1420   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1421 
1422     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1423   if (PetscDefined(HAVE_DEVICE)) {
1424     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1425     if (!sf->leafcontig[0]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1426     if (!sf->leafcontig[1]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1427     if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1428     if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1429   }
1430   PetscFunctionReturn(0);
1431 }
1432 
1433 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1434 {
1435   PetscErrorCode ierr;
1436   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1437   PetscInt       i;
1438 
1439   PetscFunctionBegin;
1440   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1441     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1442     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1443    #if defined(PETSC_HAVE_DEVICE)
1444     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1445     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1446    #endif
1447   }
1448   PetscFunctionReturn(0);
1449 }
1450