xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 4e278199b78715991f5c71ebbd945c1489263e6c)
1 #include "petsc/private/sfimpl.h"
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
6 
7 #if defined(PETSC_HAVE_CUDA)
8 #include <cuda_runtime.h>
9 #include <petsccublas.h>
10 #endif
11 #if defined(PETSC_HAVE_HIP)
12 #include <hip/hip_runtime.h>
13 #include <petschipblas.h>
14 #endif
15 /*
16  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
17  * therefore we pack data types manually. This file defines packing routines for the standard data types.
18  */
19 
20 #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
21 
22 /* Operations working like s += t */
23 #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
24 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
25 #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
26 #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
27 /* Ref MPI MAXLOC */
28 #define OP_XLOC(op,s,t) \
29   do {                                       \
30     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
31     else if (!((s).u op (t).u)) s = t;           \
32   } while (0)
33 
34 /* DEF_PackFunc - macro defining a Pack routine
35 
36    Arguments of the macro:
37    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
38    .BS        Block size for vectorization. It is a factor of bsz.
39    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
40 
41    Arguments of the Pack routine:
42    +count     Number of indices in idx[].
43    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
44    .opt       Per-pack optimization plan. NULL means no such plan.
45    .idx       Indices of entries to packed.
46    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
47    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
48    -packed    Address of the packed data.
49  */
50 #define DEF_PackFunc(Type,BS,EQ) \
51   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
52   {                                                                                                          \
53     PetscErrorCode ierr;                                                                                     \
54     const Type     *u = (const Type*)unpacked,*u2;                                                           \
55     Type           *p = (Type*)packed,*p2;                                                                   \
56     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
57     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
58     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
59     PetscFunctionBegin;                                                                                      \
60     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
61     else if (opt) { /* has optimizations available */                                                        \
62       p2 = p;                                                                                                \
63       for (r=0; r<opt->n; r++) {                                                                             \
64         u2 = u + opt->start[r]*MBS;                                                                          \
65         X  = opt->X[r];                                                                                      \
66         Y  = opt->Y[r];                                                                                      \
67         for (k=0; k<opt->dz[r]; k++)                                                                         \
68           for (j=0; j<opt->dy[r]; j++) {                                                                     \
69             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
70             p2  += opt->dx[r]*MBS;                                                                           \
71           }                                                                                                  \
72       }                                                                                                      \
73     } else {                                                                                                 \
74       for (i=0; i<count; i++)                                                                                \
75         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
76           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
77             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
78     }                                                                                                        \
79     PetscFunctionReturn(0);                                                                                  \
80   }
81 
82 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
83                 and inserts into a sparse array.
84 
85    Arguments:
86   .Type       Type of the data
87   .BS         Block size for vectorization
88   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
89 
90   Notes:
91    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
92  */
93 #define DEF_UnpackFunc(Type,BS,EQ)               \
94   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
95   {                                                                                                          \
96     PetscErrorCode ierr;                                                                                     \
97     Type           *u = (Type*)unpacked,*u2;                                                                 \
98     const Type     *p = (const Type*)packed;                                                                 \
99     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
100     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
101     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
102     PetscFunctionBegin;                                                                                      \
103     if (!idx) {                                                                                              \
104       u += start*MBS;                                                                                        \
105       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
106     } else if (opt) { /* has optimizations available */                                                      \
107       for (r=0; r<opt->n; r++) {                                                                             \
108         u2 = u + opt->start[r]*MBS;                                                                          \
109         X  = opt->X[r];                                                                                      \
110         Y  = opt->Y[r];                                                                                      \
111         for (k=0; k<opt->dz[r]; k++)                                                                         \
112           for (j=0; j<opt->dy[r]; j++) {                                                                     \
113             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
114             p   += opt->dx[r]*MBS;                                                                           \
115           }                                                                                                  \
116       }                                                                                                      \
117     } else {                                                                                                 \
118       for (i=0; i<count; i++)                                                                                \
119         for (j=0; j<M; j++)                                                                                  \
120           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
121     }                                                                                                        \
122     PetscFunctionReturn(0);                                                                                  \
123   }
124 
125 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
126 
127    Arguments:
128   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
129   .Type       Type of the data
130   .BS         Block size for vectorization
131   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
132   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
133   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
134  */
135 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
136   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
137   {                                                                                                          \
138     Type           *u = (Type*)unpacked,*u2;                                                                 \
139     const Type     *p = (const Type*)packed;                                                                 \
140     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
141     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
142     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
143     PetscFunctionBegin;                                                                                      \
144     if (!idx) {                                                                                              \
145       u += start*MBS;                                                                                        \
146       for (i=0; i<count; i++)                                                                                \
147         for (j=0; j<M; j++)                                                                                  \
148           for (k=0; k<BS; k++)                                                                               \
149             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
150     } else if (opt) { /* idx[] has patterns */                                                               \
151       for (r=0; r<opt->n; r++) {                                                                             \
152         u2 = u + opt->start[r]*MBS;                                                                          \
153         X  = opt->X[r];                                                                                      \
154         Y  = opt->Y[r];                                                                                      \
155         for (k=0; k<opt->dz[r]; k++)                                                                         \
156           for (j=0; j<opt->dy[r]; j++) {                                                                     \
157             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
158             p += opt->dx[r]*MBS;                                                                             \
159           }                                                                                                  \
160       }                                                                                                      \
161     } else {                                                                                                 \
162       for (i=0; i<count; i++)                                                                                \
163         for (j=0; j<M; j++)                                                                                  \
164           for (k=0; k<BS; k++)                                                                               \
165             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
166     }                                                                                                        \
167     PetscFunctionReturn(0);                                                                                  \
168   }
169 
170 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
171   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
172   {                                                                                                          \
173     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
174     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
175     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
176     const PetscInt MBS = M*BS;                                                                               \
177     PetscFunctionBegin;                                                                                      \
178     for (i=0; i<count; i++) {                                                                                \
179       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
180       l = i*MBS;                                                                                             \
181       for (j=0; j<M; j++)                                                                                    \
182         for (k=0; k<BS; k++) {                                                                               \
183           tmp = u[r+j*BS+k];                                                                                 \
184           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
185           p[l+j*BS+k] = tmp;                                                                                 \
186         }                                                                                                    \
187     }                                                                                                        \
188     PetscFunctionReturn(0);                                                                                  \
189   }
190 
191 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
192   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
193   {                                                                                                          \
194     PetscErrorCode ierr;                                                                                     \
195     const Type     *u = (const Type*)src;                                                                    \
196     Type           *v = (Type*)dst;                                                                          \
197     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
198     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
199     const PetscInt MBS = M*BS;                                                                               \
200     PetscFunctionBegin;                                                                                      \
201     if (!srcIdx) { /* src is contiguous */                                                                   \
202       u += srcStart*MBS;                                                                                     \
203       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
204     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
205       u += srcOpt->start[0]*MBS;                                                                             \
206       v += dstStart*MBS;                                                                                     \
207       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
208       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
209         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
210           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
211           v += srcOpt->dx[0]*MBS;                                                                            \
212         }                                                                                                    \
213     } else { /* all other cases */                                                                           \
214       for (i=0; i<count; i++) {                                                                              \
215         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
216         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
217         for (j=0; j<M; j++)                                                                                  \
218           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
219       }                                                                                                      \
220     }                                                                                                        \
221     PetscFunctionReturn(0);                                                                                  \
222   }
223 
224 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
225   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
226   {                                                                                                          \
227     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
228     const Type     *ldata = (const Type*)leafdata;                                                           \
229     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
230     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
231     const PetscInt MBS = M*BS;                                                                               \
232     PetscFunctionBegin;                                                                                      \
233     for (i=0; i<count; i++) {                                                                                \
234       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
235       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
236       for (j=0; j<M; j++)                                                                                    \
237         for (k=0; k<BS; k++) {                                                                               \
238           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
239           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
240         }                                                                                                    \
241     }                                                                                                        \
242     PetscFunctionReturn(0);                                                                                  \
243   }
244 
245 /* Pack, Unpack/Fetch ops */
246 #define DEF_Pack(Type,BS,EQ)                                                      \
247   DEF_PackFunc(Type,BS,EQ)                                                        \
248   DEF_UnpackFunc(Type,BS,EQ)                                                      \
249   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
250   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
251     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
252     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
253     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
254   }
255 
256 /* Add, Mult ops */
257 #define DEF_Add(Type,BS,EQ)                                                       \
258   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
259   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
260   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
261   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
262   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
263   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
264   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
265     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
266     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
267     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
268     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
269     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
270     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
271   }
272 
273 /* Max, Min ops */
274 #define DEF_Cmp(Type,BS,EQ)                                                       \
275   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
276   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
277   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
278   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
279   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
280     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
281     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
282     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
283     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
284   }
285 
286 /* Logical ops.
287   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
288   the compilation warning "empty macro arguments are undefined in ISO C90"
289  */
290 #define DEF_Log(Type,BS,EQ)                                                       \
291   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
292   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
293   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
294   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
295   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
296   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
297   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
298     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
299     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
300     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
301     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
302     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
303     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
304   }
305 
306 /* Bitwise ops */
307 #define DEF_Bit(Type,BS,EQ)                                                       \
308   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
309   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
310   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
311   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
312   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
313   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
314   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
315     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
316     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
317     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
318     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
319     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
320     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
321   }
322 
323 /* Maxloc, Minloc ops */
324 #define DEF_Xloc(Type,BS,EQ)                                                      \
325   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
326   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
327   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
328   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
329   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
330     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
331     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
332     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
333     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
334   }
335 
336 #define DEF_IntegerType(Type,BS,EQ)                                               \
337   DEF_Pack(Type,BS,EQ)                                                            \
338   DEF_Add(Type,BS,EQ)                                                             \
339   DEF_Cmp(Type,BS,EQ)                                                             \
340   DEF_Log(Type,BS,EQ)                                                             \
341   DEF_Bit(Type,BS,EQ)                                                             \
342   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
343     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
344     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
345     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
346     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
347     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
348   }
349 
350 #define DEF_RealType(Type,BS,EQ)                                                  \
351   DEF_Pack(Type,BS,EQ)                                                            \
352   DEF_Add(Type,BS,EQ)                                                             \
353   DEF_Cmp(Type,BS,EQ)                                                             \
354   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
355     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
356     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
357     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
358   }
359 
360 #if defined(PETSC_HAVE_COMPLEX)
361 #define DEF_ComplexType(Type,BS,EQ)                                               \
362   DEF_Pack(Type,BS,EQ)                                                            \
363   DEF_Add(Type,BS,EQ)                                                             \
364   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
365     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
366     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
367   }
368 #endif
369 
370 #define DEF_DumbType(Type,BS,EQ)                                                  \
371   DEF_Pack(Type,BS,EQ)                                                            \
372   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
373     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
374   }
375 
376 /* Maxloc, Minloc */
377 #define DEF_PairType(Type,BS,EQ)                                                  \
378   DEF_Pack(Type,BS,EQ)                                                            \
379   DEF_Xloc(Type,BS,EQ)                                                            \
380   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
381     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
382     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
383   }
384 
385 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
386 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
387 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
388 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
389 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
390 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
391 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
392 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
393 
394 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
395 DEF_IntegerType(int,1,1)
396 DEF_IntegerType(int,2,1)
397 DEF_IntegerType(int,4,1)
398 DEF_IntegerType(int,8,1)
399 DEF_IntegerType(int,1,0)
400 DEF_IntegerType(int,2,0)
401 DEF_IntegerType(int,4,0)
402 DEF_IntegerType(int,8,0)
403 #endif
404 
405 /* The typedefs are used to get a typename without space that CPPJoin can handle */
406 typedef signed char SignedChar;
407 DEF_IntegerType(SignedChar,1,1)
408 DEF_IntegerType(SignedChar,2,1)
409 DEF_IntegerType(SignedChar,4,1)
410 DEF_IntegerType(SignedChar,8,1)
411 DEF_IntegerType(SignedChar,1,0)
412 DEF_IntegerType(SignedChar,2,0)
413 DEF_IntegerType(SignedChar,4,0)
414 DEF_IntegerType(SignedChar,8,0)
415 
416 typedef unsigned char UnsignedChar;
417 DEF_IntegerType(UnsignedChar,1,1)
418 DEF_IntegerType(UnsignedChar,2,1)
419 DEF_IntegerType(UnsignedChar,4,1)
420 DEF_IntegerType(UnsignedChar,8,1)
421 DEF_IntegerType(UnsignedChar,1,0)
422 DEF_IntegerType(UnsignedChar,2,0)
423 DEF_IntegerType(UnsignedChar,4,0)
424 DEF_IntegerType(UnsignedChar,8,0)
425 
426 DEF_RealType(PetscReal,1,1)
427 DEF_RealType(PetscReal,2,1)
428 DEF_RealType(PetscReal,4,1)
429 DEF_RealType(PetscReal,8,1)
430 DEF_RealType(PetscReal,1,0)
431 DEF_RealType(PetscReal,2,0)
432 DEF_RealType(PetscReal,4,0)
433 DEF_RealType(PetscReal,8,0)
434 
435 #if defined(PETSC_HAVE_COMPLEX)
436 DEF_ComplexType(PetscComplex,1,1)
437 DEF_ComplexType(PetscComplex,2,1)
438 DEF_ComplexType(PetscComplex,4,1)
439 DEF_ComplexType(PetscComplex,8,1)
440 DEF_ComplexType(PetscComplex,1,0)
441 DEF_ComplexType(PetscComplex,2,0)
442 DEF_ComplexType(PetscComplex,4,0)
443 DEF_ComplexType(PetscComplex,8,0)
444 #endif
445 
446 #define PairType(Type1,Type2) Type1##_##Type2
447 typedef struct {int u; int i;}           PairType(int,int);
448 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
449 DEF_PairType(PairType(int,int),1,1)
450 DEF_PairType(PairType(PetscInt,PetscInt),1,1)
451 
452 /* If we don't know the basic type, we treat it as a stream of chars or ints */
453 DEF_DumbType(char,1,1)
454 DEF_DumbType(char,2,1)
455 DEF_DumbType(char,4,1)
456 DEF_DumbType(char,1,0)
457 DEF_DumbType(char,2,0)
458 DEF_DumbType(char,4,0)
459 
460 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
461 DEF_DumbType(DumbInt,1,1)
462 DEF_DumbType(DumbInt,2,1)
463 DEF_DumbType(DumbInt,4,1)
464 DEF_DumbType(DumbInt,8,1)
465 DEF_DumbType(DumbInt,1,0)
466 DEF_DumbType(DumbInt,2,0)
467 DEF_DumbType(DumbInt,4,0)
468 DEF_DumbType(DumbInt,8,0)
469 
470 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
471 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
472 {
473   int ierr;
474   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
475   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
476   return MPI_SUCCESS;
477 }
478 #endif
479 
480 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
481 {
482   PetscErrorCode    ierr;
483   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
484   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
485 
486   PetscFunctionBegin;
487   /* Destroy device-specific fields */
488   if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);}
489 
490   /* Destroy host related fields */
491   if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
492   if (!link->use_nvshmem) {
493     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
494       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
495     }
496     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
497     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
498       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
499       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
500     }
501   }
502   ierr = PetscFree(link);CHKERRQ(ierr);
503   PetscFunctionReturn(0);
504 }
505 
506 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
507 {
508   PetscErrorCode    ierr;
509 
510   PetscFunctionBegin;
511   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
512  #if defined(PETSC_HAVE_NVSHMEM)
513   {
514     PetscBool use_nvshmem;
515     ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr);
516     if (use_nvshmem) {
517       ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
518       PetscFunctionReturn(0);
519     }
520   }
521  #endif
522   ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
523   PetscFunctionReturn(0);
524 }
525 
526 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
527    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
528 */
529 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
530 {
531   PetscErrorCode       ierr;
532   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
533   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
534   const PetscInt       *rootoffset,*leafoffset;
535   PetscMPIInt          n;
536   MPI_Aint             disp;
537   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
538   MPI_Datatype         unit = link->unit;
539   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
540   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
541 
542   PetscFunctionBegin;
543   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
544   if (sf->persistent) {
545     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
546       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
547       if (direction == PETSCSF_LEAF2ROOT) {
548         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
549           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
550           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
551           ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
552         }
553       } else { /* PETSCSF_ROOT2LEAF */
554         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
555           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
556           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
557           ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
558         }
559       }
560       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
561     }
562 
563     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
564       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
565       if (direction == PETSCSF_LEAF2ROOT) {
566         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
567           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
568           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
569           ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
570         }
571       } else { /* PETSCSF_ROOT2LEAF */
572         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
573           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
574           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
575           ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
576         }
577       }
578       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
579     }
580   }
581   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
582   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
583   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
584   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
585   PetscFunctionReturn(0);
586 }
587 
588 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
589 {
590   PetscErrorCode    ierr;
591   PetscSFLink       link,*p;
592   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
593 
594   PetscFunctionBegin;
595   /* Look for types in cache */
596   for (p=&bas->inuse; (link=*p); p=&link->next) {
597     PetscBool match;
598     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
599     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
600       switch (cmode) {
601       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
602       case PETSC_USE_POINTER: break;
603       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
604       }
605       *mylink = link;
606       PetscFunctionReturn(0);
607     }
608   }
609   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
610 }
611 
612 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
613 {
614   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
615   PetscSFLink       link = *mylink;
616 
617   PetscFunctionBegin;
618   link->rootdata = NULL;
619   link->leafdata = NULL;
620   link->next     = bas->avail;
621   bas->avail     = link;
622   *mylink        = NULL;
623   PetscFunctionReturn(0);
624 }
625 
626 /* Error out on unsupported overlapped communications */
627 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
628 {
629   PetscErrorCode    ierr;
630   PetscSFLink       link,*p;
631   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
632   PetscBool         match;
633 
634   PetscFunctionBegin;
635   if (PetscDefined(USE_DEBUG)) {
636     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
637        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
638     */
639     if (rootdata || leafdata) {
640       for (p=&bas->inuse; (link=*p); p=&link->next) {
641         ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
642         if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
643       }
644     }
645   }
646   PetscFunctionReturn(0);
647 }
648 
649 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
650 {
651   PetscFunctionBegin;
652   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
653   PetscFunctionReturn(0);
654 }
655 
656 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
657 {
658   PetscErrorCode ierr;
659   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
660   PetscBool      is2Int,is2PetscInt;
661   PetscMPIInt    ni,na,nd,combiner;
662 #if defined(PETSC_HAVE_COMPLEX)
663   PetscInt       nPetscComplex=0;
664 #endif
665 
666   PetscFunctionBegin;
667   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
668   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
669   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
670   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
671   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
672   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
673 #if defined(PETSC_HAVE_COMPLEX)
674   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
675 #endif
676   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
677   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
678   /* TODO: shaell we also handle Fortran MPI_2REAL? */
679   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
680   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
681   link->bs = 1; /* default */
682 
683   if (is2Int) {
684     PackInit_PairType_int_int_1_1(link);
685     link->bs        = 1;
686     link->unitbytes = 2*sizeof(int);
687     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
688     link->basicunit = MPI_2INT;
689     link->unit      = MPI_2INT;
690   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
691     PackInit_PairType_PetscInt_PetscInt_1_1(link);
692     link->bs        = 1;
693     link->unitbytes = 2*sizeof(PetscInt);
694     link->basicunit = MPIU_2INT;
695     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
696     link->unit      = MPIU_2INT;
697   } else if (nPetscReal) {
698     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
699     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
700     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
701     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
702     link->bs        = nPetscReal;
703     link->unitbytes = nPetscReal*sizeof(PetscReal);
704     link->basicunit = MPIU_REAL;
705     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
706   } else if (nPetscInt) {
707     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
708     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
709     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
710     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
711     link->bs        = nPetscInt;
712     link->unitbytes = nPetscInt*sizeof(PetscInt);
713     link->basicunit = MPIU_INT;
714     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
715 #if defined(PETSC_USE_64BIT_INDICES)
716   } else if (nInt) {
717     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
718     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
719     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
720     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
721     link->bs        = nInt;
722     link->unitbytes = nInt*sizeof(int);
723     link->basicunit = MPI_INT;
724     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
725 #endif
726   } else if (nSignedChar) {
727     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
728     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
729     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
730     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
731     link->bs        = nSignedChar;
732     link->unitbytes = nSignedChar*sizeof(SignedChar);
733     link->basicunit = MPI_SIGNED_CHAR;
734     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
735   }  else if (nUnsignedChar) {
736     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
737     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
738     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
739     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
740     link->bs        = nUnsignedChar;
741     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
742     link->basicunit = MPI_UNSIGNED_CHAR;
743     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
744 #if defined(PETSC_HAVE_COMPLEX)
745   } else if (nPetscComplex) {
746     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
747     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
748     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
749     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
750     link->bs        = nPetscComplex;
751     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
752     link->basicunit = MPIU_COMPLEX;
753     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
754 #endif
755   } else {
756     MPI_Aint lb,nbyte;
757     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
758     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
759     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
760       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
761       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
762       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
763       link->bs        = nbyte;
764       link->unitbytes = nbyte;
765       link->basicunit = MPI_BYTE;
766     } else {
767       nInt = nbyte / sizeof(int);
768       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
769       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
770       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
771       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
772       link->bs        = nInt;
773       link->unitbytes = nbyte;
774       link->basicunit = MPI_INT;
775     }
776     if (link->isbuiltin) link->unit = unit;
777   }
778 
779   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}
780 
781   link->Memcpy = PetscSFLinkMemcpy_Host;
782   PetscFunctionReturn(0);
783 }
784 
785 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
786 {
787   PetscFunctionBegin;
788   *UnpackAndOp = NULL;
789   if (PetscMemTypeHost(mtype)) {
790     if      (op == MPI_REPLACE)               *UnpackAndOp = link->h_UnpackAndInsert;
791     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
792     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
793     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
794     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
795     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
796     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
797     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
798     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
799     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
800     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
801     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
802     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
803   }
804 #if defined(PETSC_HAVE_DEVICE)
805   else if (PetscMemTypeDevice(mtype) && !atomic) {
806     if      (op == MPI_REPLACE)               *UnpackAndOp = link->d_UnpackAndInsert;
807     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
808     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
809     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
810     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
811     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
812     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
813     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
814     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
815     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
816     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
817     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
818     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
819   } else if (PetscMemTypeDevice(mtype) && atomic) {
820     if      (op == MPI_REPLACE)               *UnpackAndOp = link->da_UnpackAndInsert;
821     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
822     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
823     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
824     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
825     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
826     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
827     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
828     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
829     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
830     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
831     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
832     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
833   }
834 #endif
835   PetscFunctionReturn(0);
836 }
837 
838 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
839 {
840   PetscFunctionBegin;
841   *ScatterAndOp = NULL;
842   if (PetscMemTypeHost(mtype)) {
843     if      (op == MPI_REPLACE)               *ScatterAndOp = link->h_ScatterAndInsert;
844     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
845     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
846     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
847     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
848     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
849     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
850     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
851     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
852     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
853     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
854     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
855     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
856   }
857 #if defined(PETSC_HAVE_DEVICE)
858   else if (PetscMemTypeDevice(mtype) && !atomic) {
859     if      (op == MPI_REPLACE)               *ScatterAndOp = link->d_ScatterAndInsert;
860     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
861     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
862     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
863     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
864     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
865     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
866     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
867     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
868     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
869     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
870     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
871     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
872   } else if (PetscMemTypeDevice(mtype) && atomic) {
873     if      (op == MPI_REPLACE)               *ScatterAndOp = link->da_ScatterAndInsert;
874     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
875     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
876     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
877     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
878     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
879     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
880     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
881     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
882     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
883     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
884     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
885     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
886   }
887 #endif
888   PetscFunctionReturn(0);
889 }
890 
891 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
892 {
893   PetscFunctionBegin;
894   *FetchAndOp = NULL;
895   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
896   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
897 #if defined(PETSC_HAVE_DEVICE)
898   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
899   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOp = link->da_FetchAndAdd;
900 #endif
901   PetscFunctionReturn(0);
902 }
903 
904 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
905 {
906   PetscFunctionBegin;
907   *FetchAndOpLocal = NULL;
908   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
909   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
910 #if defined(PETSC_HAVE_DEVICE)
911   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
912   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
913 #endif
914   PetscFunctionReturn(0);
915 }
916 
917 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
918 {
919   PetscErrorCode ierr;
920   PetscLogDouble flops;
921   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
922 
923   PetscFunctionBegin;
924   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
925     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
926 #if defined(PETSC_HAVE_DEVICE)
927     if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
928 #endif
929     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
930   }
931   PetscFunctionReturn(0);
932 }
933 
934 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
935 {
936   PetscLogDouble flops;
937   PetscErrorCode ierr;
938 
939   PetscFunctionBegin;
940   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
941     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
942 #if defined(PETSC_HAVE_DEVICE)
943     if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
944 #endif
945     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
946   }
947   PetscFunctionReturn(0);
948 }
949 
950 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
951   Input Arguments:
952   +sf      - The StarForest
953   .link    - The link
954   .count   - Number of entries to unpack
955   .start   - The first index, significent when indices=NULL
956   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
957   .buf     - A contiguous buffer to unpack from
958   -op      - Operation after unpack
959 
960   Output Arguments:
961   .data    - The data to unpack to
962 */
963 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
964 {
965   PetscFunctionBegin;
966 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
967   {
968     PetscErrorCode ierr;
969     PetscInt       i;
970     PetscMPIInt    n;
971     if (indices) {
972       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
973          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
974       */
975       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
976     } else {
977       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
978       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr);
979     }
980   }
981   PetscFunctionReturn(0);
982 #else
983   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
984 #endif
985 }
986 
987 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
988 {
989   PetscFunctionBegin;
990 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
991   {
992     PetscErrorCode ierr;
993     PetscInt       i,disp;
994     if (!srcIdx) {
995       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
996     } else {
997       for (i=0; i<count; i++) {
998         disp = dstIdx? dstIdx[i] : dstStart + i;
999         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
1000       }
1001     }
1002   }
1003   PetscFunctionReturn(0);
1004 #else
1005   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1006 #endif
1007 }
1008 
1009 /*=============================================================================
1010               Pack/Unpack/Fetch/Scatter routines
1011  ============================================================================*/
1012 
1013 /* Pack rootdata to rootbuf
1014   Input Arguments:
1015   + sf       - The SF this packing works on.
1016   . link     - It gives the memtype of the roots and also provides root buffer.
1017   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1018   - rootdata - Where to read the roots.
1019 
1020   Notes:
1021   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1022   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
1023  */
1024 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1025 {
1026   PetscErrorCode   ierr;
1027   const PetscInt   *rootindices = NULL;
1028   PetscInt         count,start;
1029   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1030   PetscMemType     rootmtype = link->rootmtype;
1031   PetscSFPackOpt   opt = NULL;
1032 
1033   PetscFunctionBegin;
1034   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1035   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1036     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1037     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1038     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1039   }
1040   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1041   PetscFunctionReturn(0);
1042 }
1043 
1044 /* Pack leafdata to leafbuf */
1045 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1046 {
1047   PetscErrorCode   ierr;
1048   const PetscInt   *leafindices = NULL;
1049   PetscInt         count,start;
1050   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1051   PetscMemType     leafmtype = link->leafmtype;
1052   PetscSFPackOpt   opt = NULL;
1053 
1054   PetscFunctionBegin;
1055   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1056   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1057     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1058     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1059     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1060   }
1061   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1062   PetscFunctionReturn(0);
1063 }
1064 
1065 /* Pack rootdata to rootbuf, which are in the same memory space */
1066 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1067 {
1068   PetscErrorCode   ierr;
1069   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1070 
1071   PetscFunctionBegin;
1072   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
1073     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1074     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */
1075   }
1076   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1077   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);}
1078   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1079   PetscFunctionReturn(0);
1080 }
1081 /* Pack leafdata to leafbuf, which are in the same memory space */
1082 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1083 {
1084   PetscErrorCode   ierr;
1085 
1086   PetscFunctionBegin;
1087   if (scope == PETSCSF_REMOTE) {
1088     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1089     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1090   }
1091   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1092   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);}
1093   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1094   PetscFunctionReturn(0);
1095 }
1096 
1097 PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1098 {
1099   PetscErrorCode   ierr;
1100   const PetscInt   *rootindices = NULL;
1101   PetscInt         count,start;
1102   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1103   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1104   PetscMemType     rootmtype = link->rootmtype;
1105   PetscSFPackOpt   opt = NULL;
1106 
1107   PetscFunctionBegin;
1108   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1109     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1110     if (UnpackAndOp) {
1111       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1112       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1113     } else {
1114       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1115       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1116     }
1117   }
1118   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1119   PetscFunctionReturn(0);
1120 }
1121 
1122 PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1123 {
1124   PetscErrorCode   ierr;
1125   const PetscInt   *leafindices = NULL;
1126   PetscInt         count,start;
1127   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1128   PetscMemType     leafmtype = link->leafmtype;
1129   PetscSFPackOpt   opt = NULL;
1130 
1131   PetscFunctionBegin;
1132   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1133     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1134     if (UnpackAndOp) {
1135       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1136       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1137     } else {
1138       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1139       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1140     }
1141   }
1142   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1143   PetscFunctionReturn(0);
1144 }
1145 /* Unpack rootbuf to rootdata, which are in the same memory space */
1146 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1147 {
1148   PetscErrorCode   ierr;
1149   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1150 
1151   PetscFunctionBegin;
1152   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1153   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);}
1154   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1155   if (scope == PETSCSF_REMOTE) {
1156     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1157     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1158   }
1159   PetscFunctionReturn(0);
1160 }
1161 
1162 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1163 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1164 {
1165   PetscErrorCode   ierr;
1166 
1167   PetscFunctionBegin;
1168   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1169   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);}
1170   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1171   if (scope == PETSCSF_REMOTE) {
1172     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1173     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1174   }
1175   PetscFunctionReturn(0);
1176 }
1177 
1178 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
1179 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1180 {
1181   PetscErrorCode     ierr;
1182   const PetscInt     *rootindices = NULL;
1183   PetscInt           count,start;
1184   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1185   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1186   PetscMemType       rootmtype = link->rootmtype;
1187   PetscSFPackOpt     opt = NULL;
1188 
1189   PetscFunctionBegin;
1190   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1191   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1192     /* Do FetchAndOp on rootdata with rootbuf */
1193     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr);
1194     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1195     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr);
1196   }
1197   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr);
1198   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1199   PetscFunctionReturn(0);
1200 }
1201 
1202 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1203 {
1204   PetscErrorCode       ierr;
1205   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1206   PetscInt             count,rootstart,leafstart;
1207   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1208   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1209   PetscMemType         rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1210   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1211   PetscInt             buflen = sf->leafbuflen[PETSCSF_LOCAL];
1212   char                 *srcbuf = NULL,*dstbuf = NULL;
1213   PetscBool            dstdups;
1214 
1215   if (!buflen) PetscFunctionReturn(0);
1216   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
1217     if (direction == PETSCSF_ROOT2LEAF) {
1218       ierr     = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
1219       srcmtype = rootmtype;
1220       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1221       dstmtype = leafmtype;
1222       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1223     } else {
1224       ierr     = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
1225       srcmtype = leafmtype;
1226       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1227       dstmtype = rootmtype;
1228       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1229     }
1230     ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr);
1231     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
1232     if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);}
1233     if (direction == PETSCSF_ROOT2LEAF) {
1234       ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1235     } else {
1236       ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1237     }
1238   } else {
1239     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1240     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1241     ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr);
1242     if (ScatterAndOp) {
1243       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1244       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1245       if (direction == PETSCSF_ROOT2LEAF) {
1246         ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1247       } else {
1248         ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
1249       }
1250     } else {
1251       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1252       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1253       if (direction == PETSCSF_ROOT2LEAF) {
1254         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1255       } else {
1256         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1257       }
1258     }
1259   }
1260   PetscFunctionReturn(0);
1261 }
1262 
1263 /* Fetch rootdata to leafdata and leafupdate locally */
1264 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1265 {
1266   PetscErrorCode       ierr;
1267   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1268   PetscInt             count,rootstart,leafstart;
1269   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1270   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1271   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1272   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1273 
1274   PetscFunctionBegin;
1275   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1276   if (rootmtype != leafmtype) {
1277     /* The local communication has to go through pack and unpack */
1278     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1279   } else {
1280     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1281     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1282     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1283     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1284   }
1285   PetscFunctionReturn(0);
1286 }
1287 
1288 /*
1289   Create per-rank pack/unpack optimizations based on indice patterns
1290 
1291    Input Parameters:
1292   +  n       - Number of destination ranks
1293   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1294   -  idx     - [*]   Array storing indices
1295 
1296    Output Parameters:
1297   +  opt     - Pack optimizations. NULL if no optimizations.
1298 */
1299 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1300 {
1301   PetscErrorCode ierr;
1302   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1303   PetscBool      optimizable = PETSC_TRUE;
1304   PetscSFPackOpt opt;
1305 
1306   PetscFunctionBegin;
1307   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1308   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1309   opt->n      = opt->array[0] = n;
1310   opt->offset = opt->array + 1;
1311   opt->start  = opt->array + n   + 2;
1312   opt->dx     = opt->array + 2*n + 2;
1313   opt->dy     = opt->array + 3*n + 2;
1314   opt->dz     = opt->array + 4*n + 2;
1315   opt->X      = opt->array + 5*n + 2;
1316   opt->Y      = opt->array + 6*n + 2;
1317 
1318   for (r=0; r<n; r++) { /* For each destination rank */
1319     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1320     p     = offset[r];
1321     start = idx[p]; /* First index for this rank */
1322     p++;
1323 
1324     /* Search in X dimension */
1325     for (dx=1; dx<m; dx++,p++) {
1326       if (start+dx != idx[p]) break;
1327     }
1328 
1329     dydz = m/dx;
1330     X    = dydz > 1 ? (idx[p]-start) : dx;
1331     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1332     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1333     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1334       for (i=0; i<dx; i++,p++) {
1335         if (start+X*dy+i != idx[p]) {
1336           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1337           else goto Z_dimension;
1338         }
1339       }
1340     }
1341 
1342 Z_dimension:
1343     dz = m/(dx*dy);
1344     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1345     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1346     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1347     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1348       for (j=0; j<dy; j++) {
1349         for (i=0; i<dx; i++,p++) {
1350           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1351         }
1352       }
1353     }
1354     opt->start[r] = start;
1355     opt->dx[r]    = dx;
1356     opt->dy[r]    = dy;
1357     opt->dz[r]    = dz;
1358     opt->X[r]     = X;
1359     opt->Y[r]     = Y;
1360   }
1361 
1362 finish:
1363   /* If not optimizable, free arrays to save memory */
1364   if (!n || !optimizable) {
1365     ierr = PetscFree(opt->array);CHKERRQ(ierr);
1366     ierr = PetscFree(opt);CHKERRQ(ierr);
1367     *out = NULL;
1368   } else {
1369     opt->offset[0] = 0;
1370     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1371     *out = opt;
1372   }
1373   PetscFunctionReturn(0);
1374 }
1375 
1376 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1377 {
1378   PetscErrorCode ierr;
1379   PetscSFPackOpt opt = *out;
1380 
1381   PetscFunctionBegin;
1382   if (opt) {
1383     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
1384     ierr = PetscFree(opt);CHKERRQ(ierr);
1385     *out = NULL;
1386   }
1387   PetscFunctionReturn(0);
1388 }
1389 
1390 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1391 {
1392   PetscErrorCode ierr;
1393   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1394   PetscInt       i,j;
1395 
1396   PetscFunctionBegin;
1397   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1398   for (i=0; i<2; i++) { /* Set defaults */
1399     sf->leafstart[i]   = 0;
1400     sf->leafcontig[i]  = PETSC_TRUE;
1401     sf->leafdups[i]    = PETSC_FALSE;
1402     bas->rootstart[i]  = 0;
1403     bas->rootcontig[i] = PETSC_TRUE;
1404     bas->rootdups[i]   = PETSC_FALSE;
1405   }
1406 
1407   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1408   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1409 
1410   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1411   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1412 
1413   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1414   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1415     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1416   }
1417   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1418     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1419   }
1420 
1421   /* If not, see if we can have per-rank optimizations by doing index analysis */
1422   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1423   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1424 
1425   /* Are root indices for self and remote contiguous? */
1426   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1427   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1428 
1429   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1430   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1431 
1432   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1433     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1434   }
1435   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1436     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1437   }
1438 
1439   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1440   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1441 
1442  #if defined(PETSC_HAVE_DEVICE)
1443     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1444   if (PetscDefined(HAVE_DEVICE)) {
1445     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1446     if (!sf->leafcontig[0]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1447     if (!sf->leafcontig[1]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1448     if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1449     if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1450   }
1451 #endif
1452   PetscFunctionReturn(0);
1453 }
1454 
1455 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1456 {
1457   PetscErrorCode ierr;
1458   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1459   PetscInt       i;
1460 
1461   PetscFunctionBegin;
1462   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1463     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1464     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1465    #if defined(PETSC_HAVE_DEVICE)
1466     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1467     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1468    #endif
1469   }
1470   PetscFunctionReturn(0);
1471 }
1472