xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 793a3527f7f9b48b2117473bf24a35e19d9d5c47)
1 #include "petsc/private/sfimpl.h"
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
6 
7 #if defined(PETSC_HAVE_CUDA)
8 #include <cuda_runtime.h>
9 #include <petsccublas.h>
10 #endif
11 #if defined(PETSC_HAVE_HIP)
12 #include <hip/hip_runtime.h>
13 #include <petschipblas.h>
14 #endif
15 /*
16  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
17  * therefore we pack data types manually. This file defines packing routines for the standard data types.
18  */
19 
20 #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
21 
22 /* Operations working like s += t */
23 #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
24 #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
25 #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
26 #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
27 /* Ref MPI MAXLOC */
28 #define OP_XLOC(op,s,t) \
29   do {                                       \
30     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
31     else if (!((s).u op (t).u)) s = t;           \
32   } while (0)
33 
34 /* DEF_PackFunc - macro defining a Pack routine
35 
36    Arguments of the macro:
37    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
38    .BS        Block size for vectorization. It is a factor of bsz.
39    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
40 
41    Arguments of the Pack routine:
42    +count     Number of indices in idx[].
43    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
44    .opt       Per-pack optimization plan. NULL means no such plan.
45    .idx       Indices of entries to packed.
46    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
47    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
48    -packed    Address of the packed data.
49  */
50 #define DEF_PackFunc(Type,BS,EQ) \
51   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
52   {                                                                                                          \
53     PetscErrorCode ierr;                                                                                     \
54     const Type     *u = (const Type*)unpacked,*u2;                                                           \
55     Type           *p = (Type*)packed,*p2;                                                                   \
56     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
57     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
58     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
59     PetscFunctionBegin;                                                                                      \
60     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
61     else if (opt) { /* has optimizations available */                                                        \
62       p2 = p;                                                                                                \
63       for (r=0; r<opt->n; r++) {                                                                             \
64         u2 = u + opt->start[r]*MBS;                                                                          \
65         X  = opt->X[r];                                                                                      \
66         Y  = opt->Y[r];                                                                                      \
67         for (k=0; k<opt->dz[r]; k++)                                                                         \
68           for (j=0; j<opt->dy[r]; j++) {                                                                     \
69             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
70             p2  += opt->dx[r]*MBS;                                                                           \
71           }                                                                                                  \
72       }                                                                                                      \
73     } else {                                                                                                 \
74       for (i=0; i<count; i++)                                                                                \
75         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
76           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
77             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
78     }                                                                                                        \
79     PetscFunctionReturn(0);                                                                                  \
80   }
81 
82 /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
83                 and inserts into a sparse array.
84 
85    Arguments:
86   .Type       Type of the data
87   .BS         Block size for vectorization
88   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
89 
90   Notes:
91    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
92  */
93 #define DEF_UnpackFunc(Type,BS,EQ)               \
94   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
95   {                                                                                                          \
96     PetscErrorCode ierr;                                                                                     \
97     Type           *u = (Type*)unpacked,*u2;                                                                 \
98     const Type     *p = (const Type*)packed;                                                                 \
99     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
100     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
101     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
102     PetscFunctionBegin;                                                                                      \
103     if (!idx) {                                                                                              \
104       u += start*MBS;                                                                                        \
105       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
106     } else if (opt) { /* has optimizations available */                                                      \
107       for (r=0; r<opt->n; r++) {                                                                             \
108         u2 = u + opt->start[r]*MBS;                                                                          \
109         X  = opt->X[r];                                                                                      \
110         Y  = opt->Y[r];                                                                                      \
111         for (k=0; k<opt->dz[r]; k++)                                                                         \
112           for (j=0; j<opt->dy[r]; j++) {                                                                     \
113             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
114             p   += opt->dx[r]*MBS;                                                                           \
115           }                                                                                                  \
116       }                                                                                                      \
117     } else {                                                                                                 \
118       for (i=0; i<count; i++)                                                                                \
119         for (j=0; j<M; j++)                                                                                  \
120           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
121     }                                                                                                        \
122     PetscFunctionReturn(0);                                                                                  \
123   }
124 
125 /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
126 
127    Arguments:
128   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
129   .Type       Type of the data
130   .BS         Block size for vectorization
131   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
132   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
133   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
134  */
135 #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
136   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
137   {                                                                                                          \
138     Type           *u = (Type*)unpacked,*u2;                                                                 \
139     const Type     *p = (const Type*)packed;                                                                 \
140     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
141     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
142     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
143     PetscFunctionBegin;                                                                                      \
144     if (!idx) {                                                                                              \
145       u += start*MBS;                                                                                        \
146       for (i=0; i<count; i++)                                                                                \
147         for (j=0; j<M; j++)                                                                                  \
148           for (k=0; k<BS; k++)                                                                               \
149             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
150     } else if (opt) { /* idx[] has patterns */                                                               \
151       for (r=0; r<opt->n; r++) {                                                                             \
152         u2 = u + opt->start[r]*MBS;                                                                          \
153         X  = opt->X[r];                                                                                      \
154         Y  = opt->Y[r];                                                                                      \
155         for (k=0; k<opt->dz[r]; k++)                                                                         \
156           for (j=0; j<opt->dy[r]; j++) {                                                                     \
157             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
158             p += opt->dx[r]*MBS;                                                                             \
159           }                                                                                                  \
160       }                                                                                                      \
161     } else {                                                                                                 \
162       for (i=0; i<count; i++)                                                                                \
163         for (j=0; j<M; j++)                                                                                  \
164           for (k=0; k<BS; k++)                                                                               \
165             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
166     }                                                                                                        \
167     PetscFunctionReturn(0);                                                                                  \
168   }
169 
170 #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
171   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
172   {                                                                                                          \
173     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
174     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
175     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
176     const PetscInt MBS = M*BS;                                                                               \
177     PetscFunctionBegin;                                                                                      \
178     for (i=0; i<count; i++) {                                                                                \
179       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
180       l = i*MBS;                                                                                             \
181       for (j=0; j<M; j++)                                                                                    \
182         for (k=0; k<BS; k++) {                                                                               \
183           tmp = u[r+j*BS+k];                                                                                 \
184           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
185           p[l+j*BS+k] = tmp;                                                                                 \
186         }                                                                                                    \
187     }                                                                                                        \
188     PetscFunctionReturn(0);                                                                                  \
189   }
190 
191 #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
192   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
193   {                                                                                                          \
194     PetscErrorCode ierr;                                                                                     \
195     const Type     *u = (const Type*)src;                                                                    \
196     Type           *v = (Type*)dst;                                                                          \
197     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
198     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
199     const PetscInt MBS = M*BS;                                                                               \
200     PetscFunctionBegin;                                                                                      \
201     if (!srcIdx) { /* src is contiguous */                                                                   \
202       u += srcStart*MBS;                                                                                     \
203       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
204     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
205       u += srcOpt->start[0]*MBS;                                                                             \
206       v += dstStart*MBS;                                                                                     \
207       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
208       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
209         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
210           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
211           v += srcOpt->dx[0]*MBS;                                                                            \
212         }                                                                                                    \
213     } else { /* all other cases */                                                                           \
214       for (i=0; i<count; i++) {                                                                              \
215         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
216         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
217         for (j=0; j<M; j++)                                                                                  \
218           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
219       }                                                                                                      \
220     }                                                                                                        \
221     PetscFunctionReturn(0);                                                                                  \
222   }
223 
224 #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
225   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
226   {                                                                                                          \
227     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
228     const Type     *ldata = (const Type*)leafdata;                                                           \
229     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
230     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
231     const PetscInt MBS = M*BS;                                                                               \
232     PetscFunctionBegin;                                                                                      \
233     for (i=0; i<count; i++) {                                                                                \
234       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
235       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
236       for (j=0; j<M; j++)                                                                                    \
237         for (k=0; k<BS; k++) {                                                                               \
238           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
239           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
240         }                                                                                                    \
241     }                                                                                                        \
242     PetscFunctionReturn(0);                                                                                  \
243   }
244 
245 /* Pack, Unpack/Fetch ops */
246 #define DEF_Pack(Type,BS,EQ)                                                      \
247   DEF_PackFunc(Type,BS,EQ)                                                        \
248   DEF_UnpackFunc(Type,BS,EQ)                                                      \
249   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
250   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
251     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
252     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
253     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
254   }
255 
256 /* Add, Mult ops */
257 #define DEF_Add(Type,BS,EQ)                                                       \
258   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
259   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
260   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
261   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
262   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
263   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
264   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
265     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
266     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
267     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
268     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
269     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
270     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
271   }
272 
273 /* Max, Min ops */
274 #define DEF_Cmp(Type,BS,EQ)                                                       \
275   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
276   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
277   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
278   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
279   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
280     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
281     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
282     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
283     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
284   }
285 
286 /* Logical ops.
287   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
288   the compilation warning "empty macro arguments are undefined in ISO C90"
289  */
290 #define DEF_Log(Type,BS,EQ)                                                       \
291   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
292   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
293   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
294   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
295   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
296   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
297   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
298     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
299     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
300     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
301     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
302     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
303     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
304   }
305 
306 /* Bitwise ops */
307 #define DEF_Bit(Type,BS,EQ)                                                       \
308   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
309   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
310   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
311   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
312   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
313   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
314   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
315     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
316     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
317     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
318     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
319     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
320     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
321   }
322 
323 /* Maxloc, Minloc ops */
324 #define DEF_Xloc(Type,BS,EQ)                                                      \
325   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
326   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
327   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
328   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
329   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
330     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
331     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
332     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
333     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
334   }
335 
336 #define DEF_IntegerType(Type,BS,EQ)                                               \
337   DEF_Pack(Type,BS,EQ)                                                            \
338   DEF_Add(Type,BS,EQ)                                                             \
339   DEF_Cmp(Type,BS,EQ)                                                             \
340   DEF_Log(Type,BS,EQ)                                                             \
341   DEF_Bit(Type,BS,EQ)                                                             \
342   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
343     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
344     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
345     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
346     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
347     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
348   }
349 
350 #define DEF_RealType(Type,BS,EQ)                                                  \
351   DEF_Pack(Type,BS,EQ)                                                            \
352   DEF_Add(Type,BS,EQ)                                                             \
353   DEF_Cmp(Type,BS,EQ)                                                             \
354   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
355     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
356     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
357     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
358   }
359 
360 #if defined(PETSC_HAVE_COMPLEX)
361 #define DEF_ComplexType(Type,BS,EQ)                                               \
362   DEF_Pack(Type,BS,EQ)                                                            \
363   DEF_Add(Type,BS,EQ)                                                             \
364   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
365     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
366     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
367   }
368 #endif
369 
370 #define DEF_DumbType(Type,BS,EQ)                                                  \
371   DEF_Pack(Type,BS,EQ)                                                            \
372   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
373     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
374   }
375 
376 /* Maxloc, Minloc */
377 #define DEF_PairType(Type,BS,EQ)                                                  \
378   DEF_Pack(Type,BS,EQ)                                                            \
379   DEF_Xloc(Type,BS,EQ)                                                            \
380   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
381     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
382     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
383   }
384 
385 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
386 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
387 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
388 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
389 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
390 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
391 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
392 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
393 
394 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
395 DEF_IntegerType(int,1,1)
396 DEF_IntegerType(int,2,1)
397 DEF_IntegerType(int,4,1)
398 DEF_IntegerType(int,8,1)
399 DEF_IntegerType(int,1,0)
400 DEF_IntegerType(int,2,0)
401 DEF_IntegerType(int,4,0)
402 DEF_IntegerType(int,8,0)
403 #endif
404 
405 /* The typedefs are used to get a typename without space that CPPJoin can handle */
406 typedef signed char SignedChar;
407 DEF_IntegerType(SignedChar,1,1)
408 DEF_IntegerType(SignedChar,2,1)
409 DEF_IntegerType(SignedChar,4,1)
410 DEF_IntegerType(SignedChar,8,1)
411 DEF_IntegerType(SignedChar,1,0)
412 DEF_IntegerType(SignedChar,2,0)
413 DEF_IntegerType(SignedChar,4,0)
414 DEF_IntegerType(SignedChar,8,0)
415 
416 typedef unsigned char UnsignedChar;
417 DEF_IntegerType(UnsignedChar,1,1)
418 DEF_IntegerType(UnsignedChar,2,1)
419 DEF_IntegerType(UnsignedChar,4,1)
420 DEF_IntegerType(UnsignedChar,8,1)
421 DEF_IntegerType(UnsignedChar,1,0)
422 DEF_IntegerType(UnsignedChar,2,0)
423 DEF_IntegerType(UnsignedChar,4,0)
424 DEF_IntegerType(UnsignedChar,8,0)
425 
426 DEF_RealType(PetscReal,1,1)
427 DEF_RealType(PetscReal,2,1)
428 DEF_RealType(PetscReal,4,1)
429 DEF_RealType(PetscReal,8,1)
430 DEF_RealType(PetscReal,1,0)
431 DEF_RealType(PetscReal,2,0)
432 DEF_RealType(PetscReal,4,0)
433 DEF_RealType(PetscReal,8,0)
434 
435 #if defined(PETSC_HAVE_COMPLEX)
436 DEF_ComplexType(PetscComplex,1,1)
437 DEF_ComplexType(PetscComplex,2,1)
438 DEF_ComplexType(PetscComplex,4,1)
439 DEF_ComplexType(PetscComplex,8,1)
440 DEF_ComplexType(PetscComplex,1,0)
441 DEF_ComplexType(PetscComplex,2,0)
442 DEF_ComplexType(PetscComplex,4,0)
443 DEF_ComplexType(PetscComplex,8,0)
444 #endif
445 
446 #define PairType(Type1,Type2) Type1##_##Type2
447 typedef struct {int u; int i;}           PairType(int,int);
448 typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
449 DEF_PairType(PairType(int,int),1,1)
450 DEF_PairType(PairType(PetscInt,PetscInt),1,1)
451 
452 /* If we don't know the basic type, we treat it as a stream of chars or ints */
453 DEF_DumbType(char,1,1)
454 DEF_DumbType(char,2,1)
455 DEF_DumbType(char,4,1)
456 DEF_DumbType(char,1,0)
457 DEF_DumbType(char,2,0)
458 DEF_DumbType(char,4,0)
459 
460 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
461 DEF_DumbType(DumbInt,1,1)
462 DEF_DumbType(DumbInt,2,1)
463 DEF_DumbType(DumbInt,4,1)
464 DEF_DumbType(DumbInt,8,1)
465 DEF_DumbType(DumbInt,1,0)
466 DEF_DumbType(DumbInt,2,0)
467 DEF_DumbType(DumbInt,4,0)
468 DEF_DumbType(DumbInt,8,0)
469 
470 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
471 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
472 {
473   int ierr;
474   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
475   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
476   return MPI_SUCCESS;
477 }
478 #endif
479 
480 PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
481 {
482   PetscErrorCode    ierr;
483   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
484   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
485 
486   PetscFunctionBegin;
487   /* Destroy device-specific fields */
488   if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);}
489 
490   /* Destroy host related fields */
491   if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
492   if (!link->use_nvshmem) {
493     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
494       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
495     }
496     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
497     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
498       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
499       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
500     }
501   }
502   ierr = PetscFree(link);CHKERRQ(ierr);
503   PetscFunctionReturn(0);
504 }
505 
506 PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
507 {
508   PetscErrorCode    ierr;
509 
510   PetscFunctionBegin;
511   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
512  #if defined(PETSC_HAVE_NVSHMEM)
513   {
514     PetscBool use_nvshmem;
515     ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr);
516     if (use_nvshmem) {
517       ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
518       PetscFunctionReturn(0);
519     }
520   }
521  #endif
522   ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
523   PetscFunctionReturn(0);
524 }
525 
526 /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
527    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
528 */
529 PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
530 {
531   PetscErrorCode       ierr;
532   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
533   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
534   const PetscInt       *rootoffset,*leafoffset;
535   PetscMPIInt          n;
536   MPI_Aint             disp;
537   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
538   MPI_Datatype         unit = link->unit;
539   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
540   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
541 
542   PetscFunctionBegin;
543   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
544   if (sf->persistent) {
545     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
546       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
547       if (direction == PETSCSF_LEAF2ROOT) {
548         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
549           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
550           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
551           ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
552         }
553       } else { /* PETSCSF_ROOT2LEAF */
554         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
555           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
556           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
557           ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
558         }
559       }
560       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
561     }
562 
563     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
564       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
565       if (direction == PETSCSF_LEAF2ROOT) {
566         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
567           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
568           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
569           ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
570         }
571       } else { /* PETSCSF_ROOT2LEAF */
572         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
573           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
574           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
575           ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
576         }
577       }
578       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
579     }
580   }
581   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
582   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
583   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
584   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
585   PetscFunctionReturn(0);
586 }
587 
588 
589 PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
590 {
591   PetscErrorCode    ierr;
592   PetscSFLink       link,*p;
593   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
594 
595   PetscFunctionBegin;
596   /* Look for types in cache */
597   for (p=&bas->inuse; (link=*p); p=&link->next) {
598     PetscBool match;
599     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
600     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
601       switch (cmode) {
602       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
603       case PETSC_USE_POINTER: break;
604       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
605       }
606       *mylink = link;
607       PetscFunctionReturn(0);
608     }
609   }
610   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
611 }
612 
613 PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
614 {
615   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
616   PetscSFLink       link = *mylink;
617 
618   PetscFunctionBegin;
619   link->rootdata = NULL;
620   link->leafdata = NULL;
621   link->next     = bas->avail;
622   bas->avail     = link;
623   *mylink        = NULL;
624   PetscFunctionReturn(0);
625 }
626 
627 /* Error out on unsupported overlapped communications */
628 PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
629 {
630   PetscErrorCode    ierr;
631   PetscSFLink       link,*p;
632   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
633   PetscBool         match;
634 
635   PetscFunctionBegin;
636   if (PetscDefined(USE_DEBUG)) {
637     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
638        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
639     */
640     if (rootdata || leafdata) {
641       for (p=&bas->inuse; (link=*p); p=&link->next) {
642         ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
643         if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
644       }
645     }
646   }
647   PetscFunctionReturn(0);
648 }
649 
650 static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
651 {
652   PetscFunctionBegin;
653   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
654   PetscFunctionReturn(0);
655 }
656 
657 
658 PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
659 {
660   PetscErrorCode ierr;
661   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
662   PetscBool      is2Int,is2PetscInt;
663   PetscMPIInt    ni,na,nd,combiner;
664 #if defined(PETSC_HAVE_COMPLEX)
665   PetscInt       nPetscComplex=0;
666 #endif
667 
668   PetscFunctionBegin;
669   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
670   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
671   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
672   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
673   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
674   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
675 #if defined(PETSC_HAVE_COMPLEX)
676   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
677 #endif
678   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
679   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
680   /* TODO: shaell we also handle Fortran MPI_2REAL? */
681   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
682   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
683   link->bs = 1; /* default */
684 
685   if (is2Int) {
686     PackInit_PairType_int_int_1_1(link);
687     link->bs        = 1;
688     link->unitbytes = 2*sizeof(int);
689     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
690     link->basicunit = MPI_2INT;
691     link->unit      = MPI_2INT;
692   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
693     PackInit_PairType_PetscInt_PetscInt_1_1(link);
694     link->bs        = 1;
695     link->unitbytes = 2*sizeof(PetscInt);
696     link->basicunit = MPIU_2INT;
697     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
698     link->unit      = MPIU_2INT;
699   } else if (nPetscReal) {
700     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
701     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
702     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
703     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
704     link->bs        = nPetscReal;
705     link->unitbytes = nPetscReal*sizeof(PetscReal);
706     link->basicunit = MPIU_REAL;
707     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
708   } else if (nPetscInt) {
709     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
710     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
711     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
712     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
713     link->bs        = nPetscInt;
714     link->unitbytes = nPetscInt*sizeof(PetscInt);
715     link->basicunit = MPIU_INT;
716     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
717 #if defined(PETSC_USE_64BIT_INDICES)
718   } else if (nInt) {
719     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
720     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
721     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
722     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
723     link->bs        = nInt;
724     link->unitbytes = nInt*sizeof(int);
725     link->basicunit = MPI_INT;
726     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
727 #endif
728   } else if (nSignedChar) {
729     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
730     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
731     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
732     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
733     link->bs        = nSignedChar;
734     link->unitbytes = nSignedChar*sizeof(SignedChar);
735     link->basicunit = MPI_SIGNED_CHAR;
736     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
737   }  else if (nUnsignedChar) {
738     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
739     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
740     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
741     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
742     link->bs        = nUnsignedChar;
743     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
744     link->basicunit = MPI_UNSIGNED_CHAR;
745     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
746 #if defined(PETSC_HAVE_COMPLEX)
747   } else if (nPetscComplex) {
748     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
749     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
750     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
751     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
752     link->bs        = nPetscComplex;
753     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
754     link->basicunit = MPIU_COMPLEX;
755     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
756 #endif
757   } else {
758     MPI_Aint lb,nbyte;
759     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
760     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
761     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
762       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
763       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
764       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
765       link->bs        = nbyte;
766       link->unitbytes = nbyte;
767       link->basicunit = MPI_BYTE;
768     } else {
769       nInt = nbyte / sizeof(int);
770       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
771       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
772       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
773       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
774       link->bs        = nInt;
775       link->unitbytes = nbyte;
776       link->basicunit = MPI_INT;
777     }
778     if (link->isbuiltin) link->unit = unit;
779   }
780 
781   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}
782 
783   link->Memcpy = PetscSFLinkMemcpy_Host;
784   PetscFunctionReturn(0);
785 }
786 
787 PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
788 {
789   PetscFunctionBegin;
790   *UnpackAndOp = NULL;
791   if (PetscMemTypeHost(mtype)) {
792     if      (op == MPI_REPLACE)               *UnpackAndOp = link->h_UnpackAndInsert;
793     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
794     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
795     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
796     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
797     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
798     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
799     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
800     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
801     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
802     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
803     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
804     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
805   }
806 #if defined(PETSC_HAVE_DEVICE)
807   else if (PetscMemTypeDevice(mtype) && !atomic) {
808     if      (op == MPI_REPLACE)               *UnpackAndOp = link->d_UnpackAndInsert;
809     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
810     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
811     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
812     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
813     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
814     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
815     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
816     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
817     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
818     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
819     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
820     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
821   } else if (PetscMemTypeDevice(mtype) && atomic) {
822     if      (op == MPI_REPLACE)               *UnpackAndOp = link->da_UnpackAndInsert;
823     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
824     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
825     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
826     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
827     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
828     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
829     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
830     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
831     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
832     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
833     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
834     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
835   }
836 #endif
837   PetscFunctionReturn(0);
838 }
839 
840 PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
841 {
842   PetscFunctionBegin;
843   *ScatterAndOp = NULL;
844   if (PetscMemTypeHost(mtype)) {
845     if      (op == MPI_REPLACE)               *ScatterAndOp = link->h_ScatterAndInsert;
846     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
847     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
848     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
849     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
850     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
851     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
852     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
853     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
854     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
855     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
856     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
857     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
858   }
859 #if defined(PETSC_HAVE_DEVICE)
860   else if (PetscMemTypeDevice(mtype) && !atomic) {
861     if      (op == MPI_REPLACE)               *ScatterAndOp = link->d_ScatterAndInsert;
862     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
863     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
864     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
865     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
866     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
867     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
868     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
869     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
870     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
871     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
872     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
873     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
874   } else if (PetscMemTypeDevice(mtype) && atomic) {
875     if      (op == MPI_REPLACE)               *ScatterAndOp = link->da_ScatterAndInsert;
876     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
877     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
878     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
879     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
880     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
881     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
882     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
883     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
884     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
885     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
886     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
887     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
888   }
889 #endif
890   PetscFunctionReturn(0);
891 }
892 
893 PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
894 {
895   PetscFunctionBegin;
896   *FetchAndOp = NULL;
897   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
898   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
899 #if defined(PETSC_HAVE_DEVICE)
900   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
901   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOp = link->da_FetchAndAdd;
902 #endif
903   PetscFunctionReturn(0);
904 }
905 
906 PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
907 {
908   PetscFunctionBegin;
909   *FetchAndOpLocal = NULL;
910   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
911   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
912 #if defined(PETSC_HAVE_DEVICE)
913   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
914   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
915 #endif
916   PetscFunctionReturn(0);
917 }
918 
919 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
920 {
921   PetscErrorCode ierr;
922   PetscLogDouble flops;
923   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
924 
925   PetscFunctionBegin;
926   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
927     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
928 #if defined(PETSC_HAVE_DEVICE)
929     if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
930 #endif
931     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
932   }
933   PetscFunctionReturn(0);
934 }
935 
936 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
937 {
938   PetscLogDouble flops;
939   PetscErrorCode ierr;
940 
941   PetscFunctionBegin;
942   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
943     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
944 #if defined(PETSC_HAVE_DEVICE)
945     if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
946 #endif
947     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
948   }
949   PetscFunctionReturn(0);
950 }
951 
952 /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
953   Input Arguments:
954   +sf      - The StarForest
955   .link    - The link
956   .count   - Number of entries to unpack
957   .start   - The first index, significent when indices=NULL
958   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
959   .buf     - A contiguous buffer to unpack from
960   -op      - Operation after unpack
961 
962   Output Arguments:
963   .data    - The data to unpack to
964 */
965 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
966 {
967   PetscFunctionBegin;
968 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
969   {
970     PetscErrorCode ierr;
971     PetscInt       i;
972     PetscMPIInt    n;
973     if (indices) {
974       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
975          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
976       */
977       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
978     } else {
979       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
980       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr);
981     }
982   }
983   PetscFunctionReturn(0);
984 #else
985   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
986 #endif
987 }
988 
989 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
990 {
991   PetscFunctionBegin;
992 #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
993   {
994     PetscErrorCode ierr;
995     PetscInt       i,disp;
996     if (!srcIdx) {
997       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
998     } else {
999       for (i=0; i<count; i++) {
1000         disp = dstIdx? dstIdx[i] : dstStart + i;
1001         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
1002       }
1003     }
1004   }
1005   PetscFunctionReturn(0);
1006 #else
1007   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1008 #endif
1009 }
1010 
1011 /*=============================================================================
1012               Pack/Unpack/Fetch/Scatter routines
1013  ============================================================================*/
1014 
1015 /* Pack rootdata to rootbuf
1016   Input Arguments:
1017   + sf       - The SF this packing works on.
1018   . link     - It gives the memtype of the roots and also provides root buffer.
1019   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1020   - rootdata - Where to read the roots.
1021 
1022   Notes:
1023   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1024   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
1025  */
1026 PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1027 {
1028   PetscErrorCode   ierr;
1029   const PetscInt   *rootindices = NULL;
1030   PetscInt         count,start;
1031   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1032   PetscMemType     rootmtype = link->rootmtype;
1033   PetscSFPackOpt   opt = NULL;
1034 
1035   PetscFunctionBegin;
1036   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1037   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1038     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1039     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1040     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1041   }
1042   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1043   PetscFunctionReturn(0);
1044 }
1045 
1046 /* Pack leafdata to leafbuf */
1047 PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1048 {
1049   PetscErrorCode   ierr;
1050   const PetscInt   *leafindices = NULL;
1051   PetscInt         count,start;
1052   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1053   PetscMemType     leafmtype = link->leafmtype;
1054   PetscSFPackOpt   opt = NULL;
1055 
1056   PetscFunctionBegin;
1057   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1058   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1059     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1060     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1061     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1062   }
1063   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1064   PetscFunctionReturn(0);
1065 }
1066 
1067 /* Pack rootdata to rootbuf, which are in the same memory space */
1068 PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1069 {
1070   PetscErrorCode   ierr;
1071   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1072 
1073   PetscFunctionBegin;
1074   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
1075     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1076     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */
1077   }
1078   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1079   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);}
1080   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1081   PetscFunctionReturn(0);
1082 }
1083 /* Pack leafdata to leafbuf, which are in the same memory space */
1084 PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1085 {
1086   PetscErrorCode   ierr;
1087 
1088   PetscFunctionBegin;
1089   if (scope == PETSCSF_REMOTE) {
1090     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1091     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1092   }
1093   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1094   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);}
1095   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1096   PetscFunctionReturn(0);
1097 }
1098 
1099 PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1100 {
1101   PetscErrorCode   ierr;
1102   const PetscInt   *rootindices = NULL;
1103   PetscInt         count,start;
1104   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1105   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1106   PetscMemType     rootmtype = link->rootmtype;
1107   PetscSFPackOpt   opt = NULL;
1108 
1109   PetscFunctionBegin;
1110   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1111     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1112     if (UnpackAndOp) {
1113       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1114       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1115     } else {
1116       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1117       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1118     }
1119   }
1120   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1121   PetscFunctionReturn(0);
1122 }
1123 
1124 PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1125 {
1126   PetscErrorCode   ierr;
1127   const PetscInt   *leafindices = NULL;
1128   PetscInt         count,start;
1129   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1130   PetscMemType     leafmtype = link->leafmtype;
1131   PetscSFPackOpt   opt = NULL;
1132 
1133   PetscFunctionBegin;
1134   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1135     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1136     if (UnpackAndOp) {
1137       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1138       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1139     } else {
1140       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1141       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1142     }
1143   }
1144   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1145   PetscFunctionReturn(0);
1146 }
1147 /* Unpack rootbuf to rootdata, which are in the same memory space */
1148 PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1149 {
1150   PetscErrorCode   ierr;
1151   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1152 
1153   PetscFunctionBegin;
1154   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1155   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);}
1156   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1157   if (scope == PETSCSF_REMOTE) {
1158     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1159     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1160   }
1161   PetscFunctionReturn(0);
1162 }
1163 
1164 /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1165 PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1166 {
1167   PetscErrorCode   ierr;
1168 
1169   PetscFunctionBegin;
1170   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1171   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);}
1172   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1173   if (scope == PETSCSF_REMOTE) {
1174     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);}  /* Used by SF nvshmem */
1175     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
1176   }
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
1181 PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1182 {
1183   PetscErrorCode     ierr;
1184   const PetscInt     *rootindices = NULL;
1185   PetscInt           count,start;
1186   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1187   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1188   PetscMemType       rootmtype = link->rootmtype;
1189   PetscSFPackOpt     opt = NULL;
1190 
1191   PetscFunctionBegin;
1192   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1193   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1194     /* Do FetchAndOp on rootdata with rootbuf */
1195     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr);
1196     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1197     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr);
1198   }
1199   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr);
1200   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1201   PetscFunctionReturn(0);
1202 }
1203 
1204 PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1205 {
1206   PetscErrorCode       ierr;
1207   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1208   PetscInt             count,rootstart,leafstart;
1209   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1210   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1211   PetscMemType         rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1212   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1213   PetscInt             buflen = sf->leafbuflen[PETSCSF_LOCAL];
1214   char                 *srcbuf = NULL,*dstbuf = NULL;
1215   PetscBool            dstdups;
1216 
1217   if (!buflen) PetscFunctionReturn(0);
1218   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
1219     if (direction == PETSCSF_ROOT2LEAF) {
1220       ierr     = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
1221       srcmtype = rootmtype;
1222       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1223       dstmtype = leafmtype;
1224       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1225     } else {
1226       ierr     = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
1227       srcmtype = leafmtype;
1228       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1229       dstmtype = rootmtype;
1230       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1231     }
1232     ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr);
1233     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
1234     if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);}
1235     if (direction == PETSCSF_ROOT2LEAF) {
1236       ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1237     } else {
1238       ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1239     }
1240   } else {
1241     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1242     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1243     ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr);
1244     if (ScatterAndOp) {
1245       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1246       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1247       if (direction == PETSCSF_ROOT2LEAF) {
1248         ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1249       } else {
1250         ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
1251       }
1252     } else {
1253       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1254       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1255       if (direction == PETSCSF_ROOT2LEAF) {
1256         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1257       } else {
1258         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1259       }
1260     }
1261   }
1262   PetscFunctionReturn(0);
1263 }
1264 
1265 /* Fetch rootdata to leafdata and leafupdate locally */
1266 PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1267 {
1268   PetscErrorCode       ierr;
1269   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1270   PetscInt             count,rootstart,leafstart;
1271   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1272   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1273   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1274   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1275 
1276   PetscFunctionBegin;
1277   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1278   if (rootmtype != leafmtype) {
1279     /* The local communication has to go through pack and unpack */
1280     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1281   } else {
1282     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1283     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1284     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1285     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1286   }
1287   PetscFunctionReturn(0);
1288 }
1289 
1290 /*
1291   Create per-rank pack/unpack optimizations based on indice patterns
1292 
1293    Input Parameters:
1294   +  n       - Number of destination ranks
1295   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1296   -  idx     - [*]   Array storing indices
1297 
1298    Output Parameters:
1299   +  opt     - Pack optimizations. NULL if no optimizations.
1300 */
1301 PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1302 {
1303   PetscErrorCode ierr;
1304   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1305   PetscBool      optimizable = PETSC_TRUE;
1306   PetscSFPackOpt opt;
1307 
1308   PetscFunctionBegin;
1309   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1310   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1311   opt->n      = opt->array[0] = n;
1312   opt->offset = opt->array + 1;
1313   opt->start  = opt->array + n   + 2;
1314   opt->dx     = opt->array + 2*n + 2;
1315   opt->dy     = opt->array + 3*n + 2;
1316   opt->dz     = opt->array + 4*n + 2;
1317   opt->X      = opt->array + 5*n + 2;
1318   opt->Y      = opt->array + 6*n + 2;
1319 
1320   for (r=0; r<n; r++) { /* For each destination rank */
1321     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1322     p     = offset[r];
1323     start = idx[p]; /* First index for this rank */
1324     p++;
1325 
1326     /* Search in X dimension */
1327     for (dx=1; dx<m; dx++,p++) {
1328       if (start+dx != idx[p]) break;
1329     }
1330 
1331     dydz = m/dx;
1332     X    = dydz > 1 ? (idx[p]-start) : dx;
1333     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1334     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1335     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1336       for (i=0; i<dx; i++,p++) {
1337         if (start+X*dy+i != idx[p]) {
1338           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1339           else goto Z_dimension;
1340         }
1341       }
1342     }
1343 
1344 Z_dimension:
1345     dz = m/(dx*dy);
1346     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1347     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1348     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1349     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1350       for (j=0; j<dy; j++) {
1351         for (i=0; i<dx; i++,p++) {
1352           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1353         }
1354       }
1355     }
1356     opt->start[r] = start;
1357     opt->dx[r]    = dx;
1358     opt->dy[r]    = dy;
1359     opt->dz[r]    = dz;
1360     opt->X[r]     = X;
1361     opt->Y[r]     = Y;
1362   }
1363 
1364 finish:
1365   /* If not optimizable, free arrays to save memory */
1366   if (!n || !optimizable) {
1367     ierr = PetscFree(opt->array);CHKERRQ(ierr);
1368     ierr = PetscFree(opt);CHKERRQ(ierr);
1369     *out = NULL;
1370   } else {
1371     opt->offset[0] = 0;
1372     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1373     *out = opt;
1374   }
1375   PetscFunctionReturn(0);
1376 }
1377 
1378 PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1379 {
1380   PetscErrorCode ierr;
1381   PetscSFPackOpt opt = *out;
1382 
1383   PetscFunctionBegin;
1384   if (opt) {
1385     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
1386     ierr = PetscFree(opt);CHKERRQ(ierr);
1387     *out = NULL;
1388   }
1389   PetscFunctionReturn(0);
1390 }
1391 
1392 PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1393 {
1394   PetscErrorCode ierr;
1395   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1396   PetscInt       i,j;
1397 
1398   PetscFunctionBegin;
1399   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1400   for (i=0; i<2; i++) { /* Set defaults */
1401     sf->leafstart[i]   = 0;
1402     sf->leafcontig[i]  = PETSC_TRUE;
1403     sf->leafdups[i]    = PETSC_FALSE;
1404     bas->rootstart[i]  = 0;
1405     bas->rootcontig[i] = PETSC_TRUE;
1406     bas->rootdups[i]   = PETSC_FALSE;
1407   }
1408 
1409   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1410   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1411 
1412   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1413   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1414 
1415   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1416   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1417     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1418   }
1419   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1420     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1421   }
1422 
1423   /* If not, see if we can have per-rank optimizations by doing index analysis */
1424   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1425   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1426 
1427   /* Are root indices for self and remote contiguous? */
1428   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1429   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1430 
1431   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1432   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1433 
1434   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1435     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1436   }
1437   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1438     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1439   }
1440 
1441   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1442   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1443 
1444  #if defined(PETSC_HAVE_DEVICE)
1445     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1446   if (PetscDefined(HAVE_DEVICE)) {
1447     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1448     if (!sf->leafcontig[0]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1449     if (!sf->leafcontig[1]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1450     if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1451     if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1452   }
1453 #endif
1454   PetscFunctionReturn(0);
1455 }
1456 
1457 PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1458 {
1459   PetscErrorCode ierr;
1460   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1461   PetscInt       i;
1462 
1463   PetscFunctionBegin;
1464   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1465     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1466     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1467    #if defined(PETSC_HAVE_DEVICE)
1468     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1469     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1470    #endif
1471   }
1472   PetscFunctionReturn(0);
1473 }
1474