xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision fe2efc57b7777594bce7568e90822861480cbdc8)
1 
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6 #include <cuda_runtime.h>
7 #endif
8 /*
9  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
10  * therefore we pack data types manually. This file defines packing routines for the standard data types.
11  */
12 
13 #define CPPJoin2(a,b)         a ##_## b
14 #define CPPJoin3(a,b,c)       a ##_## b ##_## c
15 #define CPPJoin4_(a,b,c,d)    a##b##_##c##_##d
16 #define CPPJoin4(a,b,c,d)     CPPJoin4_(a##_,b,c,d)
17 
18 #define EXECUTE(statement)    statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */
19 #define IGNORE(statement)     do {} while(0)
20 
21 #define BINARY_OP(r,s,op,t)   do {(r) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
22 #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
23 #define LXOR_OP(r,s,op,t)     do {(r) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR */
24 #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0)
25 
26 #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */
27 
28 /* DEF_PackFunc - macro defining a Pack routine
29 
30    Arguments of the macro:
31    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
32    .BS        Block size for vectorization. It is a factor of bs.
33    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
34 
35    Arguments of the Pack routine:
36    +count     Number of indices in idx[]
37    .idx       Indices of entries to packed. NULL means contiguous indices, that is [0,count)
38    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39    .opt       Pack optimization plans. NULL means no plan at all.
40    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count)
41    -packed    Address of the packed data for each rank
42  */
43 #define DEF_PackFunc(Type,BS,EQ) \
44   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed) \
45   {                                                                                                          \
46     PetscErrorCode ierr;                                                                                     \
47     const Type     *u = (const Type*)unpacked,*u2;                                                           \
48     Type           *p = (Type*)packed,*p2;                                                                   \
49     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
50     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
51     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
52     PetscFunctionBegin;                                                                                      \
53     if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);}  /* Indices are contiguous */             \
54     else if (!opt) { /* No optimizations available */                                                        \
55       for (i=0; i<count; i++)                                                                                \
56         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
57           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
58             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
59     } else {                                                                                                 \
60       for (r=0; r<opt->n; r++) {                                                                             \
61         p2  = p + opt->offset[r]*MBS;                                                                        \
62         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
63           idx2 = idx + opt->offset[r];                                                                       \
64           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
65             for (j=0; j<M; j++)                                                                              \
66               for (k=0; k<BS; k++)                                                                           \
67                 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k];                                                    \
68         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
69           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) {                                        \
70             u2   = u + idx[opt->copy_start[i]]*MBS;                                                          \
71             l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */                       \
72             ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr);                                                     \
73             p2  += l;                                                                                        \
74           }                                                                                                  \
75         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
76           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
77           step = opt->stride_step[r];                                                                        \
78           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
79             for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j];                                            \
80         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
81       }                                                                                                      \
82     }                                                                                                        \
83     PetscFunctionReturn(0);                                                                                  \
84   }
85 
86 /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine
87 
88    Arguments:
89   +action     Unpack or Fetch
90   .Type       Type of the data
91   .BS         Block size for vectorization
92   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
93   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
94   .CType      Type with or without the const qualifier, i.e., const Type or Type
95   .Cvoid      void with or without the const qualifier, i.e., const void or void
96 
97   Notes:
98    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
99    The two arguments CType and Cvoid are used (instead of one constness argument), because we want to
100    get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument,
101    sometimes we input 'const', sometimes we have to input empty.
102 
103    If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare.
104  */
105 #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid)               \
106   static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
107   {                                                                                                          \
108     PetscErrorCode ierr;                                                                                     \
109     Type           *u = (Type*)unpacked,*u2;                                                                 \
110     CType          *p = (CType*)packed,*p2;                                                                  \
111     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
112     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
113     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
114     PetscFunctionBegin;                                                                                      \
115     if (!idx) {                                                                                              \
116       FILTER(Type *v);                                                                                       \
117       FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr));                                               \
118       FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr));                                             \
119              ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);                                              \
120       FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr));                                             \
121       FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                             \
122     } else if (!opt) { /* No optimizations available */                                                      \
123       for (i=0; i<count; i++)                                                                                \
124         for (j=0; j<M; j++)                                                                                  \
125           for (k=0; k<BS; k++) {                                                                             \
126             FILTER(Type t                = u[idx[i]*MBS+j*BS+k]);                                            \
127                    u[idx[i]*MBS+j*BS+k]  = p[i*MBS+j*BS+k];                                                  \
128             FILTER(p[i*MBS+j*BS+k]       = t);                                                               \
129           }                                                                                                  \
130     } else {                                                                                                 \
131       for (r=0; r<opt->n; r++) {                                                                             \
132         p2 = p + opt->offset[r]*MBS;                                                                         \
133         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
134           idx2 = idx + opt->offset[r];                                                                       \
135           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
136             for (j=0; j<M; j++)                                                                              \
137               for (k=0; k<BS; k++) {                                                                         \
138                 FILTER(Type t                = u[idx2[i]*MBS+j*BS+k]);                                       \
139                        u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
140                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
141               }                                                                                              \
142         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
143           FILTER(Type *v);                                                                                   \
144           FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \
145           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
146             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
147             l  = opt->copy_length[i]*MBS;                                                                    \
148             FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr));                                              \
149                    ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr);                                              \
150             FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr));                                              \
151             p2 += l;                                                                                         \
152           }                                                                                                  \
153           FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                         \
154         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
155           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
156           step = opt->stride_step[r];                                                                        \
157           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
158             for (j=0; j<M; j++)                                                                              \
159               for (k=0; k<BS; k++) {                                                                         \
160                 FILTER(Type t                = u2[i*step*MBS+j*BS+k]);                                       \
161                        u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
162                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
163               }                                                                                              \
164         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
165       }                                                                                                      \
166     }                                                                                                        \
167     PetscFunctionReturn(0);                                                                                  \
168   }
169 
170 /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc
171 
172    Arguments:
173   +action     Unpack or Fetch
174   .opname     Name of the Op, such as Add, Mult, LAND, etc.
175   .Type       Type of the data
176   .BS         Block size for vectorization
177   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
178   .op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
179   .APPLY      Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP
180   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
181   .CType      Type with or without the const qualifier, i.e., const Type or Type
182   -Cvoid      void with or without the const qualifier, i.e., const void or void
183  */
184 #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \
185   static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
186   {                                                                                                          \
187     Type           *u = (Type*)unpacked,*u2,t;                                                               \
188     CType          *p = (CType*)packed,*p2;                                                                  \
189     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
190     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
191     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
192     PetscFunctionBegin;                                                                                      \
193     if (!idx) {                                                                                              \
194       for (i=0; i<count; i++)                                                                                \
195         for (j=0; j<M; j++)                                                                                  \
196           for (k=0; k<BS; k++) {                                                                             \
197             t    = u[i*MBS+j*BS+k];                                                                          \
198             APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                                    \
199             FILTER(p[i*MBS+j*BS+k] = t);                                                                     \
200           }                                                                                                  \
201     } else if (!opt) { /* No optimizations available */                                                      \
202       for (i=0; i<count; i++)                                                                                \
203         for (j=0; j<M; j++)                                                                                  \
204           for (k=0; k<BS; k++) {                                                                             \
205               t    = u[idx[i]*MBS+j*BS+k];                                                                   \
206               APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                             \
207               FILTER(p[i*MBS+j*BS+k] = t);                                                                   \
208           }                                                                                                  \
209     } else {                                                                                                 \
210       for (r=0; r<opt->n; r++) {                                                                             \
211         p2 = p + opt->offset[r]*MBS;                                                                         \
212         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
213           idx2 = idx + opt->offset[r];                                                                       \
214           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
215             for (j=0; j<M; j++)                                                                              \
216               for (k=0; k<BS; k++) {                                                                         \
217                 t    = u[idx2[i]*MBS+j*BS+k];                                                                \
218                 APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
219                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
220               }                                                                                              \
221         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
222           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
223             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
224             l  = opt->copy_length[i]*MBS;                                                                    \
225             for (j=0; j<l; j++) {                                                                            \
226               t    = u2[j];                                                                                  \
227               APPLY (u2[j],t,op,p2[j]);                                                                      \
228               FILTER(p2[j] = t);                                                                             \
229             }                                                                                                \
230             p2 += l;                                                                                         \
231           }                                                                                                  \
232         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
233           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
234           step = opt->stride_step[r];                                                                        \
235           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
236             for (j=0; j<M; j++)                                                                              \
237               for (k=0; k<BS; k++) {                                                                         \
238                 t    = u2[i*step*MBS+j*BS+k];                                                                \
239                 APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
240                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
241               }                                                                                              \
242         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
243       }                                                                                                      \
244     }                                                                                                        \
245     PetscFunctionReturn(0);                                                                                  \
246   }
247 
248 /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine
249 
250    Arguments:
251   +Action     Unpack or Fetch
252   .locname    Max or Min
253   .type1      Type of the first data in a pair type
254   .type2      Type of the second data in a pair type, usually PetscMPIInt for MPI ranks.
255   .op         > or <
256   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
257   .CType      Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2)
258   -Cvoid      void with or without the const qualifier, i.e., const void or void
259  */
260 #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \
261   static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \
262     PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked;                                             \
263     CType                 *p = (CType*)packed;                                                               \
264     PetscInt              i,j;                                                                               \
265     for (i=0; i<count; i++) {                                                                                \
266       FILTER(PairType(Type1,Type2) v);                                                                       \
267       j = idx? idx[i] : i;                                                                                   \
268       FILTER(v = u[j]);                                                                                      \
269       if (p[i].a op u[j].a) {                                                                                \
270         u[j] = p[i];                                                                                         \
271       } else if (p[i].a == u[j].a) {                                                                         \
272         u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */                                 \
273       }                                                                                                      \
274       FILTER(p[i] = v);                                                                                      \
275     }                                                                                                        \
276     PetscFunctionReturn(0);                                                                                  \
277   }
278 
279 /* Pack, Unpack/Fetch ops */
280 #define DEF_Pack(Type,BS,EQ)                                                                   \
281   DEF_PackFunc(Type,BS,EQ)                                                                     \
282   DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void)                                   \
283   DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void)                                              \
284   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) {                           \
285     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);                            \
286     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);                            \
287     link->h_FetchAndInsert  = CPPJoin4(FetchAndInsert, Type,BS,EQ);                            \
288   }
289 
290 /* Add, Mult ops */
291 #define DEF_Add(Type,BS,EQ)                                                                    \
292   DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void)             \
293   DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void)             \
294   DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void)                        \
295   DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void)                        \
296   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) {                            \
297     link->h_UnpackAndAdd    = CPPJoin4(UnpackAndAdd,   Type,BS,EQ);                            \
298     link->h_UnpackAndMult   = CPPJoin4(UnpackAndMult,  Type,BS,EQ);                            \
299     link->h_FetchAndAdd     = CPPJoin4(FetchAndAdd,    Type,BS,EQ);                            \
300     link->h_FetchAndMult    = CPPJoin4(FetchAndMult,   Type,BS,EQ);                            \
301   }
302 
303 /* Max, Min ops */
304 #define DEF_Cmp(Type,BS,EQ)                                                                    \
305   DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void)     \
306   DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void)     \
307   DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void)                \
308   DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void)                \
309   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) {                        \
310     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);                            \
311     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);                            \
312     link->h_FetchAndMax     = CPPJoin4(FetchAndMax ,   Type,BS,EQ);                            \
313     link->h_FetchAndMin     = CPPJoin4(FetchAndMin ,   Type,BS,EQ);                            \
314   }
315 
316 /* Logical ops.
317   The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid
318   the compilation warning "empty macro arguments are undefined in ISO C90"
319  */
320 #define DEF_Log(Type,BS,EQ)                                                                    \
321   DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void)            \
322   DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void)            \
323   DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP,  IGNORE,const Type,const void)            \
324   DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void)                       \
325   DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void)                       \
326   DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP,  EXECUTE,Type,void)                       \
327   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) {                        \
328     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND,Type,BS,EQ);                              \
329     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR, Type,BS,EQ);                              \
330     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR,Type,BS,EQ);                              \
331     link->h_FetchAndLAND    = CPPJoin4(FetchAndLAND, Type,BS,EQ);                              \
332     link->h_FetchAndLOR     = CPPJoin4(FetchAndLOR,  Type,BS,EQ);                              \
333     link->h_FetchAndLXOR    = CPPJoin4(FetchAndLXOR, Type,BS,EQ);                              \
334   }
335 
336 /* Bitwise ops */
337 #define DEF_Bit(Type,BS,EQ)                                                                    \
338   DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void)             \
339   DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void)             \
340   DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void)             \
341   DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void)                        \
342   DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void)                        \
343   DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void)                        \
344   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) {                        \
345     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND,Type,BS,EQ);                              \
346     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR, Type,BS,EQ);                              \
347     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR,Type,BS,EQ);                              \
348     link->h_FetchAndBAND    = CPPJoin4(FetchAndBAND, Type,BS,EQ);                              \
349     link->h_FetchAndBOR     = CPPJoin4(FetchAndBOR,  Type,BS,EQ);                              \
350     link->h_FetchAndBXOR    = CPPJoin4(FetchAndBXOR, Type,BS,EQ);                              \
351   }
352 
353 /* Maxloc, Minloc */
354 #define DEF_Xloc(Type1,Type2)                                                                  \
355   DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void)    \
356   DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void)    \
357   DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void)               \
358   DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void)               \
359   static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) {                          \
360     link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1);             \
361     link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1);             \
362     link->h_FetchAndMaxloc  = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1);             \
363     link->h_FetchAndMinloc  = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1);             \
364   }
365 
366 #define DEF_IntegerType(Type,BS,EQ)                                                            \
367   DEF_Pack(Type,BS,EQ)                                                                         \
368   DEF_Add(Type,BS,EQ)                                                                          \
369   DEF_Cmp(Type,BS,EQ)                                                                          \
370   DEF_Log(Type,BS,EQ)                                                                          \
371   DEF_Bit(Type,BS,EQ)                                                                          \
372   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) {                    \
373     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
374     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
375     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
376     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                               \
377     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                               \
378   }
379 
380 #define DEF_RealType(Type,BS,EQ)                                                               \
381   DEF_Pack(Type,BS,EQ)                                                                         \
382   DEF_Add(Type,BS,EQ)                                                                          \
383   DEF_Cmp(Type,BS,EQ)                                                                          \
384   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) {                       \
385     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
386     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
387     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
388   }
389 
390 #if defined(PETSC_HAVE_COMPLEX)
391 #define DEF_ComplexType(Type,BS,EQ)                                                            \
392   DEF_Pack(Type,BS,EQ)                                                                         \
393   DEF_Add(Type,BS,EQ)                                                                          \
394   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) {                    \
395     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
396     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
397   }
398 #endif
399 
400 #define DEF_DumbType(Type,BS,EQ)                                                               \
401   DEF_Pack(Type,BS,EQ)                                                                         \
402   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) {                       \
403     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
404   }
405 
406 /* Maxloc, Minloc */
407 #define DEF_PairType(Type1,Type2)                                                              \
408   typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2);                                    \
409   DEF_Pack(PairType(Type1,Type2),1,1)                                                          \
410   DEF_Xloc(Type1,Type2)                                                                        \
411   static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) {                      \
412     CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link);                                   \
413     CPPJoin3(PackInit_Xloc,Type1,Type2)(link);                                                 \
414   }
415 
416 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
417 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
418 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
419 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
420 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
421 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
422 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
423 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
424 
425 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
426 DEF_IntegerType(int,1,1)
427 DEF_IntegerType(int,2,1)
428 DEF_IntegerType(int,4,1)
429 DEF_IntegerType(int,8,1)
430 DEF_IntegerType(int,1,0)
431 DEF_IntegerType(int,2,0)
432 DEF_IntegerType(int,4,0)
433 DEF_IntegerType(int,8,0)
434 #endif
435 
436 /* The typedefs are used to get a typename without space that CPPJoin can handle */
437 typedef signed char SignedChar;
438 DEF_IntegerType(SignedChar,1,1)
439 DEF_IntegerType(SignedChar,2,1)
440 DEF_IntegerType(SignedChar,4,1)
441 DEF_IntegerType(SignedChar,8,1)
442 DEF_IntegerType(SignedChar,1,0)
443 DEF_IntegerType(SignedChar,2,0)
444 DEF_IntegerType(SignedChar,4,0)
445 DEF_IntegerType(SignedChar,8,0)
446 
447 typedef unsigned char UnsignedChar;
448 DEF_IntegerType(UnsignedChar,1,1)
449 DEF_IntegerType(UnsignedChar,2,1)
450 DEF_IntegerType(UnsignedChar,4,1)
451 DEF_IntegerType(UnsignedChar,8,1)
452 DEF_IntegerType(UnsignedChar,1,0)
453 DEF_IntegerType(UnsignedChar,2,0)
454 DEF_IntegerType(UnsignedChar,4,0)
455 DEF_IntegerType(UnsignedChar,8,0)
456 
457 DEF_RealType(PetscReal,1,1)
458 DEF_RealType(PetscReal,2,1)
459 DEF_RealType(PetscReal,4,1)
460 DEF_RealType(PetscReal,8,1)
461 DEF_RealType(PetscReal,1,0)
462 DEF_RealType(PetscReal,2,0)
463 DEF_RealType(PetscReal,4,0)
464 DEF_RealType(PetscReal,8,0)
465 
466 #if defined(PETSC_HAVE_COMPLEX)
467 DEF_ComplexType(PetscComplex,1,1)
468 DEF_ComplexType(PetscComplex,2,1)
469 DEF_ComplexType(PetscComplex,4,1)
470 DEF_ComplexType(PetscComplex,8,1)
471 DEF_ComplexType(PetscComplex,1,0)
472 DEF_ComplexType(PetscComplex,2,0)
473 DEF_ComplexType(PetscComplex,4,0)
474 DEF_ComplexType(PetscComplex,8,0)
475 #endif
476 
477 DEF_PairType(int,int)
478 DEF_PairType(PetscInt,PetscInt)
479 
480 /* If we don't know the basic type, we treat it as a stream of chars or ints */
481 DEF_DumbType(char,1,1)
482 DEF_DumbType(char,2,1)
483 DEF_DumbType(char,4,1)
484 DEF_DumbType(char,1,0)
485 DEF_DumbType(char,2,0)
486 DEF_DumbType(char,4,0)
487 
488 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
489 DEF_DumbType(DumbInt,1,1)
490 DEF_DumbType(DumbInt,2,1)
491 DEF_DumbType(DumbInt,4,1)
492 DEF_DumbType(DumbInt,8,1)
493 DEF_DumbType(DumbInt,1,0)
494 DEF_DumbType(DumbInt,2,0)
495 DEF_DumbType(DumbInt,4,0)
496 DEF_DumbType(DumbInt,8,0)
497 
498 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
499 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
500 {
501   int ierr;
502   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
503   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
504   return MPI_SUCCESS;
505 }
506 #endif
507 
508 PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink)
509 {
510   PetscErrorCode    ierr;
511   PetscSFPack       link,*p;
512   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
513 
514   PetscFunctionBegin;
515   /* Look for types in cache */
516   for (p=&bas->inuse; (link=*p); p=&link->next) {
517     PetscBool match;
518     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
519     if (match && (rkey == link->rkey) && (lkey == link->lkey)) {
520       switch (cmode) {
521       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
522       case PETSC_USE_POINTER: break;
523       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
524       }
525       *mylink = link;
526       PetscFunctionReturn(0);
527     }
528   }
529   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
530   PetscFunctionReturn(0);
531 }
532 
533 PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link)
534 {
535   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
536 
537   PetscFunctionBegin;
538   (*link)->rkey = NULL;
539   (*link)->lkey = NULL;
540   (*link)->next = bas->avail;
541   bas->avail    = *link;
542   *link         = NULL;
543   PetscFunctionReturn(0);
544 }
545 
546 /* Destroy all links, i.e., PetscSFPacks in the linked list, usually named 'avail' */
547 PetscErrorCode PetscSFPackDestroyAvailable(PetscSFPack *avail)
548 {
549   PetscErrorCode    ierr;
550   PetscSFPack       link=*avail,next;
551   PetscInt          i;
552 
553   PetscFunctionBegin;
554   for (; link; link=next) {
555     next = link->next;
556     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
557     for (i=0; i<(link->nrootreqs+link->nleafreqs)*4; i++) { /* Persistent reqs must be freed. */
558       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
559     }
560     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
561     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
562     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
563     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->selfbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
564 
565 #if defined(PETSC_HAVE_CUDA)
566     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
567     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
568     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->selfbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
569     if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
570 #endif
571     ierr = PetscFree(link);CHKERRQ(ierr);
572   }
573   *avail = NULL;
574   PetscFunctionReturn(0);
575 }
576 
577 /* Error out on unsupported overlapped communications */
578 PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey)
579 {
580   PetscErrorCode    ierr;
581   PetscSFPack       link,*p;
582   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
583   PetscBool         match;
584 
585   PetscFunctionBegin;
586   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
587      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
588   */
589   if (rkey || lkey) {
590     for (p=&bas->inuse; (link=*p); p=&link->next) {
591       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
592       if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey);
593     }
594   }
595   PetscFunctionReturn(0);
596 }
597 
598 PetscErrorCode PetscSFPackSetUp_Host(PetscSF sf,PetscSFPack link,MPI_Datatype unit)
599 {
600   PetscErrorCode ierr;
601   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
602   PetscBool      is2Int,is2PetscInt;
603   PetscMPIInt    ni,na,nd,combiner;
604 #if defined(PETSC_HAVE_COMPLEX)
605   PetscInt       nPetscComplex=0;
606 #endif
607 
608   PetscFunctionBegin;
609   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
610   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
611   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
612   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
613   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
614   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
615 #if defined(PETSC_HAVE_COMPLEX)
616   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
617 #endif
618   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
619   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
620   /* TODO: shaell we also handle Fortran MPI_2REAL? */
621   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
622   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
623   link->bs = 1; /* default */
624 
625   if (is2Int) {
626     PackInit_PairType_int_int(link);
627     link->bs        = 1;
628     link->unitbytes = 2*sizeof(int);
629     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
630     link->basicunit = MPI_2INT;
631     link->unit      = MPI_2INT;
632   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
633     PackInit_PairType_PetscInt_PetscInt(link);
634     link->bs        = 1;
635     link->unitbytes = 2*sizeof(PetscInt);
636     link->basicunit = MPIU_2INT;
637     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
638     link->unit      = MPIU_2INT;
639   } else if (nPetscReal) {
640     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
641     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
642     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
643     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
644     link->bs        = nPetscReal;
645     link->unitbytes = nPetscReal*sizeof(PetscReal);
646     link->basicunit = MPIU_REAL;
647     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
648   } else if (nPetscInt) {
649     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
650     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
651     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
652     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
653     link->bs        = nPetscInt;
654     link->unitbytes = nPetscInt*sizeof(PetscInt);
655     link->basicunit = MPIU_INT;
656     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
657 #if defined(PETSC_USE_64BIT_INDICES)
658   } else if (nInt) {
659     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
660     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
661     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
662     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
663     link->bs        = nInt;
664     link->unitbytes = nInt*sizeof(int);
665     link->basicunit = MPI_INT;
666     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
667 #endif
668   } else if (nSignedChar) {
669     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
670     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
671     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
672     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
673     link->bs        = nSignedChar;
674     link->unitbytes = nSignedChar*sizeof(SignedChar);
675     link->basicunit = MPI_SIGNED_CHAR;
676     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
677   }  else if (nUnsignedChar) {
678     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
679     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
680     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
681     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
682     link->bs        = nUnsignedChar;
683     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
684     link->basicunit = MPI_UNSIGNED_CHAR;
685     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
686 #if defined(PETSC_HAVE_COMPLEX)
687   } else if (nPetscComplex) {
688     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
689     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
690     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
691     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
692     link->bs        = nPetscComplex;
693     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
694     link->basicunit = MPIU_COMPLEX;
695     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
696 #endif
697   } else {
698     MPI_Aint lb,nbyte;
699     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
700     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
701     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
702       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
703       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
704       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
705       link->bs        = nbyte;
706       link->unitbytes = nbyte;
707       link->basicunit = MPI_BYTE;
708     } else {
709       nInt = nbyte / sizeof(int);
710       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
711       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
712       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
713       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
714       link->bs        = nInt;
715       link->unitbytes = nbyte;
716       link->basicunit = MPI_INT;
717     }
718     if (link->isbuiltin) link->unit = unit;
719   }
720 
721   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
722   PetscFunctionReturn(0);
723 }
724 
725 PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*))
726 {
727   PetscFunctionBegin;
728   *UnpackAndOp = NULL;
729   if (mtype == PETSC_MEMTYPE_HOST) {
730     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
731     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
732     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
733     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
734     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
735     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
736     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
737     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
738     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
739     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
740     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
741     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
742     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
743   }
744 #if defined(PETSC_HAVE_CUDA)
745   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
746     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
747     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
748     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
749     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
750     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
751     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
752     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
753     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
754     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
755     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
756     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
757     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
758     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
759   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
760     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
761     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
762     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
763     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
764     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
765     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
766     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
767     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
768     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
769     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
770     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
771     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
772     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
773   }
774 #endif
775   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
776 
777   PetscFunctionReturn(0);
778 }
779 
780 PetscErrorCode PetscSFPackGetFetchAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*))
781 {
782   PetscFunctionBegin;
783   *FetchAndOp = NULL;
784   if (mtype == PETSC_MEMTYPE_HOST) {
785     if (op == MPIU_REPLACE)                   *FetchAndOp = link->h_FetchAndInsert;
786     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->h_FetchAndAdd;
787     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->h_FetchAndMax;
788     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->h_FetchAndMin;
789     else if (op == MPI_MAXLOC)                *FetchAndOp = link->h_FetchAndMaxloc;
790     else if (op == MPI_MINLOC)                *FetchAndOp = link->h_FetchAndMinloc;
791     else if (op == MPI_PROD)                  *FetchAndOp = link->h_FetchAndMult;
792     else if (op == MPI_LAND)                  *FetchAndOp = link->h_FetchAndLAND;
793     else if (op == MPI_BAND)                  *FetchAndOp = link->h_FetchAndBAND;
794     else if (op == MPI_LOR)                   *FetchAndOp = link->h_FetchAndLOR;
795     else if (op == MPI_BOR)                   *FetchAndOp = link->h_FetchAndBOR;
796     else if (op == MPI_LXOR)                  *FetchAndOp = link->h_FetchAndLXOR;
797     else if (op == MPI_BXOR)                  *FetchAndOp = link->h_FetchAndBXOR;
798     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
799   }
800 #if defined(PETSC_HAVE_CUDA)
801   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
802     if (op == MPIU_REPLACE)                   *FetchAndOp = link->d_FetchAndInsert;
803     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->d_FetchAndAdd;
804     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->d_FetchAndMax;
805     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->d_FetchAndMin;
806     else if (op == MPI_MAXLOC)                *FetchAndOp = link->d_FetchAndMaxloc;
807     else if (op == MPI_MINLOC)                *FetchAndOp = link->d_FetchAndMinloc;
808     else if (op == MPI_PROD)                  *FetchAndOp = link->d_FetchAndMult;
809     else if (op == MPI_LAND)                  *FetchAndOp = link->d_FetchAndLAND;
810     else if (op == MPI_BAND)                  *FetchAndOp = link->d_FetchAndBAND;
811     else if (op == MPI_LOR)                   *FetchAndOp = link->d_FetchAndLOR;
812     else if (op == MPI_BOR)                   *FetchAndOp = link->d_FetchAndBOR;
813     else if (op == MPI_LXOR)                  *FetchAndOp = link->d_FetchAndLXOR;
814     else if (op == MPI_BXOR)                  *FetchAndOp = link->d_FetchAndBXOR;
815     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
816   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
817     if (op == MPIU_REPLACE)                   *FetchAndOp = link->da_FetchAndInsert;
818     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->da_FetchAndAdd;
819     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->da_FetchAndMax;
820     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->da_FetchAndMin;
821     else if (op == MPI_MAXLOC)                *FetchAndOp = link->da_FetchAndMaxloc;
822     else if (op == MPI_MINLOC)                *FetchAndOp = link->da_FetchAndMinloc;
823     else if (op == MPI_PROD)                  *FetchAndOp = link->da_FetchAndMult;
824     else if (op == MPI_LAND)                  *FetchAndOp = link->da_FetchAndLAND;
825     else if (op == MPI_BAND)                  *FetchAndOp = link->da_FetchAndBAND;
826     else if (op == MPI_LOR)                   *FetchAndOp = link->da_FetchAndLOR;
827     else if (op == MPI_BOR)                   *FetchAndOp = link->da_FetchAndBOR;
828     else if (op == MPI_LXOR)                  *FetchAndOp = link->da_FetchAndLXOR;
829     else if (op == MPI_BXOR)                  *FetchAndOp = link->da_FetchAndBXOR;
830     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
831   }
832 #endif
833   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
834   PetscFunctionReturn(0);
835 }
836 
837 /*
838   Create pack/unpack optimization plans based on indice patterns available
839 
840    Input Parameters:
841   +  n       - Number of target ranks
842   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
843   -  idx     - [*]   Array storing indices
844 
845    Output Parameters:
846   +  opt    - Optimization plans. Maybe NULL if no optimization can be built.
847 */
848 PetscErrorCode PetscSFPackOptCreate(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
849 {
850   PetscErrorCode ierr;
851   PetscInt       i,j,k,n_copies,tot_copies=0,step;
852   PetscBool      strided,optimized=PETSC_FALSE;
853   PetscSFPackOpt opt;
854 
855   PetscFunctionBegin;
856   if (!n) {
857     *out = NULL;
858     PetscFunctionReturn(0);
859   }
860 
861   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
862   ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr);
863   ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr);
864   if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/
865 
866   opt->n = n;
867 
868   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */
869   for (i=0; i<n; i++) { /* for each target processor */
870     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
871     n_copies = 1;
872     for (j=offset[i]; j<offset[i+1]-1; j++) {
873       if (idx[j]+1 != idx[j+1]) n_copies++;
874     }
875     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
876        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
877      */
878     if ((offset[i+1]-offset[i])/n_copies >= 32) {
879       opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
880       optimized    = PETSC_TRUE;
881       tot_copies  += n_copies;
882     }
883   }
884 
885   /* Setup memcpy plan for each contiguous piece */
886   k    = 0; /* k-th copy */
887   ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
888   for (i=0; i<n; i++) { /* for each target processor */
889     if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
890       n_copies           = 1;
891       opt->copy_start[k] = offset[i] - offset[0];
892       for (j=offset[i]; j<offset[i+1]-1; j++) {
893         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
894           n_copies++;
895           opt->copy_start[k+1] = j-offset[0]+1;
896           opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
897           k++;
898         }
899       }
900       /* Set copy length of the last copy for this target */
901       opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
902       k++;
903     }
904     /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
905     opt->copy_offset[i+1] = k;
906   }
907 
908   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
909   for (i=0; i<n; i++) { /* for each remote */
910     if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
911       strided = PETSC_TRUE;
912       step    = idx[offset[i]+1] - idx[offset[i]];
913       for (j=offset[i]; j<offset[i+1]-1; j++) {
914         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
915       }
916       if (strided) {
917         opt->type[i]         = PETSCSF_PACKOPT_STRIDE;
918         opt->stride_step[i]  = step;
919         opt->stride_n[i]     = offset[i+1] - offset[i];
920         optimized            = PETSC_TRUE;
921       }
922     }
923   }
924   /* If no rank gets optimized, free arrays to save memory */
925   if (!optimized) {
926     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
927     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
928     ierr = PetscFree(opt);CHKERRQ(ierr);
929     *out = NULL;
930   } else *out = opt;
931   PetscFunctionReturn(0);
932 }
933 
934 PetscErrorCode PetscSFPackOptDestroy(PetscSFPackOpt *out)
935 {
936   PetscErrorCode ierr;
937   PetscSFPackOpt opt = *out;
938 
939   PetscFunctionBegin;
940   if (opt) {
941     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
942     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
943     ierr = PetscFree(opt);CHKERRQ(ierr);
944     *out = NULL;
945   }
946   PetscFunctionReturn(0);
947 }
948