xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision c4762a1b19cd2af06abeed90e8f9d34fb975dd94)
1 
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6 #include <cuda_runtime.h>
7 #endif
8 /*
9  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
10  * therefore we pack data types manually. This file defines packing routines for the standard data types.
11  */
12 
13 #define CPPJoin2(a,b)         a ##_## b
14 #define CPPJoin3(a,b,c)       a ##_## b ##_## c
15 #define CPPJoin4_(a,b,c,d)    a##b##_##c##_##d
16 #define CPPJoin4(a,b,c,d)     CPPJoin4_(a##_,b,c,d)
17 
18 #define EXECUTE(statement)    statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */
19 #define IGNORE(statement)     do {} while(0)
20 
21 #define BINARY_OP(r,s,op,t)   do {(r) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
22 #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
23 #define LXOR_OP(r,s,op,t)     do {(r) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR */
24 #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0)
25 
26 #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */
27 
28 /* DEF_PackFunc - macro defining a Pack routine
29 
30    Arguments of the macro:
31    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
32    .BS        Block size for vectorization. It is a factor of bs.
33    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
34 
35    Arguments of the Pack routine:
36    +count     Number of indices in idx[]
37    .idx       Indices of entries to packed. NULL means contiguous indices, that is [0,count)
38    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39    .opt       Pack optimization plans. NULL means no plan at all.
40    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count)
41    -packed    Address of the packed data for each rank
42  */
43 #define DEF_PackFunc(Type,BS,EQ) \
44   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed) \
45   {                                                                                                          \
46     PetscErrorCode ierr;                                                                                     \
47     const Type     *u = (const Type*)unpacked,*u2;                                                           \
48     Type           *p = (Type*)packed,*p2;                                                                   \
49     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
50     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
51     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
52     PetscFunctionBegin;                                                                                      \
53     if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);}  /* Indices are contiguous */             \
54     else if (!opt) { /* No optimizations available */                                                        \
55       for (i=0; i<count; i++)                                                                                \
56         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
57           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
58             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
59     } else {                                                                                                 \
60       for (r=0; r<opt->n; r++) {                                                                             \
61         p2  = p + opt->offset[r]*MBS;                                                                        \
62         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
63           idx2 = idx + opt->offset[r];                                                                       \
64           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
65             for (j=0; j<M; j++)                                                                              \
66               for (k=0; k<BS; k++)                                                                           \
67                 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k];                                                    \
68         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
69           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) {                                        \
70             u2   = u + idx[opt->copy_start[i]]*MBS;                                                          \
71             l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */                       \
72             ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr);                                                     \
73             p2  += l;                                                                                        \
74           }                                                                                                  \
75         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
76           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
77           step = opt->stride_step[r];                                                                        \
78           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
79             for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j];                                            \
80         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
81       }                                                                                                      \
82     }                                                                                                        \
83     PetscFunctionReturn(0);                                                                                  \
84   }
85 
86 /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine
87 
88    Arguments:
89   +action     Unpack or Fetch
90   .Type       Type of the data
91   .BS         Block size for vectorization
92   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
93   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
94   .CType      Type with or without the const qualifier, i.e., const Type or Type
95   .Cvoid      void with or without the const qualifier, i.e., const void or void
96 
97   Notes:
98    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
99    The two arguments CType and Cvoid are used (instead of one constness argument), because we want to
100    get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument,
101    sometimes we input 'const', sometimes we have to input empty.
102 
103    If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare.
104  */
105 #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid)               \
106   static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
107   {                                                                                                          \
108     PetscErrorCode ierr;                                                                                     \
109     Type           *u = (Type*)unpacked,*u2;                                                                 \
110     CType          *p = (CType*)packed,*p2;                                                                  \
111     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
112     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
113     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
114     PetscFunctionBegin;                                                                                      \
115     if (!idx) {                                                                                              \
116       FILTER(Type *v);                                                                                       \
117       FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr));                                               \
118       FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr));                                             \
119              ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);                                              \
120       FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr));                                             \
121       FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                             \
122     } else if (!opt) { /* No optimizations available */                                                      \
123       for (i=0; i<count; i++)                                                                                \
124         for (j=0; j<M; j++)                                                                                  \
125           for (k=0; k<BS; k++) {                                                                             \
126             FILTER(Type t                = u[idx[i]*MBS+j*BS+k]);                                            \
127                    u[idx[i]*MBS+j*BS+k]  = p[i*MBS+j*BS+k];                                                  \
128             FILTER(p[i*MBS+j*BS+k]       = t);                                                               \
129           }                                                                                                  \
130     } else {                                                                                                 \
131       for (r=0; r<opt->n; r++) {                                                                             \
132         p2 = p + opt->offset[r]*MBS;                                                                         \
133         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
134           idx2 = idx + opt->offset[r];                                                                       \
135           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
136             for (j=0; j<M; j++)                                                                              \
137               for (k=0; k<BS; k++) {                                                                         \
138                 FILTER(Type t                = u[idx2[i]*MBS+j*BS+k]);                                       \
139                        u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
140                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
141               }                                                                                              \
142         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
143           FILTER(Type *v);                                                                                   \
144           FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \
145           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
146             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
147             l  = opt->copy_length[i]*MBS;                                                                    \
148             FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr));                                              \
149                    ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr);                                              \
150             FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr));                                              \
151             p2 += l;                                                                                         \
152           }                                                                                                  \
153           FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                         \
154         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
155           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
156           step = opt->stride_step[r];                                                                        \
157           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
158             for (j=0; j<M; j++)                                                                              \
159               for (k=0; k<BS; k++) {                                                                         \
160                 FILTER(Type t                = u2[i*step*MBS+j*BS+k]);                                       \
161                        u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
162                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
163               }                                                                                              \
164         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
165       }                                                                                                      \
166     }                                                                                                        \
167     PetscFunctionReturn(0);                                                                                  \
168   }
169 
170 /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc
171 
172    Arguments:
173   +action     Unpack or Fetch
174   .opname     Name of the Op, such as Add, Mult, LAND, etc.
175   .Type       Type of the data
176   .BS         Block size for vectorization
177   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
178   .op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
179   .APPLY      Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP
180   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
181   .CType      Type with or without the const qualifier, i.e., const Type or Type
182   -Cvoid      void with or without the const qualifier, i.e., const void or void
183  */
184 #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \
185   static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
186   {                                                                                                          \
187     Type           *u = (Type*)unpacked,*u2,t;                                                               \
188     CType          *p = (CType*)packed,*p2;                                                                  \
189     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
190     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
191     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
192     PetscFunctionBegin;                                                                                      \
193     if (!idx) {                                                                                              \
194       for (i=0; i<count; i++)                                                                                \
195         for (j=0; j<M; j++)                                                                                  \
196           for (k=0; k<BS; k++) {                                                                             \
197             t    = u[i*MBS+j*BS+k];                                                                          \
198             APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                                    \
199             FILTER(p[i*MBS+j*BS+k] = t);                                                                     \
200           }                                                                                                  \
201     } else if (!opt) { /* No optimizations available */                                                      \
202       for (i=0; i<count; i++)                                                                                \
203         for (j=0; j<M; j++)                                                                                  \
204           for (k=0; k<BS; k++) {                                                                             \
205               t    = u[idx[i]*MBS+j*BS+k];                                                                   \
206               APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                             \
207               FILTER(p[i*MBS+j*BS+k] = t);                                                                   \
208           }                                                                                                  \
209     } else {                                                                                                 \
210       for (r=0; r<opt->n; r++) {                                                                             \
211         p2 = p + opt->offset[r]*MBS;                                                                         \
212         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
213           idx2 = idx + opt->offset[r];                                                                       \
214           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
215             for (j=0; j<M; j++)                                                                              \
216               for (k=0; k<BS; k++) {                                                                         \
217                 t    = u[idx2[i]*MBS+j*BS+k];                                                                \
218                 APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
219                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
220               }                                                                                              \
221         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
222           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
223             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
224             l  = opt->copy_length[i]*MBS;                                                                    \
225             for (j=0; j<l; j++) {                                                                            \
226               t    = u2[j];                                                                                  \
227               APPLY (u2[j],t,op,p2[j]);                                                                      \
228               FILTER(p2[j] = t);                                                                             \
229             }                                                                                                \
230             p2 += l;                                                                                         \
231           }                                                                                                  \
232         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
233           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
234           step = opt->stride_step[r];                                                                        \
235           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
236             for (j=0; j<M; j++)                                                                              \
237               for (k=0; k<BS; k++) {                                                                         \
238                 t    = u2[i*step*MBS+j*BS+k];                                                                \
239                 APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
240                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
241               }                                                                                              \
242         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
243       }                                                                                                      \
244     }                                                                                                        \
245     PetscFunctionReturn(0);                                                                                  \
246   }
247 
248 /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine
249 
250    Arguments:
251   +Action     Unpack or Fetch
252   .locname    Max or Min
253   .type1      Type of the first data in a pair type
254   .type2      Type of the second data in a pair type, usually PetscMPIInt for MPI ranks.
255   .op         > or <
256   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
257   .CType      Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2)
258   -Cvoid      void with or without the const qualifier, i.e., const void or void
259  */
260 #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \
261   static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \
262     PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked;                                             \
263     CType                 *p = (CType*)packed;                                                               \
264     PetscInt              i,j;                                                                               \
265     for (i=0; i<count; i++) {                                                                                \
266       FILTER(PairType(Type1,Type2) v);                                                                       \
267       j = idx? idx[i] : i;                                                                                   \
268       FILTER(v = u[j]);                                                                                      \
269       if (p[i].a op u[j].a) {                                                                                \
270         u[j] = p[i];                                                                                         \
271       } else if (p[i].a == u[j].a) {                                                                         \
272         u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */                                 \
273       }                                                                                                      \
274       FILTER(p[i] = v);                                                                                      \
275     }                                                                                                        \
276     PetscFunctionReturn(0);                                                                                  \
277   }
278 
279 /* Pack, Unpack/Fetch ops */
280 #define DEF_Pack(Type,BS,EQ)                                                                   \
281   DEF_PackFunc(Type,BS,EQ)                                                                     \
282   DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void)                                   \
283   DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void)                                              \
284   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) {                           \
285     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);                            \
286     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);                            \
287     link->h_FetchAndInsert  = CPPJoin4(FetchAndInsert, Type,BS,EQ);                            \
288   }
289 
290 /* Add, Mult ops */
291 #define DEF_Add(Type,BS,EQ)                                                                    \
292   DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void)             \
293   DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void)             \
294   DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void)                        \
295   DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void)                        \
296   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) {                            \
297     link->h_UnpackAndAdd    = CPPJoin4(UnpackAndAdd,   Type,BS,EQ);                            \
298     link->h_UnpackAndMult   = CPPJoin4(UnpackAndMult,  Type,BS,EQ);                            \
299     link->h_FetchAndAdd     = CPPJoin4(FetchAndAdd,    Type,BS,EQ);                            \
300     link->h_FetchAndMult    = CPPJoin4(FetchAndMult,   Type,BS,EQ);                            \
301   }
302 
303 /* Max, Min ops */
304 #define DEF_Cmp(Type,BS,EQ)                                                                    \
305   DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void)     \
306   DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void)     \
307   DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void)                \
308   DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void)                \
309   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) {                        \
310     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);                            \
311     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);                            \
312     link->h_FetchAndMax     = CPPJoin4(FetchAndMax ,   Type,BS,EQ);                            \
313     link->h_FetchAndMin     = CPPJoin4(FetchAndMin ,   Type,BS,EQ);                            \
314   }
315 
316 /* Logical ops.
317   The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid
318   the compilation warning "empty macro arguments are undefined in ISO C90"
319  */
320 #define DEF_Log(Type,BS,EQ)                                                                    \
321   DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void)            \
322   DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void)            \
323   DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP,  IGNORE,const Type,const void)            \
324   DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void)                       \
325   DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void)                       \
326   DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP,  EXECUTE,Type,void)                       \
327   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) {                        \
328     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND,Type,BS,EQ);                              \
329     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR, Type,BS,EQ);                              \
330     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR,Type,BS,EQ);                              \
331     link->h_FetchAndLAND    = CPPJoin4(FetchAndLAND, Type,BS,EQ);                              \
332     link->h_FetchAndLOR     = CPPJoin4(FetchAndLOR,  Type,BS,EQ);                              \
333     link->h_FetchAndLXOR    = CPPJoin4(FetchAndLXOR, Type,BS,EQ);                              \
334   }
335 
336 /* Bitwise ops */
337 #define DEF_Bit(Type,BS,EQ)                                                                    \
338   DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void)             \
339   DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void)             \
340   DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void)             \
341   DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void)                        \
342   DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void)                        \
343   DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void)                        \
344   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) {                        \
345     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND,Type,BS,EQ);                              \
346     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR, Type,BS,EQ);                              \
347     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR,Type,BS,EQ);                              \
348     link->h_FetchAndBAND    = CPPJoin4(FetchAndBAND, Type,BS,EQ);                              \
349     link->h_FetchAndBOR     = CPPJoin4(FetchAndBOR,  Type,BS,EQ);                              \
350     link->h_FetchAndBXOR    = CPPJoin4(FetchAndBXOR, Type,BS,EQ);                              \
351   }
352 
353 /* Maxloc, Minloc */
354 #define DEF_Xloc(Type1,Type2)                                                                  \
355   DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void)    \
356   DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void)    \
357   DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void)               \
358   DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void)               \
359   static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) {                          \
360     link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1);             \
361     link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1);             \
362     link->h_FetchAndMaxloc  = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1);             \
363     link->h_FetchAndMinloc  = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1);             \
364   }
365 
366 #define DEF_IntegerType(Type,BS,EQ)                                                            \
367   DEF_Pack(Type,BS,EQ)                                                                         \
368   DEF_Add(Type,BS,EQ)                                                                          \
369   DEF_Cmp(Type,BS,EQ)                                                                          \
370   DEF_Log(Type,BS,EQ)                                                                          \
371   DEF_Bit(Type,BS,EQ)                                                                          \
372   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) {                    \
373     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
374     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
375     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
376     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                               \
377     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                               \
378   }
379 
380 #define DEF_RealType(Type,BS,EQ)                                                               \
381   DEF_Pack(Type,BS,EQ)                                                                         \
382   DEF_Add(Type,BS,EQ)                                                                          \
383   DEF_Cmp(Type,BS,EQ)                                                                          \
384   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) {                       \
385     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
386     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
387     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
388   }
389 
390 #if defined(PETSC_HAVE_COMPLEX)
391 #define DEF_ComplexType(Type,BS,EQ)                                                            \
392   DEF_Pack(Type,BS,EQ)                                                                         \
393   DEF_Add(Type,BS,EQ)                                                                          \
394   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) {                    \
395     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
396     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
397   }
398 #endif
399 
400 #define DEF_DumbType(Type,BS,EQ)                                                               \
401   DEF_Pack(Type,BS,EQ)                                                                         \
402   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) {                       \
403     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
404   }
405 
406 /* Maxloc, Minloc */
407 #define DEF_PairType(Type1,Type2)                                                              \
408   typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2);                                    \
409   DEF_Pack(PairType(Type1,Type2),1,1)                                                          \
410   DEF_Xloc(Type1,Type2)                                                                        \
411   static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) {                      \
412     CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link);                                   \
413     CPPJoin3(PackInit_Xloc,Type1,Type2)(link);                                                 \
414   }
415 
416 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
417 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
418 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
419 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
420 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
421 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
422 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
423 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
424 
425 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
426 DEF_IntegerType(int,1,1)
427 DEF_IntegerType(int,2,1)
428 DEF_IntegerType(int,4,1)
429 DEF_IntegerType(int,8,1)
430 DEF_IntegerType(int,1,0)
431 DEF_IntegerType(int,2,0)
432 DEF_IntegerType(int,4,0)
433 DEF_IntegerType(int,8,0)
434 #endif
435 
436 /* The typedefs are used to get a typename without space that CPPJoin can handle */
437 typedef signed char SignedChar;
438 DEF_IntegerType(SignedChar,1,1)
439 DEF_IntegerType(SignedChar,2,1)
440 DEF_IntegerType(SignedChar,4,1)
441 DEF_IntegerType(SignedChar,8,1)
442 DEF_IntegerType(SignedChar,1,0)
443 DEF_IntegerType(SignedChar,2,0)
444 DEF_IntegerType(SignedChar,4,0)
445 DEF_IntegerType(SignedChar,8,0)
446 
447 typedef unsigned char UnsignedChar;
448 DEF_IntegerType(UnsignedChar,1,1)
449 DEF_IntegerType(UnsignedChar,2,1)
450 DEF_IntegerType(UnsignedChar,4,1)
451 DEF_IntegerType(UnsignedChar,8,1)
452 DEF_IntegerType(UnsignedChar,1,0)
453 DEF_IntegerType(UnsignedChar,2,0)
454 DEF_IntegerType(UnsignedChar,4,0)
455 DEF_IntegerType(UnsignedChar,8,0)
456 
457 DEF_RealType(PetscReal,1,1)
458 DEF_RealType(PetscReal,2,1)
459 DEF_RealType(PetscReal,4,1)
460 DEF_RealType(PetscReal,8,1)
461 DEF_RealType(PetscReal,1,0)
462 DEF_RealType(PetscReal,2,0)
463 DEF_RealType(PetscReal,4,0)
464 DEF_RealType(PetscReal,8,0)
465 
466 #if defined(PETSC_HAVE_COMPLEX)
467 DEF_ComplexType(PetscComplex,1,1)
468 DEF_ComplexType(PetscComplex,2,1)
469 DEF_ComplexType(PetscComplex,4,1)
470 DEF_ComplexType(PetscComplex,8,1)
471 DEF_ComplexType(PetscComplex,1,0)
472 DEF_ComplexType(PetscComplex,2,0)
473 DEF_ComplexType(PetscComplex,4,0)
474 DEF_ComplexType(PetscComplex,8,0)
475 #endif
476 
477 DEF_PairType(int,int)
478 DEF_PairType(PetscInt,PetscInt)
479 
480 /* If we don't know the basic type, we treat it as a stream of chars or ints */
481 DEF_DumbType(char,1,1)
482 DEF_DumbType(char,2,1)
483 DEF_DumbType(char,4,1)
484 DEF_DumbType(char,1,0)
485 DEF_DumbType(char,2,0)
486 DEF_DumbType(char,4,0)
487 
488 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
489 DEF_DumbType(DumbInt,1,1)
490 DEF_DumbType(DumbInt,2,1)
491 DEF_DumbType(DumbInt,4,1)
492 DEF_DumbType(DumbInt,8,1)
493 DEF_DumbType(DumbInt,1,0)
494 DEF_DumbType(DumbInt,2,0)
495 DEF_DumbType(DumbInt,4,0)
496 DEF_DumbType(DumbInt,8,0)
497 
498 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
499 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
500 {
501   int ierr;
502   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
503   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
504   return MPI_SUCCESS;
505 }
506 #endif
507 
508 PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFPack *mylink)
509 {
510   PetscErrorCode    ierr;
511   PetscSFPack       link,*p;
512   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
513 
514   PetscFunctionBegin;
515   /* Look for types in cache */
516   for (p=&bas->inuse; (link=*p); p=&link->next) {
517     PetscBool match;
518     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
519     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
520       switch (cmode) {
521       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
522       case PETSC_USE_POINTER: break;
523       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
524       }
525       *mylink = link;
526       PetscFunctionReturn(0);
527     }
528   }
529   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
530   PetscFunctionReturn(0);
531 }
532 
533 PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link)
534 {
535   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
536 
537   PetscFunctionBegin;
538   (*link)->rootdata = NULL;
539   (*link)->leafdata = NULL;
540   (*link)->next     = bas->avail;
541   bas->avail        = *link;
542   *link             = NULL;
543   PetscFunctionReturn(0);
544 }
545 
546 /* Destroy all links, i.e., PetscSFPacks in the linked list, usually named 'avail' */
547 PetscErrorCode PetscSFPackDestroyAvailable(PetscSF sf,PetscSFPack *avail)
548 {
549   PetscErrorCode    ierr;
550   PetscSFPack       link=*avail,next;
551   PetscInt          i;
552 
553   PetscFunctionBegin;
554   for (; link; link=next) {
555     next = link->next;
556     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
557     for (i=0; i<(link->nrootreqs+link->nleafreqs)*4; i++) { /* Persistent reqs must be freed. */
558       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
559     }
560     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
561 
562 #if defined(PETSC_HAVE_CUDA)
563     if (!use_gpu_aware_mpi && sf->use_pinned_buf) { /* In case the buffers are allocated specially */
564       if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscFreePinnedMemory(link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);}
565       if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscFreePinnedMemory(link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);}
566     }
567 #endif
568 
569     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
570     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
571     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->selfbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
572 
573 #if defined(PETSC_HAVE_CUDA)
574     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
575     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
576     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->selfbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
577     if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
578 #endif
579     ierr = PetscFree(link);CHKERRQ(ierr);
580   }
581   *avail = NULL;
582   PetscFunctionReturn(0);
583 }
584 
585 /* Error out on unsupported overlapped communications */
586 PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
587 {
588   PetscErrorCode    ierr;
589   PetscSFPack       link,*p;
590   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
591   PetscBool         match;
592 
593   PetscFunctionBegin;
594   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
595      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
596   */
597   if (rootdata || leafdata) {
598     for (p=&bas->inuse; (link=*p); p=&link->next) {
599       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
600       if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rootdata,leafdata);
601     }
602   }
603   PetscFunctionReturn(0);
604 }
605 
606 PetscErrorCode PetscSFPackSetUp_Host(PetscSF sf,PetscSFPack link,MPI_Datatype unit)
607 {
608   PetscErrorCode ierr;
609   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
610   PetscBool      is2Int,is2PetscInt;
611   PetscMPIInt    ni,na,nd,combiner;
612 #if defined(PETSC_HAVE_COMPLEX)
613   PetscInt       nPetscComplex=0;
614 #endif
615 
616   PetscFunctionBegin;
617   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
618   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
619   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
620   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
621   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
622   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
623 #if defined(PETSC_HAVE_COMPLEX)
624   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
625 #endif
626   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
627   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
628   /* TODO: shaell we also handle Fortran MPI_2REAL? */
629   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
630   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
631   link->bs = 1; /* default */
632 
633   if (is2Int) {
634     PackInit_PairType_int_int(link);
635     link->bs        = 1;
636     link->unitbytes = 2*sizeof(int);
637     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
638     link->basicunit = MPI_2INT;
639     link->unit      = MPI_2INT;
640   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
641     PackInit_PairType_PetscInt_PetscInt(link);
642     link->bs        = 1;
643     link->unitbytes = 2*sizeof(PetscInt);
644     link->basicunit = MPIU_2INT;
645     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
646     link->unit      = MPIU_2INT;
647   } else if (nPetscReal) {
648     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
649     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
650     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
651     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
652     link->bs        = nPetscReal;
653     link->unitbytes = nPetscReal*sizeof(PetscReal);
654     link->basicunit = MPIU_REAL;
655     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
656   } else if (nPetscInt) {
657     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
658     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
659     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
660     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
661     link->bs        = nPetscInt;
662     link->unitbytes = nPetscInt*sizeof(PetscInt);
663     link->basicunit = MPIU_INT;
664     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
665 #if defined(PETSC_USE_64BIT_INDICES)
666   } else if (nInt) {
667     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
668     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
669     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
670     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
671     link->bs        = nInt;
672     link->unitbytes = nInt*sizeof(int);
673     link->basicunit = MPI_INT;
674     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
675 #endif
676   } else if (nSignedChar) {
677     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
678     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
679     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
680     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
681     link->bs        = nSignedChar;
682     link->unitbytes = nSignedChar*sizeof(SignedChar);
683     link->basicunit = MPI_SIGNED_CHAR;
684     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
685   }  else if (nUnsignedChar) {
686     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
687     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
688     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
689     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
690     link->bs        = nUnsignedChar;
691     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
692     link->basicunit = MPI_UNSIGNED_CHAR;
693     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
694 #if defined(PETSC_HAVE_COMPLEX)
695   } else if (nPetscComplex) {
696     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
697     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
698     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
699     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
700     link->bs        = nPetscComplex;
701     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
702     link->basicunit = MPIU_COMPLEX;
703     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
704 #endif
705   } else {
706     MPI_Aint lb,nbyte;
707     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
708     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
709     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
710       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
711       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
712       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
713       link->bs        = nbyte;
714       link->unitbytes = nbyte;
715       link->basicunit = MPI_BYTE;
716     } else {
717       nInt = nbyte / sizeof(int);
718       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
719       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
720       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
721       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
722       link->bs        = nInt;
723       link->unitbytes = nbyte;
724       link->basicunit = MPI_INT;
725     }
726     if (link->isbuiltin) link->unit = unit;
727   }
728 
729   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
730   PetscFunctionReturn(0);
731 }
732 
733 PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*))
734 {
735   PetscFunctionBegin;
736   *UnpackAndOp = NULL;
737   if (mtype == PETSC_MEMTYPE_HOST) {
738     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
739     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
740     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
741     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
742     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
743     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
744     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
745     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
746     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
747     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
748     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
749     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
750     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
751   }
752 #if defined(PETSC_HAVE_CUDA)
753   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
754     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
755     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
756     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
757     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
758     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
759     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
760     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
761     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
762     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
763     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
764     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
765     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
766     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
767   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
768     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
769     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
770     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
771     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
772     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
773     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
774     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
775     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
776     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
777     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
778     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
779     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
780     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
781   }
782 #endif
783   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
784 
785   PetscFunctionReturn(0);
786 }
787 
788 PetscErrorCode PetscSFPackGetFetchAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*))
789 {
790   PetscFunctionBegin;
791   *FetchAndOp = NULL;
792   if (mtype == PETSC_MEMTYPE_HOST) {
793     if (op == MPIU_REPLACE)                   *FetchAndOp = link->h_FetchAndInsert;
794     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->h_FetchAndAdd;
795     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->h_FetchAndMax;
796     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->h_FetchAndMin;
797     else if (op == MPI_MAXLOC)                *FetchAndOp = link->h_FetchAndMaxloc;
798     else if (op == MPI_MINLOC)                *FetchAndOp = link->h_FetchAndMinloc;
799     else if (op == MPI_PROD)                  *FetchAndOp = link->h_FetchAndMult;
800     else if (op == MPI_LAND)                  *FetchAndOp = link->h_FetchAndLAND;
801     else if (op == MPI_BAND)                  *FetchAndOp = link->h_FetchAndBAND;
802     else if (op == MPI_LOR)                   *FetchAndOp = link->h_FetchAndLOR;
803     else if (op == MPI_BOR)                   *FetchAndOp = link->h_FetchAndBOR;
804     else if (op == MPI_LXOR)                  *FetchAndOp = link->h_FetchAndLXOR;
805     else if (op == MPI_BXOR)                  *FetchAndOp = link->h_FetchAndBXOR;
806     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
807   }
808 #if defined(PETSC_HAVE_CUDA)
809   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
810     if (op == MPIU_REPLACE)                   *FetchAndOp = link->d_FetchAndInsert;
811     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->d_FetchAndAdd;
812     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->d_FetchAndMax;
813     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->d_FetchAndMin;
814     else if (op == MPI_MAXLOC)                *FetchAndOp = link->d_FetchAndMaxloc;
815     else if (op == MPI_MINLOC)                *FetchAndOp = link->d_FetchAndMinloc;
816     else if (op == MPI_PROD)                  *FetchAndOp = link->d_FetchAndMult;
817     else if (op == MPI_LAND)                  *FetchAndOp = link->d_FetchAndLAND;
818     else if (op == MPI_BAND)                  *FetchAndOp = link->d_FetchAndBAND;
819     else if (op == MPI_LOR)                   *FetchAndOp = link->d_FetchAndLOR;
820     else if (op == MPI_BOR)                   *FetchAndOp = link->d_FetchAndBOR;
821     else if (op == MPI_LXOR)                  *FetchAndOp = link->d_FetchAndLXOR;
822     else if (op == MPI_BXOR)                  *FetchAndOp = link->d_FetchAndBXOR;
823     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
824   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
825     if (op == MPIU_REPLACE)                   *FetchAndOp = link->da_FetchAndInsert;
826     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->da_FetchAndAdd;
827     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->da_FetchAndMax;
828     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->da_FetchAndMin;
829     else if (op == MPI_MAXLOC)                *FetchAndOp = link->da_FetchAndMaxloc;
830     else if (op == MPI_MINLOC)                *FetchAndOp = link->da_FetchAndMinloc;
831     else if (op == MPI_PROD)                  *FetchAndOp = link->da_FetchAndMult;
832     else if (op == MPI_LAND)                  *FetchAndOp = link->da_FetchAndLAND;
833     else if (op == MPI_BAND)                  *FetchAndOp = link->da_FetchAndBAND;
834     else if (op == MPI_LOR)                   *FetchAndOp = link->da_FetchAndLOR;
835     else if (op == MPI_BOR)                   *FetchAndOp = link->da_FetchAndBOR;
836     else if (op == MPI_LXOR)                  *FetchAndOp = link->da_FetchAndLXOR;
837     else if (op == MPI_BXOR)                  *FetchAndOp = link->da_FetchAndBXOR;
838     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
839   }
840 #endif
841   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
842   PetscFunctionReturn(0);
843 }
844 
845 /*
846   Create pack/unpack optimization plans based on indice patterns available
847 
848    Input Parameters:
849   +  n       - Number of target ranks
850   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
851   -  idx     - [*]   Array storing indices
852 
853    Output Parameters:
854   +  opt    - Optimization plans. Maybe NULL if no optimization can be built.
855 */
856 PetscErrorCode PetscSFPackOptCreate(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
857 {
858   PetscErrorCode ierr;
859   PetscInt       i,j,k,n_copies,tot_copies=0,step;
860   PetscBool      strided,optimized=PETSC_FALSE;
861   PetscSFPackOpt opt;
862 
863   PetscFunctionBegin;
864   if (!n) {
865     *out = NULL;
866     PetscFunctionReturn(0);
867   }
868 
869   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
870   ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr);
871   ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr);
872   if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/
873 
874   opt->n = n;
875 
876   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with multiple memcpy's ) */
877   for (i=0; i<n; i++) { /* for each target processor */
878     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
879     n_copies = 1;
880     for (j=offset[i]; j<offset[i+1]-1; j++) {
881       if (idx[j]+1 != idx[j+1]) n_copies++;
882     }
883     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
884        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
885      */
886     if ((offset[i+1]-offset[i])/n_copies >= 32) {
887       opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
888       optimized    = PETSC_TRUE;
889       tot_copies  += n_copies;
890     }
891   }
892 
893   /* Setup memcpy plan for each contiguous piece */
894   k    = 0; /* k-th copy */
895   ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
896   for (i=0; i<n; i++) { /* for each target processor */
897     if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
898       n_copies           = 1;
899       opt->copy_start[k] = offset[i] - offset[0];
900       for (j=offset[i]; j<offset[i+1]-1; j++) {
901         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
902           n_copies++;
903           opt->copy_start[k+1] = j-offset[0]+1;
904           opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
905           k++;
906         }
907       }
908       /* Set copy length of the last copy for this target */
909       opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
910       k++;
911     }
912     /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
913     opt->copy_offset[i+1] = k;
914   }
915 
916   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
917   for (i=0; i<n; i++) { /* for each remote */
918     if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
919       strided = PETSC_TRUE;
920       step    = idx[offset[i]+1] - idx[offset[i]];
921       for (j=offset[i]; j<offset[i+1]-1; j++) {
922         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
923       }
924       if (strided) {
925         opt->type[i]         = PETSCSF_PACKOPT_STRIDE;
926         opt->stride_step[i]  = step;
927         opt->stride_n[i]     = offset[i+1] - offset[i];
928         optimized            = PETSC_TRUE;
929       }
930     }
931   }
932   /* If no rank gets optimized, free arrays to save memory */
933   if (!optimized) {
934     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
935     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
936     ierr = PetscFree(opt);CHKERRQ(ierr);
937     *out = NULL;
938   } else *out = opt;
939   PetscFunctionReturn(0);
940 }
941 
942 PetscErrorCode PetscSFPackOptDestroy(PetscSFPackOpt *out)
943 {
944   PetscErrorCode ierr;
945   PetscSFPackOpt opt = *out;
946 
947   PetscFunctionBegin;
948   if (opt) {
949     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
950     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
951     ierr = PetscFree(opt);CHKERRQ(ierr);
952     *out = NULL;
953   }
954   PetscFunctionReturn(0);
955 }
956