xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision feb237ba1401b79cfa9a99e81307a265c4a68462)
1 
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6 #include <cuda_runtime.h>
7 #endif
8 /*
9  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
10  * therefore we pack data types manually. This file defines packing routines for the standard data types.
11  */
12 
13 #define CPPJoin2(a,b)         a ##_## b
14 #define CPPJoin3(a,b,c)       a ##_## b ##_## c
15 #define CPPJoin4_(a,b,c,d)    a##b##_##c##_##d
16 #define CPPJoin4(a,b,c,d)     CPPJoin4_(a##_,b,c,d)
17 
18 #define EXECUTE(statement)    statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */
19 #define IGNORE(statement)     do {} while(0)
20 
21 #define BINARY_OP(r,s,op,t)   do {(r) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
22 #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
23 #define LXOR_OP(r,s,op,t)     do {(r) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR */
24 #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0)
25 
26 #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */
27 
28 /* DEF_PackFunc - macro defining a Pack routine
29 
30    Arguments of the macro:
31    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
32    .BS        Block size for vectorization. It is a factor of bs.
33    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
34 
35    Arguments of the Pack routine:
36    +count     Number of indices in idx[]
37    .idx       Indices of entries to packed. NULL means contiguous indices, that is [0,count)
38    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39    .opt       Pack optimization plans. NULL means no plan at all.
40    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count)
41    -packed    Address of the packed data for each rank
42  */
43 #define DEF_PackFunc(Type,BS,EQ) \
44   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed) \
45   {                                                                                                          \
46     PetscErrorCode ierr;                                                                                     \
47     const Type     *u = (const Type*)unpacked,*u2;                                                           \
48     Type           *p = (Type*)packed,*p2;                                                                   \
49     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
50     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
51     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
52     PetscFunctionBegin;                                                                                      \
53     if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);}  /* Indices are contiguous */             \
54     else if (!opt) { /* No optimizations available */                                                        \
55       for (i=0; i<count; i++)                                                                                \
56         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
57           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
58             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
59     } else {                                                                                                 \
60       for (r=0; r<opt->n; r++) {                                                                             \
61         p2  = p + opt->offset[r]*MBS;                                                                        \
62         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
63           idx2 = idx + opt->offset[r];                                                                       \
64           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
65             for (j=0; j<M; j++)                                                                              \
66               for (k=0; k<BS; k++)                                                                           \
67                 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k];                                                    \
68         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
69           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) {                                        \
70             u2   = u + idx[opt->copy_start[i]]*MBS;                                                          \
71             l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */                       \
72             ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr);                                                     \
73             p2  += l;                                                                                        \
74           }                                                                                                  \
75         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
76           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
77           step = opt->stride_step[r];                                                                        \
78           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
79             for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j];                                            \
80         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
81       }                                                                                                      \
82     }                                                                                                        \
83     PetscFunctionReturn(0);                                                                                  \
84   }
85 
86 /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine
87 
88    Arguments:
89   +action     Unpack or Fetch
90   .Type       Type of the data
91   .BS         Block size for vectorization
92   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
93   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
94   .CType      Type with or without the const qualifier, i.e., const Type or Type
95   .Cvoid      void with or without the const qualifier, i.e., const void or void
96 
97   Notes:
98    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
99    The two arguments CType and Cvoid are used (instead of one constness argument), because we want to
100    get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument,
101    sometimes we input 'const', sometimes we have to input empty.
102 
103    If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare.
104  */
105 #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid)               \
106   static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
107   {                                                                                                          \
108     PetscErrorCode ierr;                                                                                     \
109     Type           *u = (Type*)unpacked,*u2;                                                                 \
110     CType          *p = (CType*)packed,*p2;                                                                  \
111     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
112     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
113     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
114     PetscFunctionBegin;                                                                                      \
115     if (!idx) {                                                                                              \
116       FILTER(Type *v);                                                                                       \
117       FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr));                                               \
118       FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr));                                             \
119              ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);                                              \
120       FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr));                                             \
121       FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                             \
122     } else if (!opt) { /* No optimizations available */                                                      \
123       for (i=0; i<count; i++)                                                                                \
124         for (j=0; j<M; j++)                                                                                  \
125           for (k=0; k<BS; k++) {                                                                             \
126             FILTER(Type t                = u[idx[i]*MBS+j*BS+k]);                                            \
127                    u[idx[i]*MBS+j*BS+k]  = p[i*MBS+j*BS+k];                                                  \
128             FILTER(p[i*MBS+j*BS+k]       = t);                                                               \
129           }                                                                                                  \
130     } else {                                                                                                 \
131       for (r=0; r<opt->n; r++) {                                                                             \
132         p2 = p + opt->offset[r]*MBS;                                                                         \
133         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
134           idx2 = idx + opt->offset[r];                                                                       \
135           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
136             for (j=0; j<M; j++)                                                                              \
137               for (k=0; k<BS; k++) {                                                                         \
138                 FILTER(Type t                = u[idx2[i]*MBS+j*BS+k]);                                       \
139                        u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
140                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
141               }                                                                                              \
142         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
143           FILTER(Type *v);                                                                                   \
144           FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \
145           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
146             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
147             l  = opt->copy_length[i]*MBS;                                                                    \
148             FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr));                                              \
149                    ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr);                                              \
150             FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr));                                              \
151             p2 += l;                                                                                         \
152           }                                                                                                  \
153           FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                         \
154         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
155           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
156           step = opt->stride_step[r];                                                                        \
157           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
158             for (j=0; j<M; j++)                                                                              \
159               for (k=0; k<BS; k++) {                                                                         \
160                 FILTER(Type t                = u2[i*step*MBS+j*BS+k]);                                       \
161                        u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
162                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
163               }                                                                                              \
164         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
165       }                                                                                                      \
166     }                                                                                                        \
167     PetscFunctionReturn(0);                                                                                  \
168   }
169 
170 /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc
171 
172    Arguments:
173   +action     Unpack or Fetch
174   .opname     Name of the Op, such as Add, Mult, LAND, etc.
175   .Type       Type of the data
176   .BS         Block size for vectorization
177   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
178   .op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
179   .APPLY      Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP
180   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
181   .CType      Type with or without the const qualifier, i.e., const Type or Type
182   -Cvoid      void with or without the const qualifier, i.e., const void or void
183  */
184 #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \
185   static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
186   {                                                                                                          \
187     Type           *u = (Type*)unpacked,*u2,t;                                                               \
188     CType          *p = (CType*)packed,*p2;                                                                  \
189     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
190     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
191     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
192     PetscFunctionBegin;                                                                                      \
193     if (!idx) {                                                                                              \
194       for (i=0; i<count; i++)                                                                                \
195         for (j=0; j<M; j++)                                                                                  \
196           for (k=0; k<BS; k++) {                                                                             \
197             t    = u[i*MBS+j*BS+k];                                                                          \
198             APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                                    \
199             FILTER(p[i*MBS+j*BS+k] = t);                                                                     \
200           }                                                                                                  \
201     } else if (!opt) { /* No optimizations available */                                                      \
202       for (i=0; i<count; i++)                                                                                \
203         for (j=0; j<M; j++)                                                                                  \
204           for (k=0; k<BS; k++) {                                                                             \
205               t    = u[idx[i]*MBS+j*BS+k];                                                                   \
206               APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                             \
207               FILTER(p[i*MBS+j*BS+k] = t);                                                                   \
208           }                                                                                                  \
209     } else {                                                                                                 \
210       for (r=0; r<opt->n; r++) {                                                                             \
211         p2 = p + opt->offset[r]*MBS;                                                                         \
212         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
213           idx2 = idx + opt->offset[r];                                                                       \
214           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
215             for (j=0; j<M; j++)                                                                              \
216               for (k=0; k<BS; k++) {                                                                         \
217                 t    = u[idx2[i]*MBS+j*BS+k];                                                                \
218                 APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
219                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
220               }                                                                                              \
221         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
222           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
223             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
224             l  = opt->copy_length[i]*MBS;                                                                    \
225             for (j=0; j<l; j++) {                                                                            \
226               t    = u2[j];                                                                                  \
227               APPLY (u2[j],t,op,p2[j]);                                                                      \
228               FILTER(p2[j] = t);                                                                             \
229             }                                                                                                \
230             p2 += l;                                                                                         \
231           }                                                                                                  \
232         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
233           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
234           step = opt->stride_step[r];                                                                        \
235           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
236             for (j=0; j<M; j++)                                                                              \
237               for (k=0; k<BS; k++) {                                                                         \
238                 t    = u2[i*step*MBS+j*BS+k];                                                                \
239                 APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
240                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
241               }                                                                                              \
242         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
243       }                                                                                                      \
244     }                                                                                                        \
245     PetscFunctionReturn(0);                                                                                  \
246   }
247 
248 /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine
249 
250    Arguments:
251   +Action     Unpack or Fetch
252   .locname    Max or Min
253   .type1      Type of the first data in a pair type
254   .type2      Type of the second data in a pair type, usually PetscMPIInt for MPI ranks.
255   .op         > or <
256   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
257   .CType      Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2)
258   -Cvoid      void with or without the const qualifier, i.e., const void or void
259  */
260 #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \
261   static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \
262     PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked;                                             \
263     CType                 *p = (CType*)packed;                                                               \
264     PetscInt              i,j;                                                                               \
265     for (i=0; i<count; i++) {                                                                                \
266       FILTER(PairType(Type1,Type2) v);                                                                       \
267       j = idx? idx[i] : i;                                                                                   \
268       FILTER(v = u[j]);                                                                                      \
269       if (p[i].a op u[j].a) {                                                                                \
270         u[j] = p[i];                                                                                         \
271       } else if (p[i].a == u[j].a) {                                                                         \
272         u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */                                 \
273       }                                                                                                      \
274       FILTER(p[i] = v);                                                                                      \
275     }                                                                                                        \
276     PetscFunctionReturn(0);                                                                                  \
277   }
278 
279 /* Pack, Unpack/Fetch ops */
280 #define DEF_Pack(Type,BS,EQ)                                                                   \
281   DEF_PackFunc(Type,BS,EQ)                                                                     \
282   DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void)                                   \
283   DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void)                                              \
284   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) {                           \
285     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);                            \
286     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);                            \
287     link->h_FetchAndInsert  = CPPJoin4(FetchAndInsert, Type,BS,EQ);                            \
288   }
289 
290 /* Add, Mult ops */
291 #define DEF_Add(Type,BS,EQ)                                                                    \
292   DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void)             \
293   DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void)             \
294   DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void)                        \
295   DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void)                        \
296   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) {                            \
297     link->h_UnpackAndAdd    = CPPJoin4(UnpackAndAdd,   Type,BS,EQ);                            \
298     link->h_UnpackAndMult   = CPPJoin4(UnpackAndMult,  Type,BS,EQ);                            \
299     link->h_FetchAndAdd     = CPPJoin4(FetchAndAdd,    Type,BS,EQ);                            \
300     link->h_FetchAndMult    = CPPJoin4(FetchAndMult,   Type,BS,EQ);                            \
301   }
302 
303 /* Max, Min ops */
304 #define DEF_Cmp(Type,BS,EQ)                                                                    \
305   DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void)     \
306   DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void)     \
307   DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void)                \
308   DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void)                \
309   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) {                        \
310     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);                            \
311     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);                            \
312     link->h_FetchAndMax     = CPPJoin4(FetchAndMax ,   Type,BS,EQ);                            \
313     link->h_FetchAndMin     = CPPJoin4(FetchAndMin ,   Type,BS,EQ);                            \
314   }
315 
316 /* Logical ops.
317   The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid
318   the compilation warning "empty macro arguments are undefined in ISO C90"
319  */
320 #define DEF_Log(Type,BS,EQ)                                                                    \
321   DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void)            \
322   DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void)            \
323   DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP,  IGNORE,const Type,const void)            \
324   DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void)                       \
325   DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void)                       \
326   DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP,  EXECUTE,Type,void)                       \
327   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) {                        \
328     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND,Type,BS,EQ);                              \
329     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR, Type,BS,EQ);                              \
330     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR,Type,BS,EQ);                              \
331     link->h_FetchAndLAND    = CPPJoin4(FetchAndLAND, Type,BS,EQ);                              \
332     link->h_FetchAndLOR     = CPPJoin4(FetchAndLOR,  Type,BS,EQ);                              \
333     link->h_FetchAndLXOR    = CPPJoin4(FetchAndLXOR, Type,BS,EQ);                              \
334   }
335 
336 /* Bitwise ops */
337 #define DEF_Bit(Type,BS,EQ)                                                                    \
338   DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void)             \
339   DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void)             \
340   DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void)             \
341   DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void)                        \
342   DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void)                        \
343   DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void)                        \
344   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) {                        \
345     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND,Type,BS,EQ);                              \
346     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR, Type,BS,EQ);                              \
347     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR,Type,BS,EQ);                              \
348     link->h_FetchAndBAND    = CPPJoin4(FetchAndBAND, Type,BS,EQ);                              \
349     link->h_FetchAndBOR     = CPPJoin4(FetchAndBOR,  Type,BS,EQ);                              \
350     link->h_FetchAndBXOR    = CPPJoin4(FetchAndBXOR, Type,BS,EQ);                              \
351   }
352 
353 /* Maxloc, Minloc */
354 #define DEF_Xloc(Type1,Type2)                                                                  \
355   DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void)    \
356   DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void)    \
357   DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void)               \
358   DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void)               \
359   static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) {                          \
360     link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1);             \
361     link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1);             \
362     link->h_FetchAndMaxloc  = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1);             \
363     link->h_FetchAndMinloc  = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1);             \
364   }
365 
366 #define DEF_IntegerType(Type,BS,EQ)                                                            \
367   DEF_Pack(Type,BS,EQ)                                                                         \
368   DEF_Add(Type,BS,EQ)                                                                          \
369   DEF_Cmp(Type,BS,EQ)                                                                          \
370   DEF_Log(Type,BS,EQ)                                                                          \
371   DEF_Bit(Type,BS,EQ)                                                                          \
372   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) {                    \
373     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
374     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
375     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
376     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                               \
377     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                               \
378   }
379 
380 #define DEF_RealType(Type,BS,EQ)                                                               \
381   DEF_Pack(Type,BS,EQ)                                                                         \
382   DEF_Add(Type,BS,EQ)                                                                          \
383   DEF_Cmp(Type,BS,EQ)                                                                          \
384   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) {                       \
385     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
386     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
387     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
388   }
389 
390 #if defined(PETSC_HAVE_COMPLEX)
391 #define DEF_ComplexType(Type,BS,EQ)                                                            \
392   DEF_Pack(Type,BS,EQ)                                                                         \
393   DEF_Add(Type,BS,EQ)                                                                          \
394   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) {                    \
395     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
396     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
397   }
398 #endif
399 
400 #define DEF_DumbType(Type,BS,EQ)                                                               \
401   DEF_Pack(Type,BS,EQ)                                                                         \
402   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) {                       \
403     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
404   }
405 
406 /* Maxloc, Minloc */
407 #define DEF_PairType(Type1,Type2)                                                              \
408   typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2);                                    \
409   DEF_Pack(PairType(Type1,Type2),1,1)                                                          \
410   DEF_Xloc(Type1,Type2)                                                                        \
411   static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) {                      \
412     CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link);                                   \
413     CPPJoin3(PackInit_Xloc,Type1,Type2)(link);                                                 \
414   }
415 
416 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
417 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
418 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
419 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
420 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
421 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
422 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
423 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
424 
425 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
426 DEF_IntegerType(int,1,1)
427 DEF_IntegerType(int,2,1)
428 DEF_IntegerType(int,4,1)
429 DEF_IntegerType(int,8,1)
430 DEF_IntegerType(int,1,0)
431 DEF_IntegerType(int,2,0)
432 DEF_IntegerType(int,4,0)
433 DEF_IntegerType(int,8,0)
434 #endif
435 
436 /* The typedefs are used to get a typename without space that CPPJoin can handle */
437 typedef signed char SignedChar;
438 DEF_IntegerType(SignedChar,1,1)
439 DEF_IntegerType(SignedChar,2,1)
440 DEF_IntegerType(SignedChar,4,1)
441 DEF_IntegerType(SignedChar,8,1)
442 DEF_IntegerType(SignedChar,1,0)
443 DEF_IntegerType(SignedChar,2,0)
444 DEF_IntegerType(SignedChar,4,0)
445 DEF_IntegerType(SignedChar,8,0)
446 
447 typedef unsigned char UnsignedChar;
448 DEF_IntegerType(UnsignedChar,1,1)
449 DEF_IntegerType(UnsignedChar,2,1)
450 DEF_IntegerType(UnsignedChar,4,1)
451 DEF_IntegerType(UnsignedChar,8,1)
452 DEF_IntegerType(UnsignedChar,1,0)
453 DEF_IntegerType(UnsignedChar,2,0)
454 DEF_IntegerType(UnsignedChar,4,0)
455 DEF_IntegerType(UnsignedChar,8,0)
456 
457 DEF_RealType(PetscReal,1,1)
458 DEF_RealType(PetscReal,2,1)
459 DEF_RealType(PetscReal,4,1)
460 DEF_RealType(PetscReal,8,1)
461 DEF_RealType(PetscReal,1,0)
462 DEF_RealType(PetscReal,2,0)
463 DEF_RealType(PetscReal,4,0)
464 DEF_RealType(PetscReal,8,0)
465 
466 #if defined(PETSC_HAVE_COMPLEX)
467 DEF_ComplexType(PetscComplex,1,1)
468 DEF_ComplexType(PetscComplex,2,1)
469 DEF_ComplexType(PetscComplex,4,1)
470 DEF_ComplexType(PetscComplex,8,1)
471 DEF_ComplexType(PetscComplex,1,0)
472 DEF_ComplexType(PetscComplex,2,0)
473 DEF_ComplexType(PetscComplex,4,0)
474 DEF_ComplexType(PetscComplex,8,0)
475 #endif
476 
477 DEF_PairType(int,int)
478 DEF_PairType(PetscInt,PetscInt)
479 
480 /* If we don't know the basic type, we treat it as a stream of chars or ints */
481 DEF_DumbType(char,1,1)
482 DEF_DumbType(char,2,1)
483 DEF_DumbType(char,4,1)
484 DEF_DumbType(char,1,0)
485 DEF_DumbType(char,2,0)
486 DEF_DumbType(char,4,0)
487 
488 typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
489 DEF_DumbType(DumbInt,1,1)
490 DEF_DumbType(DumbInt,2,1)
491 DEF_DumbType(DumbInt,4,1)
492 DEF_DumbType(DumbInt,8,1)
493 DEF_DumbType(DumbInt,1,0)
494 DEF_DumbType(DumbInt,2,0)
495 DEF_DumbType(DumbInt,4,0)
496 DEF_DumbType(DumbInt,8,0)
497 
498 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
499 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
500 {
501   int ierr;
502   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
503   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
504   return MPI_SUCCESS;
505 }
506 #endif
507 
508 PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink)
509 {
510   PetscErrorCode    ierr;
511   PetscSFPack       link,*p;
512   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
513 
514   PetscFunctionBegin;
515   /* Look for types in cache */
516   for (p=&bas->inuse; (link=*p); p=&link->next) {
517     PetscBool match;
518     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
519     if (match && (rkey == link->rkey) && (lkey == link->lkey)) {
520       switch (cmode) {
521       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
522       case PETSC_USE_POINTER: break;
523       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
524       }
525       *mylink = link;
526       PetscFunctionReturn(0);
527     }
528   }
529   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
530   PetscFunctionReturn(0);
531 }
532 
533 PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link)
534 {
535   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
536 
537   PetscFunctionBegin;
538   (*link)->rkey = NULL;
539   (*link)->lkey = NULL;
540   (*link)->next = bas->avail;
541   bas->avail    = *link;
542   *link         = NULL;
543   PetscFunctionReturn(0);
544 }
545 
546 /* Destroy all links, i.e., PetscSFPacks in the linked list, usually named 'avail' */
547 PetscErrorCode PetscSFPackDestoryAvailable(PetscSFPack *avail)
548 {
549   PetscErrorCode    ierr;
550   PetscSFPack       link=*avail,next;
551   PetscInt          i;
552 
553   PetscFunctionBegin;
554   for (; link; link=next) {
555     next = link->next;
556     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
557     for (i=0; i<(link->nrootreqs+link->nleafreqs)*4; i++) { /* Persistent reqs must be freed. */
558       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
559     }
560     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
561     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
562     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
563     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->selfbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
564 
565 #if defined(PETSC_HAVE_CUDA)
566     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
567     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
568     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->selfbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
569     if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
570 #endif
571     ierr = PetscFree(link);CHKERRQ(ierr);
572   }
573   *avail = NULL;
574   PetscFunctionReturn(0);
575 }
576 
577 /* Error out on unsupported overlapped communications */
578 PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey)
579 {
580   PetscErrorCode    ierr;
581   PetscSFPack       link,*p;
582   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
583   PetscBool         match;
584 
585   PetscFunctionBegin;
586   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
587      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
588   */
589   if (rkey || lkey) {
590     for (p=&bas->inuse; (link=*p); p=&link->next) {
591       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
592       if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey);
593     }
594   }
595   PetscFunctionReturn(0);
596 }
597 
598 PetscErrorCode PetscSFPackSetUp_Host(PetscSF sf,PetscSFPack link,MPI_Datatype unit)
599 {
600   PetscErrorCode ierr;
601   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
602   PetscBool      is2Int,is2PetscInt;
603   PetscMPIInt    ni,na,nd,combiner;
604 #if defined(PETSC_HAVE_COMPLEX)
605   PetscInt       nPetscComplex=0;
606 #endif
607 
608   PetscFunctionBegin;
609   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
610   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
611   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
612   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
613   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
614   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
615 #if defined(PETSC_HAVE_COMPLEX)
616   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
617 #endif
618   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
619   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
620   /* TODO: shaell we also handle Fortran MPI_2REAL? */
621   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
622   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE;
623   link->bs = 1; /* default */
624 
625   if (is2Int) {
626     PackInit_PairType_int_int(link);
627     link->bs        = 1;
628     link->unitbytes = 2*sizeof(int);
629     link->basicunit = MPI_2INT;
630   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
631     PackInit_PairType_PetscInt_PetscInt(link);
632     link->bs        = 1;
633     link->unitbytes = 2*sizeof(PetscInt);
634     link->basicunit = MPIU_2INT;
635   } else if (nPetscReal) {
636     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
637     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
638     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
639     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
640     link->bs        = nPetscReal;
641     link->unitbytes = nPetscReal*sizeof(PetscReal);
642     link->basicunit = MPIU_REAL;
643   } else if (nPetscInt) {
644     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
645     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
646     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
647     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
648     link->bs        = nPetscInt;
649     link->unitbytes = nPetscInt*sizeof(PetscInt);
650     link->basicunit = MPIU_INT;
651 #if defined(PETSC_USE_64BIT_INDICES)
652   } else if (nInt) {
653     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
654     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
655     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
656     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
657     link->bs        = nInt;
658     link->unitbytes = nInt*sizeof(int);
659     link->basicunit = MPI_INT;
660 #endif
661   } else if (nSignedChar) {
662     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
663     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
664     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
665     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
666     link->bs        = nSignedChar;
667     link->unitbytes = nSignedChar*sizeof(SignedChar);
668     link->basicunit = MPI_SIGNED_CHAR;
669   }  else if (nUnsignedChar) {
670     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
671     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
672     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
673     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
674     link->bs        = nUnsignedChar;
675     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
676     link->basicunit = MPI_UNSIGNED_CHAR;
677 #if defined(PETSC_HAVE_COMPLEX)
678   } else if (nPetscComplex) {
679     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
680     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
681     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
682     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
683     link->bs        = nPetscComplex;
684     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
685     link->basicunit = MPIU_COMPLEX;
686 #endif
687   } else {
688     MPI_Aint lb,nbyte;
689     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
690     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
691     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
692       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
693       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
694       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
695       link->bs        = nbyte;
696       link->unitbytes = nbyte;
697       link->basicunit = MPI_BYTE;
698     } else {
699       nInt = nbyte / sizeof(int);
700       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
701       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
702       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
703       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
704       link->bs        = nInt;
705       link->unitbytes = nbyte;
706       link->basicunit = MPI_INT;
707     }
708   }
709 
710   if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */
711   else {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
712   PetscFunctionReturn(0);
713 }
714 
715 PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*))
716 {
717   PetscFunctionBegin;
718   *UnpackAndOp = NULL;
719   if (mtype == PETSC_MEMTYPE_HOST) {
720     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
721     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
722     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
723     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
724     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
725     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
726     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
727     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
728     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
729     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
730     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
731     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
732     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
733   }
734 #if defined(PETSC_HAVE_CUDA)
735   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
736     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
737     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
738     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
739     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
740     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
741     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
742     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
743     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
744     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
745     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
746     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
747     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
748     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
749   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
750     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
751     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
752     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
753     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
754     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
755     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
756     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
757     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
758     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
759     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
760     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
761     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
762     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
763   }
764 #endif
765   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
766 
767   PetscFunctionReturn(0);
768 }
769 
770 PetscErrorCode PetscSFPackGetFetchAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*))
771 {
772   PetscFunctionBegin;
773   *FetchAndOp = NULL;
774   if (mtype == PETSC_MEMTYPE_HOST) {
775     if (op == MPIU_REPLACE)                   *FetchAndOp = link->h_FetchAndInsert;
776     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->h_FetchAndAdd;
777     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->h_FetchAndMax;
778     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->h_FetchAndMin;
779     else if (op == MPI_MAXLOC)                *FetchAndOp = link->h_FetchAndMaxloc;
780     else if (op == MPI_MINLOC)                *FetchAndOp = link->h_FetchAndMinloc;
781     else if (op == MPI_PROD)                  *FetchAndOp = link->h_FetchAndMult;
782     else if (op == MPI_LAND)                  *FetchAndOp = link->h_FetchAndLAND;
783     else if (op == MPI_BAND)                  *FetchAndOp = link->h_FetchAndBAND;
784     else if (op == MPI_LOR)                   *FetchAndOp = link->h_FetchAndLOR;
785     else if (op == MPI_BOR)                   *FetchAndOp = link->h_FetchAndBOR;
786     else if (op == MPI_LXOR)                  *FetchAndOp = link->h_FetchAndLXOR;
787     else if (op == MPI_BXOR)                  *FetchAndOp = link->h_FetchAndBXOR;
788     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
789   }
790 #if defined(PETSC_HAVE_CUDA)
791   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
792     if (op == MPIU_REPLACE)                   *FetchAndOp = link->d_FetchAndInsert;
793     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->d_FetchAndAdd;
794     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->d_FetchAndMax;
795     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->d_FetchAndMin;
796     else if (op == MPI_MAXLOC)                *FetchAndOp = link->d_FetchAndMaxloc;
797     else if (op == MPI_MINLOC)                *FetchAndOp = link->d_FetchAndMinloc;
798     else if (op == MPI_PROD)                  *FetchAndOp = link->d_FetchAndMult;
799     else if (op == MPI_LAND)                  *FetchAndOp = link->d_FetchAndLAND;
800     else if (op == MPI_BAND)                  *FetchAndOp = link->d_FetchAndBAND;
801     else if (op == MPI_LOR)                   *FetchAndOp = link->d_FetchAndLOR;
802     else if (op == MPI_BOR)                   *FetchAndOp = link->d_FetchAndBOR;
803     else if (op == MPI_LXOR)                  *FetchAndOp = link->d_FetchAndLXOR;
804     else if (op == MPI_BXOR)                  *FetchAndOp = link->d_FetchAndBXOR;
805     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
806   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
807     if (op == MPIU_REPLACE)                   *FetchAndOp = link->da_FetchAndInsert;
808     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->da_FetchAndAdd;
809     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->da_FetchAndMax;
810     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->da_FetchAndMin;
811     else if (op == MPI_MAXLOC)                *FetchAndOp = link->da_FetchAndMaxloc;
812     else if (op == MPI_MINLOC)                *FetchAndOp = link->da_FetchAndMinloc;
813     else if (op == MPI_PROD)                  *FetchAndOp = link->da_FetchAndMult;
814     else if (op == MPI_LAND)                  *FetchAndOp = link->da_FetchAndLAND;
815     else if (op == MPI_BAND)                  *FetchAndOp = link->da_FetchAndBAND;
816     else if (op == MPI_LOR)                   *FetchAndOp = link->da_FetchAndLOR;
817     else if (op == MPI_BOR)                   *FetchAndOp = link->da_FetchAndBOR;
818     else if (op == MPI_LXOR)                  *FetchAndOp = link->da_FetchAndLXOR;
819     else if (op == MPI_BXOR)                  *FetchAndOp = link->da_FetchAndBXOR;
820     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
821   }
822 #endif
823   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
824   PetscFunctionReturn(0);
825 }
826 
827 /*
828   Create pack/unpack optimization plans based on indice patterns available
829 
830    Input Parameters:
831   +  n       - Number of target ranks
832   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
833   -  idx     - [*]   Array storing indices
834 
835    Output Parameters:
836   +  opt    - Optimization plans. Maybe NULL if no optimization can be built.
837 */
838 PetscErrorCode PetscSFPackOptCreate(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
839 {
840   PetscErrorCode ierr;
841   PetscInt       i,j,k,n_copies,tot_copies=0,step;
842   PetscBool      strided,optimized=PETSC_FALSE;
843   PetscSFPackOpt opt;
844 
845   PetscFunctionBegin;
846   if (!n) {
847     *out = NULL;
848     PetscFunctionReturn(0);
849   }
850 
851   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
852   ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr);
853   ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr);
854   if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/
855 
856   opt->n = n;
857 
858   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */
859   for (i=0; i<n; i++) { /* for each target processor */
860     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
861     n_copies = 1;
862     for (j=offset[i]; j<offset[i+1]-1; j++) {
863       if (idx[j]+1 != idx[j+1]) n_copies++;
864     }
865     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
866        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
867      */
868     if ((offset[i+1]-offset[i])/n_copies >= 32) {
869       opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
870       optimized    = PETSC_TRUE;
871       tot_copies  += n_copies;
872     }
873   }
874 
875   /* Setup memcpy plan for each contiguous piece */
876   k    = 0; /* k-th copy */
877   ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
878   for (i=0; i<n; i++) { /* for each target processor */
879     if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
880       n_copies           = 1;
881       opt->copy_start[k] = offset[i] - offset[0];
882       for (j=offset[i]; j<offset[i+1]-1; j++) {
883         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
884           n_copies++;
885           opt->copy_start[k+1] = j-offset[0]+1;
886           opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
887           k++;
888         }
889       }
890       /* Set copy length of the last copy for this target */
891       opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
892       k++;
893     }
894     /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
895     opt->copy_offset[i+1] = k;
896   }
897 
898   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
899   for (i=0; i<n; i++) { /* for each remote */
900     if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
901       strided = PETSC_TRUE;
902       step    = idx[offset[i]+1] - idx[offset[i]];
903       for (j=offset[i]; j<offset[i+1]-1; j++) {
904         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
905       }
906       if (strided) {
907         opt->type[i]         = PETSCSF_PACKOPT_STRIDE;
908         opt->stride_step[i]  = step;
909         opt->stride_n[i]     = offset[i+1] - offset[i];
910         optimized            = PETSC_TRUE;
911       }
912     }
913   }
914   /* If no rank gets optimized, free arrays to save memory */
915   if (!optimized) {
916     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
917     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
918     ierr = PetscFree(opt);CHKERRQ(ierr);
919     *out = NULL;
920   } else *out = opt;
921   PetscFunctionReturn(0);
922 }
923 
924 PetscErrorCode PetscSFPackOptDestory(PetscSFPackOpt *out)
925 {
926   PetscErrorCode ierr;
927   PetscSFPackOpt opt = *out;
928 
929   PetscFunctionBegin;
930   if (opt) {
931     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
932     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
933     ierr = PetscFree(opt);CHKERRQ(ierr);
934     *out = NULL;
935   }
936   PetscFunctionReturn(0);
937 }
938