xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision b23bfdefca792e3d9f47034521b8d4c437693123)
1 
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /*
6  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
7  * therefore we pack data types manually. This file defines packing routines for the standard data types.
8  */
9 
10 #define CPPJoin2(a,b)         a ##_## b
11 #define CPPJoin3(a,b,c)       a ##_## b ##_## c
12 #define CPPJoin4_(a,b,c,d)    a##b##_##c##_##d
13 #define CPPJoin4(a,b,c,d)     CPPJoin4_(a##_,b,c,d)
14 
15 #define EXECUTE(statement)    statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */
16 #define IGNORE(statement)     do {} while(0)
17 
18 #define BINARY_OP(r,s,op,t)   do {(r) = (s) op (t);  } while(0)  /* binary ops in the middle such as +, *, && etc. */
19 #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0)  /* ops like a function, such as PetscMax, PetscMin */
20 #define LXOR_OP(r,s,op,t)     do {(r) = (!s) != (!t);} while(0)  /* logical exclusive OR */
21 #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0)
22 
23 #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */
24 
25 /* DEF_PackFunc - macro defining a Pack routine
26 
27    Arguments of the macro:
28    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
29    .BS        Block size for vectorization. It is a factor of bs.
30    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
31 
32    Arguments of the Pack routine:
33    +count     Number of indices in idx[]
34    .idx       Indices of entries to packed. NULL means contiguous indices, that is [0,count)
35    .bs        Number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int
36    .opt       Pack optimization plans. NULL means no plan at all.
37    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count)
38    -packed    Address of the packed data for each rank
39  */
40 #define DEF_PackFunc(Type,BS,EQ) \
41   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,const void *unpacked,void *packed) \
42   {                                                                                                          \
43     PetscErrorCode ierr;                                                                                     \
44     const Type     *u = (const Type*)unpacked,*u2;                                                           \
45     Type           *p = (Type*)packed,*p2;                                                                   \
46     PetscInt       i,j,k,l,r,step;                                                                           \
47     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
48     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
49     PetscFunctionBegin;                                                                                      \
50     if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);}  /* Indices are contiguous */             \
51     else if (!opt) { /* No optimizations available */                                                        \
52       for (i=0; i<count; i++)                                                                                \
53         for (j=0; j<M; j++)     /* Decent compilers should fuse this loop when M = const 1 */                \
54           for (k=0; k<BS; k++)  /* Compiler either fuses (BS=1) or vectorizes (BS=2,4,8,etc) this loop */    \
55             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
56     } else {                                                                                                 \
57       for (r=0; r<opt->n; r++) {                                                                             \
58         p2  = p + opt->offset[r]*MBS;                                                                        \
59         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
60           idx2 = idx + opt->offset[r];                                                                       \
61           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
62             for (j=0; j<M; j++)                                                                              \
63               for (k=0; k<BS; k++)                                                                           \
64                 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k];                                                    \
65         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
66           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) {                                        \
67             u2   = u + idx[opt->copy_start[i]]*MBS;                                                          \
68             l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */                       \
69             ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr);                                                     \
70             p2  += l;                                                                                        \
71           }                                                                                                  \
72         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
73           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
74           step = opt->stride_step[r];                                                                        \
75           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
76             for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j];                                            \
77         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
78       }                                                                                                      \
79     }                                                                                                        \
80     PetscFunctionReturn(0);                                                                                  \
81   }
82 
83 /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine
84 
85    Arguments:
86   +action     Unpack or Fetch
87   .Type       Type of the data
88   .BS         Block size for vectorization
89   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
90   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
91   .CType      Type with or without the const qualifier, i.e., const Type or Type
92   .Cvoid      void with or without the const qualifier, i.e., const void or void
93 
94   Notes:
95    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
96    The two arguments CType and Cvoid are used (instead of one constness argument), because we want to
97    get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument,
98    sometimes we input 'const', sometimes we have to input empty.
99 
100    If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare.
101  */
102 #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid)               \
103   static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
104   {                                                                                                          \
105     PetscErrorCode ierr;                                                                                     \
106     Type           *u = (Type*)unpacked,*u2;                                                                 \
107     CType          *p = (CType*)packed,*p2;                                                                  \
108     PetscInt       i,j,k,l,r,step;                                                                           \
109     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
110     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
111     PetscFunctionBegin;                                                                                      \
112     if (!idx) {                                                                                              \
113       FILTER(Type *v);                                                                                       \
114       FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr));                                               \
115       FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr));                                             \
116              ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);                                              \
117       FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr));                                             \
118       FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                             \
119     } else if (!opt) { /* No optimizations available */                                                      \
120       for (i=0; i<count; i++)                                                                                \
121         for (j=0; j<M; j++)                                                                                  \
122           for (k=0; k<BS; k++) {                                                                             \
123             FILTER(Type t                = u[idx[i]*MBS+j*BS+k]);                                            \
124                    u[idx[i]*MBS+j*BS+k]  = p[i*MBS+j*BS+k];                                                  \
125             FILTER(p[i*MBS+j*BS+k]       = t);                                                               \
126           }                                                                                                  \
127     } else {                                                                                                 \
128       for (r=0; r<opt->n; r++) {                                                                             \
129         p2 = p + opt->offset[r]*MBS;                                                                         \
130         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
131           idx2 = idx + opt->offset[r];                                                                       \
132           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
133             for (j=0; j<M; j++)                                                                              \
134               for (k=0; k<BS; k++) {                                                                         \
135                 FILTER(Type t                = u[idx2[i]*MBS+j*BS+k]);                                       \
136                        u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
137                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
138               }                                                                                              \
139         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
140           FILTER(Type *v);                                                                                   \
141           FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \
142           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
143             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
144             l  = opt->copy_length[i]*MBS;                                                                    \
145             FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr));                                              \
146                    ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr);                                              \
147             FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr));                                              \
148             p2 += l;                                                                                         \
149           }                                                                                                  \
150           FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                         \
151         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
152           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
153           step = opt->stride_step[r];                                                                        \
154           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
155             for (j=0; j<M; j++)                                                                              \
156               for (k=0; k<BS; k++) {                                                                         \
157                 FILTER(Type t                = u2[i*step*MBS+j*BS+k]);                                       \
158                        u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
159                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
160               }                                                                                              \
161         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
162       }                                                                                                      \
163     }                                                                                                        \
164     PetscFunctionReturn(0);                                                                                  \
165   }
166 
167 /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc
168 
169    Arguments:
170   +action     Unpack or Fetch
171   .opname     Name of the Op, such as Add, Mult, LAND, etc.
172   .Type       Type of the data
173   .BS         Block size for vectorization
174   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
175   .op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
176   .APPLY      Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP
177   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
178   .CType      Type with or without the const qualifier, i.e., const Type or Type
179   -Cvoid      void with or without the const qualifier, i.e., const void or void
180  */
181 #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \
182   static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
183   {                                                                                                          \
184     Type           *u = (Type*)unpacked,*u2,t;                                                               \
185     CType          *p = (CType*)packed,*p2;                                                                  \
186     PetscInt       i,j,k,l,r,step;                                                                           \
187     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
188     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
189     PetscFunctionBegin;                                                                                      \
190     if (!idx) {                                                                                              \
191       for (i=0; i<count; i++)                                                                                \
192         for (j=0; j<M; j++)                                                                                  \
193           for (k=0; k<BS; k++) {                                                                             \
194             t    = u[i*MBS+j*BS+k];                                                                          \
195             APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                                    \
196             FILTER(p[i*MBS+j*BS+k] = t);                                                                     \
197           }                                                                                                  \
198     } else if (!opt) { /* No optimizations available */                                                      \
199       for (i=0; i<count; i++)                                                                                \
200         for (j=0; j<M; j++)                                                                                  \
201           for (k=0; k<BS; k++) {                                                                             \
202               t    = u[idx[i]*MBS+j*BS+k];                                                                   \
203               APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                             \
204               FILTER(p[i*MBS+j*BS+k] = t);                                                                   \
205           }                                                                                                  \
206     } else {                                                                                                 \
207       for (r=0; r<opt->n; r++) {                                                                             \
208         p2 = p + opt->offset[r]*MBS;                                                                         \
209         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
210           idx2 = idx + opt->offset[r];                                                                       \
211           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
212             for (j=0; j<M; j++)                                                                              \
213               for (k=0; k<BS; k++) {                                                                         \
214                 t    = u[idx2[i]*MBS+j*BS+k];                                                                \
215                 APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
216                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
217               }                                                                                              \
218         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
219           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
220             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
221             l  = opt->copy_length[i]*MBS;                                                                    \
222             for (j=0; j<l; j++) {                                                                            \
223               t    = u2[j];                                                                                  \
224               APPLY (u2[j],t,op,p2[j]);                                                                      \
225               FILTER(p2[j] = t);                                                                             \
226             }                                                                                                \
227             p2 += l;                                                                                         \
228           }                                                                                                  \
229         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
230           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
231           step = opt->stride_step[r];                                                                        \
232           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
233             for (j=0; j<M; j++)                                                                              \
234               for (k=0; k<BS; k++) {                                                                         \
235                 t    = u2[i*step*MBS+j*BS+k];                                                                \
236                 APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
237                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
238               }                                                                                              \
239         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
240       }                                                                                                      \
241     }                                                                                                        \
242     PetscFunctionReturn(0);                                                                                  \
243   }
244 
245 /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine
246 
247    Arguments:
248   +Action     Unpack or Fetch
249   .locname    Max or Min
250   .type1      Type of the first data in a pair type
251   .type2      Type of the second data in a pair type, usually PetscMPIInt for MPI ranks.
252   .op         > or <
253   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
254   .CType      Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2)
255   -Cvoid      void with or without the const qualifier, i.e., const void or void
256  */
257 #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \
258   static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \
259     PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked;                                             \
260     CType                 *p = (CType*)packed;                                                               \
261     PetscInt              i,j;                                                                               \
262     for (i=0; i<count; i++) {                                                                                \
263       j = idx? idx[i] : i;                                                                                   \
264       FILTER(PairType(Type1,Type2) v = u[j]);                                                                \
265       if (p[i].a op u[j].a) {                                                                                \
266         u[j] = p[i];                                                                                         \
267       } else if (p[i].a == u[j].a) {                                                                         \
268         u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */                                 \
269       }                                                                                                      \
270       FILTER(p[i] = v);                                                                                      \
271     }                                                                                                        \
272     PetscFunctionReturn(0);                                                                                  \
273   }
274 
275 /* Pack, Unpack/Fetch ops */
276 #define DEF_Pack(Type,BS,EQ)                                                                   \
277   DEF_PackFunc(Type,BS,EQ)                                                                     \
278   DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void)                                   \
279   DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void)                                              \
280   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) {                           \
281     link->Pack            = CPPJoin4(Pack,           Type,BS,EQ);                              \
282     link->UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);                              \
283     link->FetchAndInsert  = CPPJoin4(FetchAndInsert, Type,BS,EQ);                              \
284     link->unitbytes       = sizeof(Type);                                                      \
285   }
286 
287 /* Add, Mult ops */
288 #define DEF_Add(Type,BS,EQ)                                                                    \
289   DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void)             \
290   DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void)             \
291   DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void)                        \
292   DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void)                        \
293   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) {                            \
294     link->UnpackAndAdd    = CPPJoin4(UnpackAndAdd,   Type,BS,EQ);                              \
295     link->UnpackAndMult   = CPPJoin4(UnpackAndMult,  Type,BS,EQ);                              \
296     link->FetchAndAdd     = CPPJoin4(FetchAndAdd,    Type,BS,EQ);                              \
297     link->FetchAndMult    = CPPJoin4(FetchAndMult,   Type,BS,EQ);                              \
298   }
299 
300 /* Max, Min ops */
301 #define DEF_Cmp(Type,BS,EQ)                                                                    \
302   DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void)     \
303   DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void)     \
304   DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void)                \
305   DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void)                \
306   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) {                        \
307     link->UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);                              \
308     link->UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);                              \
309     link->FetchAndMax     = CPPJoin4(FetchAndMax ,   Type,BS,EQ);                              \
310     link->FetchAndMin     = CPPJoin4(FetchAndMin ,   Type,BS,EQ);                              \
311   }
312 
313 /* Logical ops.
314   The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid
315   the compilation warning "empty macro arguments are undefined in ISO C90"
316  */
317 #define DEF_Log(Type,BS,EQ)                                                                    \
318   DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void)            \
319   DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void)            \
320   DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP,  IGNORE,const Type,const void)            \
321   DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void)                       \
322   DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void)                       \
323   DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP,  EXECUTE,Type,void)                       \
324   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) {                        \
325     link->UnpackAndLAND   = CPPJoin4(UnpackAndLAND,Type,BS,EQ);                                \
326     link->UnpackAndLOR    = CPPJoin4(UnpackAndLOR, Type,BS,EQ);                                \
327     link->UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR,Type,BS,EQ);                                \
328     link->FetchAndLAND    = CPPJoin4(FetchAndLAND, Type,BS,EQ);                                \
329     link->FetchAndLOR     = CPPJoin4(FetchAndLOR,  Type,BS,EQ);                                \
330     link->FetchAndLXOR    = CPPJoin4(FetchAndLXOR, Type,BS,EQ);                                \
331   }
332 
333 /* Bitwise ops */
334 #define DEF_Bit(Type,BS,EQ)                                                                    \
335   DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void)             \
336   DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void)             \
337   DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void)             \
338   DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void)                        \
339   DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void)                        \
340   DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void)                        \
341   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) {                        \
342     link->UnpackAndBAND   = CPPJoin4(UnpackAndBAND,Type,BS,EQ);                                \
343     link->UnpackAndBOR    = CPPJoin4(UnpackAndBOR, Type,BS,EQ);                                \
344     link->UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR,Type,BS,EQ);                                \
345     link->FetchAndBAND    = CPPJoin4(FetchAndBAND, Type,BS,EQ);                                \
346     link->FetchAndBOR     = CPPJoin4(FetchAndBOR,  Type,BS,EQ);                                \
347     link->FetchAndBXOR    = CPPJoin4(FetchAndBXOR, Type,BS,EQ);                                \
348   }
349 
350 /* Maxloc, Minloc */
351 #define DEF_Xloc(Type1,Type2)                                                                  \
352   DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void)    \
353   DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void)    \
354   DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void)               \
355   DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void)               \
356   static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) {                          \
357     link->UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1);               \
358     link->UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1);               \
359     link->FetchAndMaxloc  = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1);               \
360     link->FetchAndMinloc  = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1);               \
361   }
362 
363 #define DEF_IntegerType(Type,BS,EQ)                                                            \
364   DEF_Pack(Type,BS,EQ)                                                                         \
365   DEF_Add(Type,BS,EQ)                                                                          \
366   DEF_Cmp(Type,BS,EQ)                                                                          \
367   DEF_Log(Type,BS,EQ)                                                                          \
368   DEF_Bit(Type,BS,EQ)                                                                          \
369   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) {                    \
370     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
371     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
372     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
373     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                               \
374     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                               \
375   }
376 
377 #define DEF_RealType(Type,BS,EQ)                                                               \
378   DEF_Pack(Type,BS,EQ)                                                                         \
379   DEF_Add(Type,BS,EQ)                                                                          \
380   DEF_Cmp(Type,BS,EQ)                                                                          \
381   DEF_Log(Type,BS,EQ)                                                                          \
382   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) {                       \
383     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
384     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
385     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
386     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                               \
387   }
388 
389 #if defined(PETSC_HAVE_COMPLEX)
390 #define DEF_ComplexType(Type,BS,EQ)                                                            \
391   DEF_Pack(Type,BS,EQ)                                                                         \
392   DEF_Add(Type,BS,EQ)                                                                          \
393   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) {                    \
394     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
395     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
396   }
397 #endif
398 
399 #define DEF_DumbType(Type,BS,EQ)                                                               \
400   DEF_Pack(Type,BS,EQ)                                                                         \
401   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) {                       \
402     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
403   }
404 
405 /* Maxloc, Minloc */
406 #define DEF_PairType(Type1,Type2)                                                              \
407   typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2);                                    \
408   DEF_Pack(PairType(Type1,Type2),1,1)                                                          \
409   DEF_Xloc(Type1,Type2)                                                                        \
410   static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) {                      \
411     CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link);                                   \
412     CPPJoin3(PackInit_Xloc,Type1,Type2)(link);                                                 \
413   }
414 
415 DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
416 DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
417 DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
418 DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
419 DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
420 DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
421 DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
422 DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
423 
424 #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
425 DEF_IntegerType(int,1,1)
426 DEF_IntegerType(int,2,1)
427 DEF_IntegerType(int,4,1)
428 DEF_IntegerType(int,8,1)
429 DEF_IntegerType(int,1,0)
430 DEF_IntegerType(int,2,0)
431 DEF_IntegerType(int,4,0)
432 DEF_IntegerType(int,8,0)
433 #endif
434 
435 /* The typedefs are used to get a typename without space that CPPJoin can handle */
436 typedef signed char SignedChar;
437 DEF_IntegerType(SignedChar,1,1)
438 DEF_IntegerType(SignedChar,2,1)
439 DEF_IntegerType(SignedChar,4,1)
440 DEF_IntegerType(SignedChar,8,1)
441 DEF_IntegerType(SignedChar,1,0)
442 DEF_IntegerType(SignedChar,2,0)
443 DEF_IntegerType(SignedChar,4,0)
444 DEF_IntegerType(SignedChar,8,0)
445 
446 typedef unsigned char UnsignedChar;
447 DEF_IntegerType(UnsignedChar,1,1)
448 DEF_IntegerType(UnsignedChar,2,1)
449 DEF_IntegerType(UnsignedChar,4,1)
450 DEF_IntegerType(UnsignedChar,8,1)
451 DEF_IntegerType(UnsignedChar,1,0)
452 DEF_IntegerType(UnsignedChar,2,0)
453 DEF_IntegerType(UnsignedChar,4,0)
454 DEF_IntegerType(UnsignedChar,8,0)
455 
456 DEF_RealType(PetscReal,1,1)
457 DEF_RealType(PetscReal,2,1)
458 DEF_RealType(PetscReal,4,1)
459 DEF_RealType(PetscReal,8,1)
460 DEF_RealType(PetscReal,1,0)
461 DEF_RealType(PetscReal,2,0)
462 DEF_RealType(PetscReal,4,0)
463 DEF_RealType(PetscReal,8,0)
464 
465 #if defined(PETSC_HAVE_COMPLEX)
466 DEF_ComplexType(PetscComplex,1,1)
467 DEF_ComplexType(PetscComplex,2,1)
468 DEF_ComplexType(PetscComplex,4,1)
469 DEF_ComplexType(PetscComplex,8,1)
470 DEF_ComplexType(PetscComplex,1,0)
471 DEF_ComplexType(PetscComplex,2,0)
472 DEF_ComplexType(PetscComplex,4,0)
473 DEF_ComplexType(PetscComplex,8,0)
474 #endif
475 
476 DEF_PairType(int,int)
477 DEF_PairType(PetscInt,PetscInt)
478 
479 /* If we don't know the basic type, we treat it as a stream of chars or ints */
480 DEF_DumbType(char,1,1)
481 DEF_DumbType(char,2,1)
482 DEF_DumbType(char,4,1)
483 DEF_DumbType(char,1,0)
484 DEF_DumbType(char,2,0)
485 DEF_DumbType(char,4,0)
486 
487 typedef int DumbInt; /* To differentiate with regular int used above */
488 DEF_DumbType(DumbInt,1,1)
489 DEF_DumbType(DumbInt,2,1)
490 DEF_DumbType(DumbInt,4,1)
491 DEF_DumbType(DumbInt,8,1)
492 DEF_DumbType(DumbInt,1,0)
493 DEF_DumbType(DumbInt,2,0)
494 DEF_DumbType(DumbInt,4,0)
495 DEF_DumbType(DumbInt,8,0)
496 
497 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
498 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
499 {
500   int ierr;
501   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
502   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
503   return MPI_SUCCESS;
504 }
505 #endif
506 
507 PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink)
508 {
509   PetscErrorCode    ierr;
510   PetscSFPack       link,*p;
511   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
512 
513   PetscFunctionBegin;
514   /* Look for types in cache */
515   for (p=&bas->inuse; (link=*p); p=&link->next) {
516     PetscBool match;
517     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
518     if (match && (rkey == link->rkey) && (lkey == link->lkey)) {
519       switch (cmode) {
520       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
521       case PETSC_USE_POINTER: break;
522       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
523       }
524       *mylink = link;
525       PetscFunctionReturn(0);
526     }
527   }
528   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
529   PetscFunctionReturn(0);
530 }
531 
532 PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link)
533 {
534   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
535 
536   PetscFunctionBegin;
537   (*link)->rkey = NULL;
538   (*link)->lkey = NULL;
539   (*link)->next = bas->avail;
540   bas->avail    = *link;
541   *link         = NULL;
542   PetscFunctionReturn(0);
543 }
544 
545 /* Error out on unsupported overlapped communications */
546 PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey)
547 {
548   PetscErrorCode    ierr;
549   PetscSFPack       link,*p;
550   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
551   PetscBool         match;
552 
553   PetscFunctionBegin;
554   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
555      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
556   */
557   if (rkey || lkey) {
558     for (p=&bas->inuse; (link=*p); p=&link->next) {
559       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
560       if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey);
561     }
562   }
563   PetscFunctionReturn(0);
564 }
565 
566 PetscErrorCode PetscSFPackSetupType(PetscSFPack link,MPI_Datatype unit)
567 {
568   PetscErrorCode ierr;
569   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
570   PetscBool      is2Int,is2PetscInt;
571   PetscMPIInt    ni,na,nd,combiner;
572 #if defined(PETSC_HAVE_COMPLEX)
573   PetscInt       nPetscComplex=0;
574 #endif
575 
576   PetscFunctionBegin;
577   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
578   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
579   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
580   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
581   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
582   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
583 #if defined(PETSC_HAVE_COMPLEX)
584   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
585 #endif
586   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
587   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
588   /* TODO: shaell we also handle Fortran MPI_2REAL? */
589   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
590   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE;
591   link->bs = 1; /* default */
592 
593   if (nPetscReal) {
594     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
595     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
596     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
597     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
598     link->bs         = nPetscReal;
599     link->unitbytes *= nPetscReal;
600     link->basicunit  = MPIU_REAL;
601   } else if (nPetscInt) {
602     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
603     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
604     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
605     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
606     link->bs         = nPetscInt;
607     link->unitbytes *= nPetscInt;
608     link->basicunit  = MPIU_INT;
609 #if defined(PETSC_USE_64BIT_INDICES)
610   } else if (nInt) {
611     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
612     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
613     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
614     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
615     link->bs         = nInt;
616     link->unitbytes *= nInt;
617     link->basicunit  = MPI_INT;
618 #endif
619   } else if (nSignedChar) {
620     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
621     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
622     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
623     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
624     link->bs         = nSignedChar;
625     link->unitbytes *= nSignedChar;
626     link->basicunit  = MPI_SIGNED_CHAR;
627   }  else if (nUnsignedChar) {
628     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
629     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
630     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
631     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
632     link->bs         = nUnsignedChar;
633     link->unitbytes *= nUnsignedChar;
634     link->basicunit  = MPI_UNSIGNED_CHAR;
635 #if defined(PETSC_HAVE_COMPLEX)
636   } else if (nPetscComplex) {
637     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
638     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
639     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
640     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
641     link->bs         = nPetscComplex;
642     link->unitbytes *= nPetscComplex;
643     link->basicunit  = MPIU_COMPLEX;
644 #endif
645   } else if (is2Int) {
646     PackInit_PairType_int_int(link);
647     link->bs         = 1;
648     link->basicunit  = MPI_2INT;
649   } else if (is2PetscInt) {
650     PackInit_PairType_PetscInt_PetscInt(link);
651     link->bs         = 1;
652     link->basicunit  = MPIU_2INT;
653   } else {
654     MPI_Aint lb,nbyte;
655     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
656     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
657     if (nbyte % sizeof(DumbInt)) { /* If the type size is not multiple of int */
658       if      (nbyte == 4) {PackInit_DumbType_char_4_1(link); link->bs = nbyte/4;}  else if (nbyte%4 == 0) {PackInit_DumbType_char_4_0(link); link->bs = nbyte/4;} /* Note the basic type is char[4] */
659       else if (nbyte == 2) {PackInit_DumbType_char_2_1(link); link->bs = nbyte/2;}  else if (nbyte%2 == 0) {PackInit_DumbType_char_2_0(link); link->bs = nbyte/2;}
660       else if (nbyte == 1) {PackInit_DumbType_char_1_1(link); link->bs = nbyte/1;}  else if (nbyte%1 == 0) {PackInit_DumbType_char_1_0(link); link->bs = nbyte/1;}
661       link->unitbytes = nbyte;
662       link->basicunit = MPI_BYTE;
663     } else {
664       nInt = nbyte / sizeof(DumbInt);
665       if      (nInt == 8)  {PackInit_DumbType_DumbInt_8_1(link); link->bs = nInt/8;} else if (nInt%8 == 0) {PackInit_DumbType_DumbInt_8_0(link); link->bs = nInt/8;}
666       else if (nInt == 4)  {PackInit_DumbType_DumbInt_4_1(link); link->bs = nInt/4;} else if (nInt%4 == 0) {PackInit_DumbType_DumbInt_4_0(link); link->bs = nInt/4;}
667       else if (nInt == 2)  {PackInit_DumbType_DumbInt_2_1(link); link->bs = nInt/2;} else if (nInt%2 == 0) {PackInit_DumbType_DumbInt_2_0(link); link->bs = nInt/2;}
668       else if (nInt == 1)  {PackInit_DumbType_DumbInt_1_1(link); link->bs = nInt/1;} else if (nInt%1 == 0) {PackInit_DumbType_DumbInt_1_0(link); link->bs = nInt/1;}
669       link->unitbytes = nbyte;
670       link->basicunit = MPI_INT;
671     }
672   }
673 
674   if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */
675   else {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
676   PetscFunctionReturn(0);
677 }
678 
679 PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*))
680 {
681   PetscFunctionBegin;
682   *UnpackAndOp = NULL;
683   if (op == MPIU_REPLACE) *UnpackAndOp = link->UnpackAndInsert;
684   else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->UnpackAndAdd;
685   else if (op == MPI_PROD) *UnpackAndOp = link->UnpackAndMult;
686   else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->UnpackAndMax;
687   else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->UnpackAndMin;
688   else if (op == MPI_LAND)   *UnpackAndOp = link->UnpackAndLAND;
689   else if (op == MPI_BAND)   *UnpackAndOp = link->UnpackAndBAND;
690   else if (op == MPI_LOR)    *UnpackAndOp = link->UnpackAndLOR;
691   else if (op == MPI_BOR)    *UnpackAndOp = link->UnpackAndBOR;
692   else if (op == MPI_LXOR)   *UnpackAndOp = link->UnpackAndLXOR;
693   else if (op == MPI_BXOR)   *UnpackAndOp = link->UnpackAndBXOR;
694   else if (op == MPI_MAXLOC) *UnpackAndOp = link->UnpackAndMaxloc;
695   else if (op == MPI_MINLOC) *UnpackAndOp = link->UnpackAndMinloc;
696   else *UnpackAndOp = NULL;
697   PetscFunctionReturn(0);
698 }
699 
700 PetscErrorCode PetscSFPackGetFetchAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*))
701 {
702   PetscFunctionBegin;
703   *FetchAndOp = NULL;
704   if (op == MPIU_REPLACE) *FetchAndOp = link->FetchAndInsert;
705   else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->FetchAndAdd;
706   else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->FetchAndMax;
707   else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->FetchAndMin;
708   else if (op == MPI_MAXLOC) *FetchAndOp = link->FetchAndMaxloc;
709   else if (op == MPI_MINLOC) *FetchAndOp = link->FetchAndMinloc;
710   else if (op == MPI_PROD)   *FetchAndOp = link->FetchAndMult;
711   else if (op == MPI_LAND)   *FetchAndOp = link->FetchAndLAND;
712   else if (op == MPI_BAND)   *FetchAndOp = link->FetchAndBAND;
713   else if (op == MPI_LOR)    *FetchAndOp = link->FetchAndLOR;
714   else if (op == MPI_BOR)    *FetchAndOp = link->FetchAndBOR;
715   else if (op == MPI_LXOR)   *FetchAndOp = link->FetchAndLXOR;
716   else if (op == MPI_BXOR)   *FetchAndOp = link->FetchAndBXOR;
717   else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for MPI_Op");
718   PetscFunctionReturn(0);
719 }
720 
721 /*
722   Setup pack/unpack optimization plans based on indice patterns available
723 
724    Input Parameters:
725   +  n       - Number of target ranks
726   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] is not necessarily 0.
727   -  idx     - [*]   Array storing indices
728 
729    Output Parameters:
730   +  opt    - the optimization
731 */
732 PetscErrorCode PetscSFPackSetupOptimization(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
733 {
734   PetscErrorCode ierr;
735   PetscInt       i,j,k,n_copies,tot_copies=0,step;
736   PetscBool      strided,optimized=PETSC_FALSE;
737   PetscSFPackOpt opt;
738 
739   PetscFunctionBegin;
740   if (!n) {
741     *out = NULL;
742     PetscFunctionReturn(0);
743   }
744 
745   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
746   ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr);
747   ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr);
748   if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/
749 
750   opt->n = n;
751 
752   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */
753   for (i=0; i<n; i++) { /* for each target processor */
754     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
755     n_copies = 1;
756     for (j=offset[i]; j<offset[i+1]-1; j++) {
757       if (idx[j]+1 != idx[j+1]) n_copies++;
758     }
759     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
760        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
761      */
762     if ((offset[i+1]-offset[i])/n_copies >= 32) {
763       opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
764       optimized    = PETSC_TRUE;
765       tot_copies  += n_copies;
766     }
767   }
768 
769   /* Setup memcpy plan for each contiguous piece */
770   k    = 0; /* k-th copy */
771   ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
772   for (i=0; i<n; i++) { /* for each target processor */
773     if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
774       n_copies           = 1;
775       opt->copy_start[k] = offset[i] - offset[0];
776       for (j=offset[i]; j<offset[i+1]-1; j++) {
777         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
778           n_copies++;
779           opt->copy_start[k+1] = j-offset[0]+1;
780           opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
781           k++;
782         }
783       }
784       /* Set copy length of the last copy for this target */
785       opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
786       k++;
787     }
788     /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
789     opt->copy_offset[i+1] = k;
790   }
791 
792   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
793   for (i=0; i<n; i++) { /* for each remote */
794     if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
795       strided = PETSC_TRUE;
796       step    = idx[offset[i]+1] - idx[offset[i]];
797       for (j=offset[i]; j<offset[i+1]-1; j++) {
798         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
799       }
800       if (strided) {
801         opt->type[i]         = PETSCSF_PACKOPT_STRIDE;
802         opt->stride_step[i]  = step;
803         opt->stride_n[i]     = offset[i+1] - offset[i];
804         optimized            = PETSC_TRUE;
805       }
806     }
807   }
808   /* If no rank gets optimized, free arrays to save memory */
809   if (!optimized) {
810     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
811     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
812     ierr = PetscFree(opt);CHKERRQ(ierr);
813     *out = NULL;
814   } else *out = opt;
815   PetscFunctionReturn(0);
816 }
817 
818 PetscErrorCode PetscSFPackDestoryOptimization(PetscSFPackOpt *out)
819 {
820   PetscErrorCode ierr;
821   PetscSFPackOpt opt = *out;
822 
823   PetscFunctionBegin;
824   if (opt) {
825     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
826     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
827     ierr = PetscFree(opt);CHKERRQ(ierr);
828     *out = NULL;
829   }
830   PetscFunctionReturn(0);
831 }
832