xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 3ca0882ebe6e6bc3c96f853b3244799ed8a662a6)
1 
2 #include <../src/vec/is/sf/impls/basic/sfpack.h>
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 
5 /*
6  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
7  * therefore we pack data types manually. This file defines packing routines for the standard data types.
8  */
9 
10 #define CPPJoin2_exp(a,b)     a ## b
11 #define CPPJoin2(a,b)         CPPJoin2_exp(a,b)
12 #define CPPJoin3_exp_(a,b,c)  a ## b ## _ ## c
13 #define CPPJoin3_(a,b,c)      CPPJoin3_exp_(a,b,c)
14 
15 #define EXECUTE(statement)    statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */
16 #define IGNORE(statement)     do {} while(0)
17 
18 #define BINARY_OP(r,s,op,t)   do {(r) = (s) op (t);  } while(0)  /* binary ops in the middle such as +, *, && etc. */
19 #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0)  /* ops like a function, such as PetscMax, PetscMin */
20 #define LXOR_OP(r,s,op,t)     do {(r) = (!s) != (!t);} while(0)  /* logical exclusive OR */
21 #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0)
22 
23 #define BlockType(type,count) CPPJoin3_(_blocktype_,type,count) /* typename for struct {type v[count];} */
24 #define PairType(type1,type2) CPPJoin3_(_pairtype_,type1,type2) /* typename for struct {type1 a; type2 b;} */
25 
26 /* DEF_PackFunc - macro defining a Pack routine
27 
28    Arguments of the macro:
29    +type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30    -BS        Block size for vectorization. It is a factor of bs.
31 
32    Arguments of the Pack routine:
33    +n         Number of entries to pack. Each entry is of type 'unit'. Here the unit is the arg used in calls like PetscSFBcastBegin(sf,unit,..).
34               If idx in not NULL, then n also indicates the number of indices in idx[]
35    .bs        Number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int
36    .idx       Indices of entries. NULL means contiguous indices [0,n)
37    .r         Do packing for the r-th target processor
38    .opt       Pack optimization plans. NULL means no plan.
39    .unpacked  Address of the unpacked data
40    -packed    Address of the packed data
41  */
42 #define DEF_PackFunc(type,BS) \
43   static PetscErrorCode CPPJoin3_(Pack_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,const void *unpacked,void *packed) { \
44     PetscErrorCode ierr;                                                                                   \
45     const type     *u = (const type*)unpacked,*u2;                                                         \
46     type           *p = (type*)packed;                                                                     \
47     PetscInt       i,j,k,l,step;                                                                           \
48     PetscFunctionBegin;                                                                                    \
49     if (!idx) {  /* idx[] is contiguous */                                                                 \
50       ierr = PetscArraycpy(p,u,bs*n);CHKERRQ(ierr);                                             \
51     } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/                                   \
52       for (i=0; i<n; i++)                                                                                  \
53         for (j=0; j<bs; j+=BS)                                                                             \
54           for (k=j; k<j+BS; k++)                                                                           \
55             p[i*bs+k] = u[idx[i]*bs+k];                                                                    \
56     } else { /* idx[] is optimized*/                                                                       \
57       if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */             \
58         for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) {                                        \
59           l    = opt->copy_length[i]*bs; /* length in types */                                             \
60           u2   = u + opt->copy_start[i]*bs;                                                                \
61           ierr = PetscArraycpy(p,u2,l);CHKERRQ(ierr);                                           \
62           p   += l;                                                                                        \
63         }                                                                                                  \
64       } else { /* idx[] is strided */                                                                      \
65         u   += opt->stride_first[r]*bs;                                                                    \
66         step = opt->stride_step[r];                                                                        \
67         for (i=0; i<opt->stride_n[r]; i++)                                                                 \
68           for (j=0; j<bs; j++)                                                                             \
69             p[i*bs+j] = u[i*step*bs+j];                                                                    \
70       }                                                                                                    \
71     }                                                                                                      \
72     PetscFunctionReturn(0);                                                                                \
73   }
74 
75 /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine
76 
77    Arguments:
78   +action     Unpack or Fetch
79   .type       Type of the data
80   .BS         Block size for vectorization
81   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
82   .ctype      Type with or without the const qualifier, i.e., const type or type
83   .cvoid      void with or without the const qualifier, i.e., const void or void
84 
85   Notes:
86    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
87    The two arguments ctype and cvoid are used (instead of one constness argument), because we want to
88    get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument,
89    sometimes we input 'const', sometimes we have to input empty.
90  */
91 #define DEF_Action(action,type,BS,FILTER,ctype,cvoid)               \
92   static PetscErrorCode CPPJoin3_(action##AndInsert_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \
93     PetscErrorCode ierr;                                                                                   \
94     type           *u = (type*)unpacked,*u2;                                                               \
95     ctype          *p = (ctype*)packed;                                                                    \
96     PetscInt       i,j,k,l,step;                                                                           \
97     PetscFunctionBegin;                                                                                    \
98     if (!idx) {  /* idx[] is contiguous */                                                                 \
99       FILTER(type *v);                                                                                     \
100       FILTER(ierr = PetscMalloc1(bs*n,&v);CHKERRQ(ierr));                                                  \
101       FILTER(ierr = PetscArraycpy(v,u,bs*n);CHKERRQ(ierr));                                     \
102              ierr = PetscArraycpy(u,p,bs*n);CHKERRQ(ierr);                                      \
103       FILTER(ierr = PetscArraycpy(p,v,bs*n);CHKERRQ(ierr));                                     \
104       FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                           \
105     } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/                                   \
106       for (i=0; i<n; i++) {                                                                                \
107         for (j=0; j<bs; j+=BS) {                                                                           \
108           for (k=j; k<j+BS; k++) {                                                                         \
109             FILTER(type t = u[idx[i]*bs+k]);                                                               \
110             u[idx[i]*bs+k] = p[i*bs+k];                                                                    \
111             FILTER(p[i*bs+k] = t);                                                                         \
112           }                                                                                                \
113         }                                                                                                  \
114       }                                                                                                    \
115     } else { /* idx[] is optimized*/                                                                       \
116       if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */             \
117         FILTER(type *v);                                                                                   \
118         FILTER(ierr = PetscMalloc1(bs*n,&v);CHKERRQ(ierr)); /* maximal buffer  */                          \
119         for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
120           l  = opt->copy_length[i]*bs; /* length in types */                                               \
121           u2 = u + opt->copy_start[i]*bs;                                                                  \
122           FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr));                                   \
123                  ierr = PetscArraycpy(u2,p,l);CHKERRQ(ierr);                                    \
124           FILTER(ierr = PetscArraycpy(p,v,l);CHKERRQ(ierr));                                    \
125           p += l;                                                                                          \
126         }                                                                                                  \
127         FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                         \
128       } else { /* idx[] is strided */                                                                      \
129         u   += opt->stride_first[r]*bs;                                                                    \
130         step = opt->stride_step[r];                                                                        \
131         for (i=0; i<opt->stride_n[r]; i++)                                                                 \
132           for (j=0; j<bs; j++) {                                                                           \
133             FILTER(type t = u[i*step*bs+j]);                                                               \
134             u[i*step*bs+j] = p[i*bs+j];                                                                    \
135             FILTER(p[i*bs+j] = t);                                                                         \
136           }                                                                                                \
137       }                                                                                                    \
138     }                                                                                                      \
139     PetscFunctionReturn(0);                                                                                \
140   }
141 
142 /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc
143 
144    Arguments:
145   +action     Unpack or Fetch
146   .opname     Name of the Op, such as Add, Mult, LAND, etc.
147   .type       Type of the data
148   .BS         Block size for vectorization
149   .op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
150   .APPLY      Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP
151   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
152   .ctype      Type with or without the const qualifier, i.e., const type or type
153   -cvoid      void with or without the const qualifier, i.e., const void or void
154  */
155 #define DEF_ActionAndOp(action,opname,type,BS,op,APPLY,FILTER,ctype,cvoid) \
156   static PetscErrorCode CPPJoin3_(action##And##opname##_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \
157     type     *u = (type*)unpacked,*u2,t;                                                                   \
158     ctype    *p = (ctype*)packed;                                                                          \
159     PetscInt i,j,k,l,step;                                                                                 \
160     PetscFunctionBegin;                                                                                    \
161     if (!idx) {  /* idx[] is contiguous */                                                                 \
162       for (i=0; i<n*bs; i++) {                                                                             \
163         t = u[i];                                                                                          \
164         APPLY(u[i],t,op,p[i]);                                                                             \
165         FILTER(p[i] = t);                                                                                  \
166       }                                                                                                    \
167     } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/                                   \
168       for (i=0; i<n; i++) {                                                                                \
169         for (j=0; j<bs; j+=BS) {                                                                           \
170           for (k=j; k<j+BS; k++) {                                                                         \
171             t = u[idx[i]*bs+k];                                                                            \
172             APPLY(u[idx[i]*bs+k],t,op,p[i*bs+k]);                                                          \
173             FILTER(p[i*bs+k] = t);                                                                         \
174           }                                                                                                \
175         }                                                                                                  \
176       }                                                                                                    \
177     } else { /* idx[] is optimized*/                                                                       \
178       if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */             \
179         for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
180           l  = opt->copy_length[i]*bs; /* length in types */                                               \
181           u2 = u + opt->copy_start[i]*bs;                                                                  \
182           for (j=0; j<l; j++) {                                                                            \
183             t = u2[j];                                                                                     \
184             APPLY(u2[j],t,op,p[j]);                                                                        \
185             FILTER(p[j] = t);                                                                              \
186           }                                                                                                \
187           p += l;                                                                                          \
188         }                                                                                                  \
189       } else { /* idx[] is strided */                                                                      \
190         u   += opt->stride_first[r]*bs;                                                                    \
191         step = opt->stride_step[r];                                                                        \
192         for (i=0; i<opt->stride_n[r]; i++)                                                                 \
193           for (j=0; j<bs; j++) {                                                                           \
194             t = u[i*step*bs+j];                                                                            \
195             APPLY(u[i*step*bs+j],t,op,p[i*bs+j]);                                                          \
196             FILTER(p[i*bs+j] = t);                                                                         \
197           }                                                                                                \
198       }                                                                                                    \
199     }                                                                                                      \
200     PetscFunctionReturn(0);                                                                                \
201   }
202 
203 /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine
204 
205    Arguments:
206   +Action     Unpack or Fetch
207   .locname    Max or Min
208   .type1      Type of the first data in a pair type
209   .type2      Type of the second data in a pair type, usually PetscMPIInt for MPI ranks.
210   .op         > or <
211   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
212   .ctype      Type with or without the const qualifier, i.e., const PairType(type1,type2) or PairType(type1,type2)
213   -cvoid      void with or without the const qualifier, i.e., const void or void
214  */
215 #define DEF_ActionAndXloc(action,locname,type1,type2,op,FILTER,ctype,cvoid) \
216   static PetscErrorCode CPPJoin3_(action##And##locname##loc_,PairType(type1,type2),1)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \
217     PairType(type1,type2) *u = (PairType(type1,type2)*)unpacked;                                           \
218     ctype                 *p = (ctype*)packed;                                                             \
219     PetscInt              i;                                                                               \
220     for (i=0; i<n; i++) {                                                                                  \
221       PetscInt j = idx[i];                                                                                 \
222       FILTER(PairType(type1,type2) v = u[j]);                                                              \
223       if (p[i].a op u[j].a) {                                                                              \
224         u[j] = p[i];                                                                                       \
225       } else if (p[i].a == u[j].a) {                                                                       \
226         u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */                               \
227       }                                                                                                    \
228       FILTER(p[i] = v);                                                                                    \
229     }                                                                                                      \
230     PetscFunctionReturn(0);                                                                                \
231   }
232 
233 
234 /* Pack/unpack/fetch ops for all types */
235 #define DEF_PackNoInit(type,BS)                                                         \
236   DEF_PackFunc(type,BS)                                                                 \
237   DEF_Action(Unpack,type,BS,IGNORE,const type,const void)                               \
238   DEF_Action(Fetch, type,BS,EXECUTE,type,void)                                          \
239 
240 
241 /* Extra addition ops for types supporting them */
242 #define DEF_PackAddNoInit(type,BS)                                                      \
243   DEF_PackNoInit(type,BS)                                                               \
244   DEF_ActionAndOp(Unpack,Add, type,BS,+,BINARY_OP,IGNORE,const type,const void)         \
245   DEF_ActionAndOp(Unpack,Mult,type,BS,*,BINARY_OP,IGNORE,const type,const void)         \
246   DEF_ActionAndOp(Fetch, Add, type,BS,+,BINARY_OP,EXECUTE,type,void)                    \
247   DEF_ActionAndOp(Fetch, Mult,type,BS,*,BINARY_OP,EXECUTE,type,void)
248 
249 /* Basic types */
250 #define DEF_Pack(type,BS)                                                               \
251   DEF_PackAddNoInit(type,BS)                                                            \
252   static void CPPJoin3_(PackInit_,type,BS)(PetscSFPack link) {                          \
253     link->Pack            = CPPJoin3_(Pack_,           type,BS);                        \
254     link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,type,BS);                        \
255     link->UnpackAndAdd    = CPPJoin3_(UnpackAndAdd_,   type,BS);                        \
256     link->UnpackAndMult   = CPPJoin3_(UnpackAndMult_,  type,BS);                        \
257     link->FetchAndInsert  = CPPJoin3_(FetchAndInsert_, type,BS);                        \
258     link->FetchAndAdd     = CPPJoin3_(FetchAndAdd_,    type,BS);                        \
259     link->FetchAndMult    = CPPJoin3_(FetchAndMult_,   type,BS);                        \
260     link->unitbytes       = sizeof(type);                                               \
261   }
262 
263 /* Comparable types */
264 #define DEF_PackCmp(type)                                                               \
265   DEF_PackAddNoInit(type,1)                                                             \
266   DEF_ActionAndOp(Unpack,Max,type,1,PetscMax,FUNCTION_OP,IGNORE,const type,const void)  \
267   DEF_ActionAndOp(Unpack,Min,type,1,PetscMin,FUNCTION_OP,IGNORE,const type,const void)  \
268   DEF_ActionAndOp(Fetch, Max,type,1,PetscMax,FUNCTION_OP,EXECUTE,type,void)             \
269   DEF_ActionAndOp(Fetch, Min,type,1,PetscMin,FUNCTION_OP,EXECUTE,type,void)             \
270   static void CPPJoin2(PackInit_,type)(PetscSFPack link) {                              \
271     link->Pack            = CPPJoin3_(Pack_,           type,1);                         \
272     link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,type,1);                         \
273     link->UnpackAndAdd    = CPPJoin3_(UnpackAndAdd_,   type,1);                         \
274     link->UnpackAndMult   = CPPJoin3_(UnpackAndMult_,  type,1);                         \
275     link->UnpackAndMax    = CPPJoin3_(UnpackAndMax_,   type,1);                         \
276     link->UnpackAndMin    = CPPJoin3_(UnpackAndMin_,   type,1);                         \
277     link->FetchAndInsert  = CPPJoin3_(FetchAndInsert_, type,1);                         \
278     link->FetchAndAdd     = CPPJoin3_(FetchAndAdd_ ,   type,1);                         \
279     link->FetchAndMult    = CPPJoin3_(FetchAndMult_,   type,1);                         \
280     link->FetchAndMax     = CPPJoin3_(FetchAndMax_ ,   type,1);                         \
281     link->FetchAndMin     = CPPJoin3_(FetchAndMin_ ,   type,1);                         \
282     link->unitbytes       = sizeof(type);                                               \
283   }
284 
285 /* Logical Types */
286 /* The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid
287    the compilation warning "empty macro arguments are undefined in ISO C90"
288  */
289 #define DEF_PackLog(type)                                                               \
290   DEF_ActionAndOp(Unpack,LAND,type,1,&&,BINARY_OP,IGNORE,const type,const void)         \
291   DEF_ActionAndOp(Unpack,LOR, type,1,||,BINARY_OP,IGNORE,const type,const void)         \
292   DEF_ActionAndOp(Unpack,LXOR,type,1,&, LXOR_OP,  IGNORE,const type,const void)         \
293   DEF_ActionAndOp(Fetch, LAND,type,1,&&,BINARY_OP,EXECUTE,type,void)                    \
294   DEF_ActionAndOp(Fetch, LOR, type,1,||,BINARY_OP,EXECUTE,type,void)                    \
295   DEF_ActionAndOp(Fetch, LXOR,type,1,&, LXOR_OP,  EXECUTE,type,void)                    \
296   static void CPPJoin2(PackInit_Logical_,type)(PetscSFPack link) {                      \
297     link->UnpackAndLAND   = CPPJoin3_(UnpackAndLAND_,type,1);                           \
298     link->UnpackAndLOR    = CPPJoin3_(UnpackAndLOR_, type,1);                           \
299     link->UnpackAndLXOR   = CPPJoin3_(UnpackAndLXOR_,type,1);                           \
300     link->FetchAndLAND    = CPPJoin3_(FetchAndLAND_, type,1);                           \
301     link->FetchAndLOR     = CPPJoin3_(FetchAndLOR_,  type,1);                           \
302     link->FetchAndLXOR    = CPPJoin3_(FetchAndLXOR_, type,1);                           \
303   }
304 
305 
306 /* Bitwise Types */
307 #define DEF_PackBit(type)                                                               \
308   DEF_ActionAndOp(Unpack,BAND,type,1,&,BINARY_OP,IGNORE,const type,const void)          \
309   DEF_ActionAndOp(Unpack,BOR, type,1,|,BINARY_OP,IGNORE,const type,const void)          \
310   DEF_ActionAndOp(Unpack,BXOR,type,1,^,BINARY_OP,IGNORE,const type,const void)          \
311   DEF_ActionAndOp(Fetch, BAND,type,1,&,BINARY_OP,EXECUTE,type,void)                     \
312   DEF_ActionAndOp(Fetch, BOR, type,1,|,BINARY_OP,EXECUTE,type,void)                     \
313   DEF_ActionAndOp(Fetch, BXOR,type,1,^,BINARY_OP,EXECUTE,type,void)                     \
314   static void CPPJoin2(PackInit_Bitwise_,type)(PetscSFPack link) {                      \
315     link->UnpackAndBAND   = CPPJoin3_(UnpackAndBAND_,type,1);                           \
316     link->UnpackAndBOR    = CPPJoin3_(UnpackAndBOR_, type,1);                           \
317     link->UnpackAndBXOR   = CPPJoin3_(UnpackAndBXOR_,type,1);                           \
318     link->FetchAndBAND    = CPPJoin3_(FetchAndBAND_, type,1);                           \
319     link->FetchAndBOR     = CPPJoin3_(FetchAndBOR_,  type,1);                           \
320     link->FetchAndBXOR    = CPPJoin3_(FetchAndBXOR_, type,1);                           \
321   }
322 
323 
324 /* Pair types */
325 #define DEF_PackPair(type1,type2)                                                                                   \
326   typedef struct {type1 a; type2 b;} PairType(type1,type2);                                                         \
327   DEF_PackFunc(PairType(type1,type2),1)                                                                             \
328   DEF_Action(Unpack,PairType(type1,type2),1,IGNORE,const PairType(type1,type2),const void)                          \
329   DEF_Action(Fetch, PairType(type1,type2),1,EXECUTE,PairType(type1,type2),void)                                     \
330   DEF_ActionAndOp(Unpack,Add,PairType(type1,type2),1,+,PAIRTYPE_OP,IGNORE,const PairType(type1,type2),const void)   \
331   DEF_ActionAndOp(Fetch, Add,PairType(type1,type2),1,+,PAIRTYPE_OP,EXECUTE,PairType(type1,type2),void)              \
332   DEF_ActionAndXloc(Unpack,Max,type1,type2,>,IGNORE,const PairType(type1,type2),const void)                         \
333   DEF_ActionAndXloc(Unpack,Min,type1,type2,<,IGNORE,const PairType(type1,type2),const void)                         \
334   DEF_ActionAndXloc(Fetch, Max,type1,type2,>,EXECUTE,PairType(type1,type2),void)                                    \
335   DEF_ActionAndXloc(Fetch, Min,type1,type2,<,EXECUTE,PairType(type1,type2),void)                                    \
336   static void CPPJoin3_(PackInit_,type1,type2)(PetscSFPack link) {                                                  \
337     link->Pack            = CPPJoin3_(Pack_,           PairType(type1,type2),1);                                    \
338     link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,PairType(type1,type2),1);                                    \
339     link->UnpackAndAdd    = CPPJoin3_(UnpackAndAdd_,   PairType(type1,type2),1);                                    \
340     link->UnpackAndMaxloc = CPPJoin3_(UnpackAndMaxloc_,PairType(type1,type2),1);                                    \
341     link->UnpackAndMinloc = CPPJoin3_(UnpackAndMinloc_,PairType(type1,type2),1);                                    \
342     link->FetchAndInsert  = CPPJoin3_(FetchAndInsert_, PairType(type1,type2),1);                                    \
343     link->FetchAndAdd     = CPPJoin3_(FetchAndAdd_,    PairType(type1,type2),1);                                    \
344     link->FetchAndMaxloc  = CPPJoin3_(FetchAndMaxloc_, PairType(type1,type2),1);                                    \
345     link->FetchAndMinloc  = CPPJoin3_(FetchAndMinloc_, PairType(type1,type2),1);                                    \
346     link->unitbytes       = sizeof(PairType(type1,type2));                                                          \
347   }
348 
349 
350 /* Currently only dumb blocks of data */
351 #define DEF_Block(type,count)                                                           \
352   typedef struct {type v[count];} BlockType(type,count);                                \
353   DEF_PackNoInit(BlockType(type,count),1)                                               \
354   static void CPPJoin3_(PackInit_block_,type,count)(PetscSFPack link) {                 \
355     link->Pack            = CPPJoin3_(Pack_,           BlockType(type,count),1);        \
356     link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,BlockType(type,count),1);        \
357     link->FetchAndInsert  = CPPJoin3_(FetchAndInsert_, BlockType(type,count),1);        \
358     link->unitbytes       = sizeof(BlockType(type,count));                              \
359   }
360 
361 /* The typedef is used to get a typename without space that CPPJoin can handle */
362 typedef signed char SignedChar;
363 typedef unsigned char UnsignedChar;
364 
365 DEF_PackCmp(SignedChar)
366 DEF_PackBit(SignedChar)
367 DEF_PackLog(SignedChar)
368 DEF_PackCmp(UnsignedChar)
369 DEF_PackBit(UnsignedChar)
370 DEF_PackLog(UnsignedChar)
371 DEF_PackCmp(int)
372 DEF_PackBit(int)
373 DEF_PackLog(int)
374 DEF_PackCmp(PetscInt)
375 DEF_PackBit(PetscInt)
376 DEF_PackLog(PetscInt)
377 DEF_Pack(PetscInt,2)
378 DEF_Pack(PetscInt,3)
379 DEF_Pack(PetscInt,4)
380 DEF_Pack(PetscInt,5)
381 DEF_Pack(PetscInt,7)
382 DEF_PackCmp(PetscReal)
383 DEF_PackLog(PetscReal)
384 DEF_Pack(PetscReal,2)
385 DEF_Pack(PetscReal,3)
386 DEF_Pack(PetscReal,4)
387 DEF_Pack(PetscReal,5)
388 DEF_Pack(PetscReal,7)
389 #if defined(PETSC_HAVE_COMPLEX)
390 DEF_Pack(PetscComplex,1)
391 DEF_Pack(PetscComplex,2)
392 DEF_Pack(PetscComplex,3)
393 DEF_Pack(PetscComplex,4)
394 DEF_Pack(PetscComplex,5)
395 DEF_Pack(PetscComplex,7)
396 #endif
397 DEF_PackPair(int,int)
398 DEF_PackPair(PetscInt,PetscInt)
399 DEF_Block(int,1)
400 DEF_Block(int,2)
401 DEF_Block(int,4)
402 DEF_Block(int,8)
403 DEF_Block(char,1)
404 DEF_Block(char,2)
405 DEF_Block(char,4)
406 
407 #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
408 PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
409 {
410   int ierr;
411   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
412   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
413   return MPI_SUCCESS;
414 }
415 #endif
416 
417 PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink)
418 {
419   PetscErrorCode    ierr;
420   PetscSFPack       link,*p;
421   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
422 
423   PetscFunctionBegin;
424   /* Look for types in cache */
425   for (p=&bas->inuse; (link=*p); p=&link->next) {
426     PetscBool match;
427     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
428     if (match && (rkey == link->rkey) && (lkey == link->lkey)) {
429       switch (cmode) {
430       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
431       case PETSC_USE_POINTER: break;
432       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
433       }
434       *mylink = link;
435       PetscFunctionReturn(0);
436     }
437   }
438   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
439   PetscFunctionReturn(0);
440 }
441 
442 PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link)
443 {
444   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
445 
446   PetscFunctionBegin;
447   (*link)->rkey = NULL;
448   (*link)->lkey = NULL;
449   (*link)->next = bas->avail;
450   bas->avail    = *link;
451   *link         = NULL;
452   PetscFunctionReturn(0);
453 }
454 
455 /* Error out on unsupported overlapped communications */
456 PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey)
457 {
458   PetscErrorCode    ierr;
459   PetscSFPack       link,*p;
460   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
461   PetscBool         match;
462 
463   PetscFunctionBegin;
464   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
465      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
466   */
467   if (rkey || lkey) {
468     for (p=&bas->inuse; (link=*p); p=&link->next) {
469       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
470       if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey);
471     }
472   }
473   PetscFunctionReturn(0);
474 }
475 
476 PetscErrorCode PetscSFPackSetupType(PetscSFPack link,MPI_Datatype unit)
477 {
478   PetscErrorCode ierr;
479   PetscBool      isInt,isPetscInt,isPetscReal,is2Int,is2PetscInt,isSignedChar,isUnsignedChar;
480   PetscInt       nPetscIntContig,nPetscRealContig;
481   PetscMPIInt    ni,na,nd,combiner;
482 #if defined(PETSC_HAVE_COMPLEX)
483   PetscBool isPetscComplex;
484   PetscInt nPetscComplexContig;
485 #endif
486 
487   PetscFunctionBegin;
488   ierr = MPIPetsc_Type_compare(unit,MPI_SIGNED_CHAR,&isSignedChar);CHKERRQ(ierr);
489   ierr = MPIPetsc_Type_compare(unit,MPI_UNSIGNED_CHAR,&isUnsignedChar);CHKERRQ(ierr);
490   /* MPI_CHAR is treated below as a dumb block type that does not support reduction according to MPI standard */
491   ierr = MPIPetsc_Type_compare(unit,MPI_INT,&isInt);CHKERRQ(ierr);
492   ierr = MPIPetsc_Type_compare(unit,MPIU_INT,&isPetscInt);CHKERRQ(ierr);
493   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT,&nPetscIntContig);CHKERRQ(ierr);
494   ierr = MPIPetsc_Type_compare(unit,MPIU_REAL,&isPetscReal);CHKERRQ(ierr);
495   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscRealContig);CHKERRQ(ierr);
496 #if defined(PETSC_HAVE_COMPLEX)
497   ierr = MPIPetsc_Type_compare(unit,MPIU_COMPLEX,&isPetscComplex);CHKERRQ(ierr);
498   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplexContig);CHKERRQ(ierr);
499 #endif
500   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
501   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
502   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
503   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE;
504   link->bs = 1;
505 
506   if (isSignedChar) {PackInit_SignedChar(link); PackInit_Logical_SignedChar(link); PackInit_Bitwise_SignedChar(link); link->basicunit = MPI_SIGNED_CHAR;}
507   else if (isUnsignedChar) {PackInit_UnsignedChar(link); PackInit_Logical_UnsignedChar(link); PackInit_Bitwise_UnsignedChar(link); link->basicunit = MPI_UNSIGNED_CHAR;}
508   else if (isInt) {PackInit_int(link); PackInit_Logical_int(link); PackInit_Bitwise_int(link); link->basicunit = MPI_INT;}
509   else if (isPetscInt) {PackInit_PetscInt(link); PackInit_Logical_PetscInt(link); PackInit_Bitwise_PetscInt(link); link->basicunit = MPIU_INT;}
510   else if (isPetscReal) {PackInit_PetscReal(link); PackInit_Logical_PetscReal(link); link->basicunit = MPIU_REAL;}
511 #if defined(PETSC_HAVE_COMPLEX)
512   else if (isPetscComplex) {PackInit_PetscComplex_1(link); link->basicunit = MPIU_COMPLEX;}
513 #endif
514   else if (is2Int) {PackInit_int_int(link); link->basicunit = MPI_2INT;}
515   else if (is2PetscInt) {PackInit_PetscInt_PetscInt(link); link->basicunit = MPIU_2INT;}
516   else if (nPetscIntContig) {
517     if (nPetscIntContig%7 == 0) PackInit_PetscInt_7(link);
518     else if (nPetscIntContig%5 == 0) PackInit_PetscInt_5(link);
519     else if (nPetscIntContig%4 == 0) PackInit_PetscInt_4(link);
520     else if (nPetscIntContig%3 == 0) PackInit_PetscInt_3(link);
521     else if (nPetscIntContig%2 == 0) PackInit_PetscInt_2(link);
522     else PackInit_PetscInt(link);
523     link->bs = nPetscIntContig;
524     link->unitbytes *= nPetscIntContig;
525     link->basicunit = MPIU_INT;
526   } else if (nPetscRealContig) {
527     if (nPetscRealContig%7 == 0) PackInit_PetscReal_7(link);
528     else if (nPetscRealContig%5 == 0) PackInit_PetscReal_5(link);
529     else if (nPetscRealContig%4 == 0) PackInit_PetscReal_4(link);
530     else if (nPetscRealContig%3 == 0) PackInit_PetscReal_3(link);
531     else if (nPetscRealContig%2 == 0) PackInit_PetscReal_2(link);
532     else PackInit_PetscReal(link);
533     link->bs = nPetscRealContig;
534     link->unitbytes *= nPetscRealContig;
535     link->basicunit = MPIU_REAL;
536 #if defined(PETSC_HAVE_COMPLEX)
537   } else if (nPetscComplexContig) {
538     if (nPetscComplexContig%7 == 0) PackInit_PetscComplex_7(link);
539     else if (nPetscComplexContig%5 == 0) PackInit_PetscComplex_5(link);
540     else if (nPetscComplexContig%4 == 0) PackInit_PetscComplex_4(link);
541     else if (nPetscComplexContig%3 == 0) PackInit_PetscComplex_3(link);
542     else if (nPetscComplexContig%2 == 0) PackInit_PetscComplex_2(link);
543     else PackInit_PetscComplex_1(link);
544     link->bs = nPetscComplexContig;
545     link->unitbytes *= nPetscComplexContig;
546     link->basicunit = MPIU_COMPLEX;
547 #endif
548   } else {
549     MPI_Aint lb,bytes;
550     ierr = MPI_Type_get_extent(unit,&lb,&bytes);CHKERRQ(ierr);
551     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
552     if (bytes % sizeof(int)) { /* If the type size is not multiple of int */
553       if      (bytes%4 == 0) {PackInit_block_char_4(link); link->bs = bytes/4;} /* Note the basic type is char[4] */
554       else if (bytes%2 == 0) {PackInit_block_char_2(link); link->bs = bytes/2;}
555       else                   {PackInit_block_char_1(link); link->bs = bytes/1;}
556       link->unitbytes = bytes;
557       link->basicunit = MPI_CHAR;
558     } else {
559       PetscInt nInt = bytes / sizeof(int);
560       if      (nInt%8 == 0)  {PackInit_block_int_8(link);  link->bs = nInt/8;} /* Note the basic type is int[8] */
561       else if (nInt%4 == 0)  {PackInit_block_int_4(link);  link->bs = nInt/4;}
562       else if (nInt%2 == 0)  {PackInit_block_int_2(link);  link->bs = nInt/2;}
563       else                   {PackInit_block_int_1(link);  link->bs = nInt/1;}
564       link->unitbytes = bytes;
565       link->basicunit = MPI_INT;
566     }
567   }
568   if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */
569   else {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
570   PetscFunctionReturn(0);
571 }
572 
573 PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**UnpackAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*))
574 {
575   PetscFunctionBegin;
576   *UnpackAndOp = NULL;
577   if (op == MPIU_REPLACE) *UnpackAndOp = link->UnpackAndInsert;
578   else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->UnpackAndAdd;
579   else if (op == MPI_PROD) *UnpackAndOp = link->UnpackAndMult;
580   else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->UnpackAndMax;
581   else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->UnpackAndMin;
582   else if (op == MPI_LAND)   *UnpackAndOp = link->UnpackAndLAND;
583   else if (op == MPI_BAND)   *UnpackAndOp = link->UnpackAndBAND;
584   else if (op == MPI_LOR)    *UnpackAndOp = link->UnpackAndLOR;
585   else if (op == MPI_BOR)    *UnpackAndOp = link->UnpackAndBOR;
586   else if (op == MPI_LXOR)   *UnpackAndOp = link->UnpackAndLXOR;
587   else if (op == MPI_BXOR)   *UnpackAndOp = link->UnpackAndBXOR;
588   else if (op == MPI_MAXLOC) *UnpackAndOp = link->UnpackAndMaxloc;
589   else if (op == MPI_MINLOC) *UnpackAndOp = link->UnpackAndMinloc;
590   else *UnpackAndOp = NULL;
591   PetscFunctionReturn(0);
592 }
593 
594 PetscErrorCode PetscSFPackGetFetchAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**FetchAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*))
595 {
596   PetscFunctionBegin;
597   *FetchAndOp = NULL;
598   if (op == MPIU_REPLACE) *FetchAndOp = link->FetchAndInsert;
599   else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->FetchAndAdd;
600   else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->FetchAndMax;
601   else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->FetchAndMin;
602   else if (op == MPI_MAXLOC) *FetchAndOp = link->FetchAndMaxloc;
603   else if (op == MPI_MINLOC) *FetchAndOp = link->FetchAndMinloc;
604   else if (op == MPI_PROD)   *FetchAndOp = link->FetchAndMult;
605   else if (op == MPI_LAND)   *FetchAndOp = link->FetchAndLAND;
606   else if (op == MPI_BAND)   *FetchAndOp = link->FetchAndBAND;
607   else if (op == MPI_LOR)    *FetchAndOp = link->FetchAndLOR;
608   else if (op == MPI_BOR)    *FetchAndOp = link->FetchAndBOR;
609   else if (op == MPI_LXOR)   *FetchAndOp = link->FetchAndLXOR;
610   else if (op == MPI_BXOR)   *FetchAndOp = link->FetchAndBXOR;
611   else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for MPI_Op");
612   PetscFunctionReturn(0);
613 }
614 
615 /*
616   Setup pack/unpack optimization plans based on indice patterns available
617 
618    Input Parameters:
619   +  n       - number of target processors
620   .  offset  - [n+1] for the i-th processor, its associated indices are idx[offset[i], offset[i+1])
621   -  idx     - [] array storing indices. Its length is offset[n+1]
622 
623    Output Parameters:
624   +  opt    - the optimization
625 */
626 PetscErrorCode PetscSFPackSetupOptimization(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
627 {
628   PetscErrorCode ierr;
629   PetscInt       i,j,k,n_copies,tot_copies=0,step;
630   PetscBool      strided,has_strided=PETSC_FALSE,has_optimized=PETSC_FALSE;
631   PetscSFPackOpt opt;
632 
633   PetscFunctionBegin;
634   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
635   ierr = PetscCalloc2(n,&opt->optimized,n+1,&opt->copy_offset);CHKERRQ(ierr);
636 
637   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */
638   for (i=0; i<n; i++) { /* for each target processor */
639     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
640     n_copies = 1;
641     for (j=offset[i]; j<offset[i+1]-1; j++) {
642       if (idx[j]+1 != idx[j+1]) n_copies++;
643     }
644     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
645        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
646      */
647     if ((offset[i+1]-offset[i])/n_copies >= 32) {
648       opt->optimized[i] = PETSC_TRUE;
649       has_optimized     = PETSC_TRUE;
650       tot_copies       += n_copies;
651     }
652   }
653 
654   /* Setup memcpy plan for each contiguous piece */
655   k    = 0; /* k-th copy */
656   ierr = PetscMalloc2(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length);CHKERRQ(ierr);
657   for (i=0; i<n; i++) { /* for each target processor procs[i] */
658     if (opt->optimized[i]) {
659       n_copies           = 1;
660       opt->copy_start[k] = idx[offset[i]];
661       for (j=offset[i]; j<offset[i+1]-1; j++) {
662         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
663           n_copies++;
664           opt->copy_start[k+1] = idx[j+1];
665           opt->copy_length[k]  = idx[j] - opt->copy_start[k] + 1;
666           k++;
667         }
668       }
669       /* Set copy length of the last copy for this target */
670       opt->copy_length[k] = idx[j] - opt->copy_start[k] + 1;
671       k++;
672     }
673     /* Set offset for next target. When optimized[i]=false, copy_offsets[i]=copy_offsets[i+1] */
674     opt->copy_offset[i+1] = k;
675   }
676 
677   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
678   ierr = PetscMalloc3(n,&opt->stride_first,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
679   for (i=0; i<n; i++) { /* for each remote */
680     if (!opt->optimized[i] && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
681       strided = PETSC_TRUE;
682       step    = idx[offset[i]+1] - idx[offset[i]];
683       for (j=offset[i]; j<offset[i+1]-1; j++) {
684         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
685       }
686       if (strided) {
687         opt->optimized[i]    = PETSC_TRUE;
688         opt->stride_first[i] = idx[offset[i]];
689         opt->stride_step[i]  = step;
690         opt->stride_n[i]     = offset[i+1] - offset[i];
691         has_strided          = PETSC_TRUE;
692         has_optimized        = PETSC_TRUE;
693       }
694     }
695   }
696   /* If no target has been stride-optimized or optimized, free related arrays to save memory */
697   if (!has_strided) {ierr = PetscFree3(opt->stride_first,opt->stride_step,opt->stride_n);CHKERRQ(ierr);}
698   if (!has_optimized) {
699     ierr = PetscFree2(opt->optimized,opt->copy_offset);CHKERRQ(ierr);
700     ierr = PetscFree2(opt->copy_start,opt->copy_length);CHKERRQ(ierr);
701     ierr = PetscFree(opt);CHKERRQ(ierr);
702     *out = NULL;
703   } else *out = opt;
704   PetscFunctionReturn(0);
705 }
706 
707 PetscErrorCode PetscSFPackDestoryOptimization(PetscSFPackOpt *out)
708 {
709   PetscErrorCode ierr;
710   PetscSFPackOpt opt = *out;
711 
712   PetscFunctionBegin;
713   if (opt) {
714     ierr = PetscFree2(opt->optimized,opt->copy_offset);CHKERRQ(ierr);
715     ierr = PetscFree2(opt->copy_start,opt->copy_length);CHKERRQ(ierr);
716     ierr = PetscFree3(opt->stride_first,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
717     ierr = PetscFree(opt);CHKERRQ(ierr);
718     *out = NULL;
719   }
720   PetscFunctionReturn(0);
721 }
722