xref: /petsc/src/sys/utils/mpits.c (revision 89157a57023e004f42a2c772078aa1e89232a29a)
1 #include <petscsys.h>        /*I  "petscsys.h"  I*/
2 
/* Logging event used to profile the rendezvous performed by PetscCommBuildTwoSided() and its variants */
PetscLogEvent PETSC_BuildTwoSided;
4 
/* String names of PetscBuildTwoSidedType values, followed by the enum type name
 * and the option-value prefix, in the layout expected by PetscOptionsGetEnum().
 * The list is NULL-terminated (NULL is the idiomatic null-pointer sentinel). */
const char *const PetscBuildTwoSidedTypes[] = {
  "ALLREDUCE",
  "IBARRIER",
  "REDSCATTER",
  "PetscBuildTwoSidedType",
  "PETSC_BUILDTWOSIDED_",
  NULL
};
13 
/* Global (not per-communicator) algorithm selection; defaulted lazily in PetscCommBuildTwoSidedGetType() */
static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;
15 
16 #undef __FUNCT__
17 #define __FUNCT__ "PetscCommBuildTwoSidedSetType"
18 /*@
19    PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication
20 
21    Logically Collective
22 
23    Input Arguments:
24 +  comm - PETSC_COMM_WORLD
25 -  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()
26 
27    Level: developer
28 
29    Note:
30    This option is currently global, but could be made per-communicator.
31 
32 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedGetType()
33 @*/
34 PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm,PetscBuildTwoSidedType twosided)
35 {
36   PetscFunctionBegin;
37 #if defined(PETSC_USE_DEBUG)
38   {                             /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
39     PetscMPIInt ierr;
40     PetscMPIInt b1[2],b2[2];
41     b1[0] = -(PetscMPIInt)twosided;
42     b1[1] = (PetscMPIInt)twosided;
43     ierr  = MPI_Allreduce(b1,b2,2,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
44     if (-b2[0] != b2[1]) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"Enum value must be same on all processes");
45   }
46 #endif
47   _twosided_type = twosided;
48   PetscFunctionReturn(0);
49 }
50 
#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSidedGetType"
/*@
   PetscCommBuildTwoSidedGetType - get algorithm to use when building two-sided communication

   Logically Collective

   Input Arguments:
.  comm - communicator on which to query algorithm

   Output Arguments:
.  twosided - algorithm to use for PetscCommBuildTwoSided()

   Level: developer

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedSetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm,PetscBuildTwoSidedType *twosided)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
    /* Choose a compile-time default: prefer the memory-scalable Ibarrier variant when MPI-3 is
     * available, except on MPICH ch3:sock builds whose nonblocking-collective progress is broken. */
#if defined(PETSC_HAVE_MPI_IBARRIER)
#  if defined(PETSC_HAVE_MPICH_CH3_SOCK) && !defined(PETSC_HAVE_MPICH_CH3_SOCK_FIXED_NBC_PROGRESS)
    /* Deadlock in Ibarrier: http://trac.mpich.org/projects/mpich/ticket/1785 */
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE;
#  else
    _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
#  endif
#else
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE;
#endif
    /* Allow the default to be overridden from the options database */
    ierr = PetscOptionsGetEnum(NULL,"-build_twosided",PetscBuildTwoSidedTypes,(PetscEnum*)&_twosided_type,NULL);CHKERRQ(ierr);
  }
  *twosided = _twosided_type;
  PetscFunctionReturn(0);
}
88 
#if defined(PETSC_HAVE_MPI_IBARRIER)

#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSided_Ibarrier"
/* NBX-style rendezvous (Hoefler, Siebert & Lumsdaine 2010): post synchronous sends, then loop
 * probing for incoming messages; once all of this rank's sends have been matched, enter a
 * nonblocking barrier and keep receiving until the barrier completes on every rank.
 * Caller frees *fromranks and *(void**)fromdata with PetscFree(). */
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;

  PetscFunctionBegin;
  /* Duplicate so our probes/sends cannot match unrelated traffic; comm now refers to the inner communicator */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr  = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  /* Synchronous sends: completion implies the receiver has matched the message */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  /* Number of senders is unknown in advance, so receive into growable segmented buffers */
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      ierr      = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr      = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      nrecvs++;
    }
    if (barrier == MPI_REQUEST_NULL) {
      /* Not in the barrier yet: enter it once all our synchronous sends have completed */
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
        ierr = PetscFree(sendreqs);CHKERRQ(ierr);
      }
    } else {
      /* Barrier completion means every rank's sends are matched, so no more messages are in flight */
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr   = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr   = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  ierr   = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
150 
#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSided_Allreduce"
/* Rendezvous via a global reduction: each rank flags the ranks it sends to, and
 * PetscGatherNumberOfMessages() reduces those flags so each rank learns how many
 * messages it will receive, after which matching point-to-point transfers are posted.
 * Caller frees *fromranks and *(void**)fromdata with PetscFree(). */
static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata,*fdata;
  MPI_Request    *reqs,*sendreqs;
  MPI_Status     *statuses;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* iflags[r] = 1 iff this rank sends to rank r; the reduction of column r gives rank r's receive count */
  ierr = PetscCalloc1(size,&iflags);CHKERRQ(ierr);
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr = PetscGatherNumberOfMessages(comm,iflags,NULL,&nrecvs);CHKERRQ(ierr);
  ierr = PetscFree(iflags);CHKERRQ(ierr);

  /* Duplicate so our transfers cannot match unrelated traffic; comm now refers to the inner communicator */
  ierr     = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;     /* receives occupy reqs[0..nrecvs-1], sends follow */
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  /* statuses[0..nrecvs-1] belong to the receives, so they carry the source ranks */
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
193 
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSided_RedScatter"
/* Rendezvous via MPI_Reduce_scatter_block: each rank flags the ranks it sends to and the
 * reduce-scatter delivers each rank its own receive count, after which matching
 * point-to-point transfers are posted.
 * Caller frees *fromranks and *(void**)fromdata with PetscFree(). */
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata,*fdata;
  MPI_Request    *reqs,*sendreqs;
  MPI_Status     *statuses;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* Zeroed allocation in one call, consistent with PetscCommBuildTwoSided_Allreduce() */
  ierr = PetscCalloc1(size,&iflags);CHKERRQ(ierr);
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr = MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  ierr = PetscFree(iflags);CHKERRQ(ierr);

  /* Duplicate so our transfers cannot match unrelated traffic; comm now refers to the inner communicator */
  ierr     = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;     /* receives occupy reqs[0..nrecvs-1], sends follow */
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  /* statuses[0..nrecvs-1] belong to the receives, so they carry the source ranks */
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
#endif
239 
#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSided"
/*@C
   PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

   Collective on MPI_Comm

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
-  todata - data to send to each rank (packed)

   Output Arguments:
+  nfrom - number of ranks receiving messages from
.  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())

   Level: developer

   Options Database Keys:
.  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   References:
   The MPI_Ibarrier implementation uses the algorithm in
   Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode         ierr;
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

  PetscFunctionBegin;
  ierr = PetscSysInitializePackage();CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
  /* Dispatch to the selected implementation; methods gated on MPI features error out cleanly when unavailable */
  ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_IBARRIER)
    ierr = PetscCommBuildTwoSided_Ibarrier(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
    break;
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
    ierr = PetscCommBuildTwoSided_Allreduce(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
    break;
  case PETSC_BUILDTWOSIDED_REDSCATTER:
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
    ierr = PetscCommBuildTwoSided_RedScatter(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
#endif
    break;
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  ierr = PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
309 
310 #undef __FUNCT__
311 #define __FUNCT__ "PetscCommBuildTwoSidedFReq_Reference"
312 static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
313                                                            PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
314                                                            PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
315                                                            PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
316 {
317   PetscErrorCode ierr;
318   PetscMPIInt i,*tag;
319   MPI_Aint    lb,unitbytes;
320   MPI_Request *sendreq,*recvreq;
321 
322   PetscFunctionBegin;
323   ierr = PetscMalloc1(ntags,&tag);CHKERRQ(ierr);
324   if (ntags > 0) {
325     ierr = PetscCommDuplicate(comm,&comm,&tag[0]);CHKERRQ(ierr);
326   }
327   for (i=1; i<ntags; i++) {
328     ierr = PetscCommGetNewTag(comm,&tag[i]);CHKERRQ(ierr);
329   }
330 
331   /* Perform complete initial rendezvous */
332   ierr = PetscCommBuildTwoSided(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
333 
334   ierr = PetscMalloc1(nto*ntags,&sendreq);CHKERRQ(ierr);
335   ierr = PetscMalloc1(*nfrom*ntags,&recvreq);CHKERRQ(ierr);
336 
337   ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
338   if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
339   for (i=0; i<nto; i++) {
340     PetscMPIInt k;
341     for (k=0; k<ntags; k++) sendreq[i*ntags+k] = MPI_REQUEST_NULL;
342     ierr = (*send)(comm,tag,i,toranks[i],((char*)todata)+count*unitbytes*i,sendreq+i*ntags,ctx);CHKERRQ(ierr);
343   }
344   for (i=0; i<*nfrom; i++) {
345     void *header = (*(char**)fromdata) + count*unitbytes*i;
346     PetscMPIInt k;
347     for (k=0; k<ntags; k++) recvreq[i*ntags+k] = MPI_REQUEST_NULL;
348     ierr = (*recv)(comm,tag,(*fromranks)[i],header,recvreq+i*ntags,ctx);CHKERRQ(ierr);
349   }
350   ierr = PetscFree(tag);CHKERRQ(ierr);
351   ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
352   *toreqs = sendreq;
353   *fromreqs = recvreq;
354   PetscFunctionReturn(0);
355 }
356 
#if defined(PETSC_HAVE_MPI_IBARRIER)

#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSidedFReq_Ibarrier"
/* NBX-style rendezvous fused with the payload phase: synchronous header sends drive the
 * dynamic sparse data exchange while the user's send/recv callbacks post the (typically
 * larger) payload transfers, whose requests are returned in *toreqs/*fromreqs.
 * Caller waits on and frees *toreqs (nto*ntags) and *fromreqs (*nfrom*ntags). */
static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,*tags,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,*usendreqs,*req,barrier;
  PetscSegBuffer segrank,segdata,segreq;

  PetscFunctionBegin;
  /* Duplicate so our probes/sends cannot match unrelated traffic; tag is for headers, tags[] for payload callbacks */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = PetscMalloc1(ntags,&tags);CHKERRQ(ierr);
  for (i=0; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tags[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  ierr = PetscMalloc1(nto*ntags,&usendreqs);CHKERRQ(ierr);
  /* Post synchronous sends */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    /* Pre-fill with MPI_REQUEST_NULL so callbacks may leave unused slots untouched */
    for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx);CHKERRQ(ierr);
  }

  /* Number of senders is unknown in advance, so receive into growable segmented buffers */
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank,k;
      void        *buf;
      ierr = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      /* Immediately let the user post payload receives for this newly discovered sender */
      ierr = PetscSegBufferGet(segreq,ntags,&req);CHKERRQ(ierr);
      for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
      ierr = (*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx);CHKERRQ(ierr);
      nrecvs++;
    }
    if (barrier == MPI_REQUEST_NULL) {
      /* Not in the barrier yet: enter it once all our synchronous header sends have completed */
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
      }
    } else {
      /* Barrier completion means every rank's header sends are matched: no more headers in flight */
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  *toreqs = usendreqs;
  ierr = PetscSegBufferExtractAlloc(segreq,fromreqs);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segreq);CHKERRQ(ierr);
  ierr = PetscFree(sendreqs);CHKERRQ(ierr);
  ierr = PetscFree(tags);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
443 
444 #undef __FUNCT__
445 #define __FUNCT__ "PetscCommBuildTwoSidedF"
446 /*@C
447    PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous
448 
449    Collective on MPI_Comm
450 
451    Input Arguments:
452 +  comm - communicator
453 .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
454 .  dtype - datatype to send/receive from each rank (must match on all ranks)
455 .  nto - number of ranks to send data to
456 .  toranks - ranks to send to (array of length nto)
457 .  todata - data to send to each rank (packed)
458 .  ntags - number of tags needed by send/recv callbacks
459 .  send - callback invoked on sending process when ready to send primary payload
460 .  recv - callback invoked on receiving process after delivery of rendezvous message
461 -  ctx - context for callbacks
462 
463    Output Arguments:
464 +  nfrom - number of ranks receiving messages from
465 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
466 -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
467 
468    Level: developer
469 
470    Notes:
471    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
472    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.
473 
474    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
475 
476    References:
477    The MPI_Ibarrier implementation uses the algorithm in
478    Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
479 
480 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedFReq(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
481 @*/
482 PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,
483                                        PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
484                                        PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
485 {
486   PetscErrorCode ierr;
487   MPI_Request    *toreqs,*fromreqs;
488 
489   PetscFunctionBegin;
490   ierr = PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,&toreqs,&fromreqs,send,recv,ctx);CHKERRQ(ierr);
491   ierr = MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
492   ierr = MPI_Waitall(*nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
493   ierr = PetscFree(toreqs);CHKERRQ(ierr);
494   ierr = PetscFree(fromreqs);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #undef __FUNCT__
499 #define __FUNCT__ "PetscCommBuildTwoSidedFReq"
500 /*@C
501    PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests
502 
503    Collective on MPI_Comm
504 
505    Input Arguments:
506 +  comm - communicator
507 .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
508 .  dtype - datatype to send/receive from each rank (must match on all ranks)
509 .  nto - number of ranks to send data to
510 .  toranks - ranks to send to (array of length nto)
511 .  todata - data to send to each rank (packed)
512 .  ntags - number of tags needed by send/recv callbacks
513 .  send - callback invoked on sending process when ready to send primary payload
514 .  recv - callback invoked on receiving process after delivery of rendezvous message
515 -  ctx - context for callbacks
516 
517    Output Arguments:
518 +  nfrom - number of ranks receiving messages from
519 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
520 .  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
521 .  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
522 -  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())
523 
524    Level: developer
525 
526    Notes:
527    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
528    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.
529 
530    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
531 
532    References:
533    The MPI_Ibarrier implementation uses the algorithm in
534    Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
535 
536 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedF(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
537 @*/
538 PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
539                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
540                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
541                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
542 {
543   PetscErrorCode         ierr,(*f)(MPI_Comm,PetscMPIInt,MPI_Datatype,PetscMPIInt,const PetscMPIInt[],const void*,
544                                    PetscMPIInt*,PetscMPIInt**,void*,PetscMPIInt,MPI_Request**,MPI_Request**,
545                                    PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
546                                    PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx);
547   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
548 
549   PetscFunctionBegin;
550   ierr = PetscSysInitializePackage();CHKERRQ(ierr);
551   ierr = PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
552   ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
553   switch (buildtype) {
554   case PETSC_BUILDTWOSIDED_IBARRIER:
555 #if defined(PETSC_HAVE_MPI_IBARRIER)
556     f = PetscCommBuildTwoSidedFReq_Ibarrier;
557 #else
558     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
559 #endif
560     break;
561   case PETSC_BUILDTWOSIDED_ALLREDUCE:
562   case PETSC_BUILDTWOSIDED_REDSCATTER:
563     f = PetscCommBuildTwoSidedFReq_Reference;
564     break;
565   default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
566   }
567   ierr = (*f)(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,toreqs,fromreqs,send,recv,ctx);CHKERRQ(ierr);
568   ierr = PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
569   PetscFunctionReturn(0);
570 }
571