#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

PetscLogEvent PETSC_BuildTwoSided;
PetscLogEvent PETSC_BuildTwoSidedF;

const char *const PetscBuildTwoSidedTypes[] = {
  "ALLREDUCE",
  "IBARRIER",
  "REDSCATTER",
  "PetscBuildTwoSidedType",
  "PETSC_BUILDTWOSIDED_",
  0
};

static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;

/*@
   PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

   Logically Collective

   Input Arguments:
+  comm - communicator on which to set the algorithm (the setting is currently global, so PETSC_COMM_WORLD is typical)
-  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()

   Level: developer

   Note:
   This option is currently global, but could be made per-communicator.
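
   Example Usage:
   A minimal sketch; PETSC_BUILDTWOSIDED_IBARRIER requires an MPI implementation that provides MPI_Ibarrier (MPI-3):
.vb
   ierr = PetscCommBuildTwoSidedSetType(PETSC_COMM_WORLD,PETSC_BUILDTWOSIDED_IBARRIER);CHKERRQ(ierr);
.ve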

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedGetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm,PetscBuildTwoSidedType twosided)
{
  PetscFunctionBegin;
#if defined(PETSC_USE_DEBUG)
  {                             /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
    PetscMPIInt ierr;
    PetscMPIInt b1[2],b2[2];
    b1[0] = -(PetscMPIInt)twosided;
    b1[1] = (PetscMPIInt)twosided;
    ierr  = MPIU_Allreduce(b1,b2,2,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (-b2[0] != b2[1]) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"Enum value must be same on all processes");
  }
#endif
  _twosided_type = twosided;
  PetscFunctionReturn(0);
}

/*@
   PetscCommBuildTwoSidedGetType - get the algorithm to use when building two-sided communication

   Logically Collective

   Input Argument:
.  comm - communicator on which to query the algorithm

   Output Argument:
.  twosided - algorithm that will be used by PetscCommBuildTwoSided()

   Level: developer

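   Example Usage:
   A minimal sketch that reports the chosen algorithm; it assumes the usual correspondence between the
   non-negative enum values and the entries of PetscBuildTwoSidedTypes[]:
.vb
   PetscBuildTwoSidedType t;
   ierr = PetscCommBuildTwoSidedGetType(PETSC_COMM_WORLD,&t);CHKERRQ(ierr);
   ierr = PetscPrintf(PETSC_COMM_WORLD,"two-sided algorithm: %s\n",PetscBuildTwoSidedTypes[t]);CHKERRQ(ierr);
.ve
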
.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedSetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm,PetscBuildTwoSidedType *twosided)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
#  if defined(PETSC_HAVE_MPICH_CH3_SOCK) && !defined(PETSC_HAVE_MPICH_CH3_SOCK_FIXED_NBC_PROGRESS)
    /* Deadlock in Ibarrier: http://trac.mpich.org/projects/mpich/ticket/1785 */
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE;
#  else
    _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
#  endif
#else
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE;
#endif
    ierr = PetscOptionsGetEnum(NULL,NULL,"-build_twosided",PetscBuildTwoSidedTypes,(PetscEnum*)&_twosided_type,NULL);CHKERRQ(ierr);
  }
  *twosided = _twosided_type;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)

/* Nonblocking consensus (NBX) algorithm of Hoefler, Siebert, and Lumsdaine (2010): post synchronous sends,
 * receive any message that is probed, and once all local sends have completed start a nonblocking barrier;
 * when the barrier completes, every message in the exchange has been matched. */
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr  = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      ierr      = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr      = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        barrier_started = PETSC_TRUE;
        ierr = PetscFree(sendreqs);CHKERRQ(ierr);
      }
    } else {
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr   = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr   = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  ierr   = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif

/* Count the number of incoming messages with PetscGatherNumberOfMessages() (a reduction over per-destination
 * flags), then complete the exchange with nonblocking point-to-point sends and receives. */
static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata,*fdata;
  MPI_Request    *reqs,*sendreqs;
  MPI_Status     *statuses;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = PetscCalloc1(size,&iflags);CHKERRQ(ierr);
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr = PetscGatherNumberOfMessages(comm,iflags,NULL,&nrecvs);CHKERRQ(ierr);
  ierr = PetscFree(iflags);CHKERRQ(ierr);

  ierr     = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
/* Same exchange as the Allreduce variant, but the incoming-message count is obtained directly with
 * MPI_Reduce_scatter_block(), which delivers to each rank the sum of the flags destined for it. */
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata,*fdata;
  MPI_Request    *reqs,*sendreqs;
  MPI_Status     *statuses;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = PetscMalloc1(size,&iflags);CHKERRQ(ierr);
  ierr = PetscMemzero(iflags,size*sizeof(*iflags));CHKERRQ(ierr);
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr = MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  ierr = PetscFree(iflags);CHKERRQ(ierr);

  ierr     = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
#endif

/*@C
   PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

   Collective on MPI_Comm

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
-  todata - data to send to each rank (packed)

   Output Arguments:
+  nfrom - number of ranks from which messages were received
.  fromranks - ranks from which messages were received (length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())

   Level: developer

   Options Database Keys:
.  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
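
   Example Usage:
   A minimal sketch in which every rank sends its rank number to the next rank and discovers who sent to it
   (the ring pattern is illustrative only):
.vb
   PetscErrorCode ierr;
   PetscMPIInt    rank,size,nto = 1,toranks[1],todata[1],nfrom,*fromranks,*fromdata;

   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
   toranks[0] = (rank+1)%size;   /* send to the next rank in a ring */
   todata[0]  = rank;
   ierr = PetscCommBuildTwoSided(comm,1,MPI_INT,nto,toranks,todata,&nfrom,&fromranks,&fromdata);CHKERRQ(ierr);
   /* here nfrom == 1 and fromranks[0] == (rank+size-1)%size */
   ierr = PetscFree(fromranks);CHKERRQ(ierr);
   ierr = PetscFree(fromdata);CHKERRQ(ierr);
.ve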

   References:
.  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
   The MPI_Ibarrier-based implementation uses the algorithm from this paper.

.seealso: PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode         ierr;
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

  PetscFunctionBegin;
  ierr = PetscSysInitializePackage();CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
    ierr = PetscCommBuildTwoSided_Ibarrier(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
    break;
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
    ierr = PetscCommBuildTwoSided_Allreduce(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
    break;
  case PETSC_BUILDTWOSIDED_REDSCATTER:
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
    ierr = PetscCommBuildTwoSided_RedScatter(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
#endif
    break;
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  ierr = PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Reference implementation: complete the rendezvous with PetscCommBuildTwoSided(), then invoke the user's
 * send/recv callbacks, giving each pair of communicating ranks ntags distinct tags for its payload messages. */
static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt i,*tag;
  MPI_Aint    lb,unitbytes;
  MPI_Request *sendreq,*recvreq;

  PetscFunctionBegin;
  ierr = PetscMalloc1(ntags,&tag);CHKERRQ(ierr);
  if (ntags > 0) {
    ierr = PetscCommDuplicate(comm,&comm,&tag[0]);CHKERRQ(ierr);
  }
  for (i=1; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tag[i]);CHKERRQ(ierr);
  }

  /* Perform complete initial rendezvous */
  ierr = PetscCommBuildTwoSided(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);

  ierr = PetscMalloc1(nto*ntags,&sendreq);CHKERRQ(ierr);
  ierr = PetscMalloc1(*nfrom*ntags,&recvreq);CHKERRQ(ierr);

  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) sendreq[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tag,i,toranks[i],((char*)todata)+count*unitbytes*i,sendreq+i*ntags,ctx);CHKERRQ(ierr);
  }
  for (i=0; i<*nfrom; i++) {
    void *header = (*(char**)fromdata) + count*unitbytes*i;
    PetscMPIInt k;
    for (k=0; k<ntags; k++) recvreq[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*recv)(comm,tag,(*fromranks)[i],header,recvreq+i*ntags,ctx);CHKERRQ(ierr);
  }
  ierr = PetscFree(tag);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  *toreqs = sendreq;
  *fromreqs = recvreq;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)

/* Nonblocking-consensus variant of the callback-based rendezvous: like PetscCommBuildTwoSided_Ibarrier(), but the
 * user's recv callback is invoked for each rendezvous message as it arrives, so payload receives are posted before
 * the consensus completes. */
static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,*tags,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,*usendreqs,*req,barrier;
  PetscSegBuffer segrank,segdata,segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = PetscMalloc1(ntags,&tags);CHKERRQ(ierr);
  for (i=0; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tags[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  ierr = PetscMalloc1(nto*ntags,&usendreqs);CHKERRQ(ierr);
  /* Post synchronous sends */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx);CHKERRQ(ierr);
  }

  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank,k;
      void        *buf;
      ierr = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      ierr = PetscSegBufferGet(segreq,ntags,&req);CHKERRQ(ierr);
      for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
      ierr = (*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        barrier_started = PETSC_TRUE;
      }
    } else {
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  *toreqs = usendreqs;
  ierr = PetscSegBufferExtractAlloc(segreq,fromreqs);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segreq);CHKERRQ(ierr);
  ierr = PetscFree(sendreqs);CHKERRQ(ierr);
  ierr = PetscFree(tags);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif

/*@C
   PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous

   Collective on MPI_Comm

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Arguments:
+  nfrom - number of ranks from which messages were received
.  fromranks - ranks from which messages were received (length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
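
   Example Usage:
   A minimal sketch of a send callback with ntags = 1 (MySend, MyCtx, and the payload fields are illustrative
   names, not part of the API). The callback receives the tag array, the index of the destination within toranks,
   the destination rank, that rank's packed rendezvous entry, ntags request slots, and the user context; unused
   request slots may be left as MPI_REQUEST_NULL, since the caller pre-initializes them:
.vb
   static PetscErrorCode MySend(MPI_Comm comm,const PetscMPIInt tags[],PetscMPIInt tonum,PetscMPIInt rank,void *header,MPI_Request req[],void *ctx)
   {
     MyCtx          *my = (MyCtx*)ctx;
     PetscErrorCode ierr;

     PetscFunctionBegin;
     ierr = MPI_Isend(my->payload[tonum],my->len[tonum],MPI_DOUBLE,rank,tags[0],comm,req);CHKERRQ(ierr);
     PetscFunctionReturn(0);
   }
.ve
   A matching MyRecv() posts MPI_Irecv() with tags[0], and the exchange is run with
.vb
   ierr = PetscCommBuildTwoSidedF(comm,1,MPI_INT,nto,toranks,todata,&nfrom,&fromranks,&fromdata,1,MySend,MyRecv,&my);CHKERRQ(ierr);
.ve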

   References:
.  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
   The MPI_Ibarrier-based implementation uses the algorithm from this paper.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedFReq(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,
                                       PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                       PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  MPI_Request    *toreqs,*fromreqs;

  PetscFunctionBegin;
  ierr = PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,&toreqs,&fromreqs,send,recv,ctx);CHKERRQ(ierr);
  ierr = MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = MPI_Waitall(*nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = PetscFree(toreqs);CHKERRQ(ierr);
  ierr = PetscFree(fromreqs);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
   PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous and returning requests for the payload messages

   Collective on MPI_Comm

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Arguments:
+  nfrom - number of ranks from which messages were received
.  fromranks - ranks from which messages were received (length nfrom; caller should PetscFree())
.  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
.  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
-  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
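
   Example Usage:
   A minimal sketch of driving the exchange and completing the returned requests yourself (MySend, MyRecv, and my
   are the illustrative names from the PetscCommBuildTwoSidedF() example, again with ntags = 1):
.vb
   MPI_Request *toreqs,*fromreqs;

   ierr = PetscCommBuildTwoSidedFReq(comm,1,MPI_INT,nto,toranks,todata,&nfrom,&fromranks,&fromdata,1,&toreqs,&fromreqs,MySend,MyRecv,&my);CHKERRQ(ierr);
   /* ... unrelated work may overlap the payload communication here ... */
   ierr = MPI_Waitall(nto,toreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);     /* nto*ntags requests */
   ierr = MPI_Waitall(nfrom,fromreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr); /* nfrom*ntags requests */
   ierr = PetscFree(toreqs);CHKERRQ(ierr);
   ierr = PetscFree(fromreqs);CHKERRQ(ierr);
.ve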

   References:
.  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
   The MPI_Ibarrier-based implementation uses the algorithm from this paper.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedF(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode         ierr,(*f)(MPI_Comm,PetscMPIInt,MPI_Datatype,PetscMPIInt,const PetscMPIInt[],const void*,
                                   PetscMPIInt*,PetscMPIInt**,void*,PetscMPIInt,MPI_Request**,MPI_Request**,
                                   PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                   PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx);
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
  PetscMPIInt i,size;

  PetscFunctionBegin;
  ierr = PetscSysInitializePackage();CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    if (toranks[i] < 0 || size <= toranks[i]) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"toranks[%d] %d not in comm size %d",i,toranks[i],size);
  }
  ierr = PetscLogEventBegin(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
    f = PetscCommBuildTwoSidedFReq_Ibarrier;
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
    break;
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
  case PETSC_BUILDTWOSIDED_REDSCATTER:
    f = PetscCommBuildTwoSidedFReq_Reference;
    break;
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  ierr = (*f)(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,toreqs,fromreqs,send,recv,ctx);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}