#include <petscsys.h>        /*I  "petscsys.h"  I*/

PetscLogEvent PETSC_BuildTwoSided,PETSC_BuildTwoSidedF;

const char *const PetscBuildTwoSidedTypes[] = {
  "ALLREDUCE",
  "IBARRIER",
  "REDSCATTER",
  "PetscBuildTwoSidedType",
  "PETSC_BUILDTWOSIDED_",
  0
};

static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;

/*@
   PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

   Logically Collective

   Input Arguments:
+  comm - communicator (the setting is currently global rather than per-communicator, so PETSC_COMM_WORLD is typical)
-  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()

   Level: developer

   Note:
   This option is currently global, but could be made per-communicator.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedGetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm,PetscBuildTwoSidedType twosided)
{
  PetscFunctionBegin;
#if defined(PETSC_USE_DEBUG)
  {                             /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
    PetscMPIInt ierr;
    PetscMPIInt b1[2],b2[2];
    b1[0] = -(PetscMPIInt)twosided;
    b1[1] = (PetscMPIInt)twosided;
    ierr  = MPIU_Allreduce(b1,b2,2,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (-b2[0] != b2[1]) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"Enum value must be same on all processes");
  }
#endif
  _twosided_type = twosided;
  PetscFunctionReturn(0);
}
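
/* Example usage (a minimal sketch, assuming PetscInitialize() has been called): select the
 * Ibarrier algorithm programmatically; the same choice is available at runtime via
 * -build_twosided ibarrier.
 *
 *   PetscErrorCode ierr;
 *   ierr = PetscCommBuildTwoSidedSetType(PETSC_COMM_WORLD,PETSC_BUILDTWOSIDED_IBARRIER);CHKERRQ(ierr);
 */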

/*@
   PetscCommBuildTwoSidedGetType - get algorithm to use when building two-sided communication

   Logically Collective

   Input Argument:
.  comm - communicator on which to query algorithm

   Output Argument:
.  twosided - algorithm to use for PetscCommBuildTwoSided()

   Level: developer

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedSetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm,PetscBuildTwoSidedType *twosided)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
#  if defined(PETSC_HAVE_MPICH_CH3_SOCK) && !defined(PETSC_HAVE_MPICH_CH3_SOCK_FIXED_NBC_PROGRESS)
    /* Deadlock in Ibarrier: http://trac.mpich.org/projects/mpich/ticket/1785 */
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE;
#  else
    _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
#  endif
#else
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE;
#endif
    ierr = PetscOptionsGetEnum(NULL,NULL,"-build_twosided",PetscBuildTwoSidedTypes,(PetscEnum*)&_twosided_type,NULL);CHKERRQ(ierr);
  }
  *twosided = _twosided_type;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)

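/* Nonblocking consensus (NBX) algorithm from the Hoefler, Siebert, and Lumsdaine reference
 * cited in the manual pages below: post a synchronous send (MPI_Issend) to every known
 * destination, then loop, receiving any message that is probed while testing the local sends.
 * Once all local sends have been matched, enter a nonblocking barrier, but keep probing until
 * the barrier completes; at that point every rank's sends have been received, so no further
 * messages can arrive. */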
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr  = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      ierr      = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr      = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        barrier_started = PETSC_TRUE;
        ierr = PetscFree(sendreqs);CHKERRQ(ierr);
      }
    } else {
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr   = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr   = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  ierr   = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif

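/* Reference algorithm: count the number of incoming messages with a collective reduction over
 * per-rank flags (inside PetscGatherNumberOfMessages()), then post MPI_Irecv(MPI_ANY_SOURCE)
 * for each expected message and match them against the nonblocking sends. */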
static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata,*fdata;
  MPI_Request    *reqs,*sendreqs;
  MPI_Status     *statuses;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = PetscCalloc1(size,&iflags);CHKERRQ(ierr);
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr = PetscGatherNumberOfMessages(comm,iflags,NULL,&nrecvs);CHKERRQ(ierr);
  ierr = PetscFree(iflags);CHKERRQ(ierr);

  ierr     = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
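/* Same structure as the Allreduce variant above, but the receive count is obtained with
 * MPI_Reduce_scatter_block: every rank contributes a 0/1 flag per destination and receives the
 * sum of the flags aimed at itself. */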
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata,*fdata;
  MPI_Request    *reqs,*sendreqs;
  MPI_Status     *statuses;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = PetscCalloc1(size,&iflags);CHKERRQ(ierr);
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr = MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  ierr = PetscFree(iflags);CHKERRQ(ierr);

  ierr     = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
#endif

/*@C
   PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

   Collective on MPI_Comm

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
-  todata - data to send to each rank (packed)

   Output Arguments:
+  nfrom - number of ranks from which messages are received
.  fromranks - ranks from which messages are received (length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())

   Level: developer

   Options Database Keys:
.  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   The MPI_Ibarrier implementation uses the algorithm in reference 1.

   References:
.  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode         ierr;
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

  PetscFunctionBegin;
  ierr = PetscSysInitializePackage();CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
    ierr = PetscCommBuildTwoSided_Ibarrier(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
    break;
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
    ierr = PetscCommBuildTwoSided_Allreduce(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
    break;
  case PETSC_BUILDTWOSIDED_REDSCATTER:
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
    ierr = PetscCommBuildTwoSided_RedScatter(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
#endif
    break;
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  ierr = PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
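
/* Example usage (a minimal hypothetical sketch; comm is assumed to be a valid communicator):
 * each rank announces one PetscMPIInt (say, an upcoming message length) to the destinations it
 * knows about and learns which ranks will be sending to it.
 *
 *   PetscMPIInt    nto       = 1;
 *   PetscMPIInt    toranks[] = {0};   // this rank will send to rank 0
 *   PetscMPIInt    todata[]  = {42};  // one entry per destination since count == 1
 *   PetscMPIInt    nfrom,*fromranks,*fromdata;
 *   PetscErrorCode ierr;
 *
 *   ierr = PetscCommBuildTwoSided(comm,1,MPI_INT,nto,toranks,todata,&nfrom,&fromranks,&fromdata);CHKERRQ(ierr);
 *   // fromranks[i] is a sender's rank; fromdata[i] is the value it sent
 *   ierr = PetscFree(fromranks);CHKERRQ(ierr);
 *   ierr = PetscFree(fromdata);CHKERRQ(ierr);
 */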
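
/* Reference implementation of the request-returning rendezvous: complete the whole
 * PetscCommBuildTwoSided() exchange first, then call the user's send/recv callbacks for the
 * (typically larger) payloads and hand their requests back to the caller. */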
static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt i,*tag;
  MPI_Aint    lb,unitbytes;
  MPI_Request *sendreq,*recvreq;

  PetscFunctionBegin;
  ierr = PetscMalloc1(ntags,&tag);CHKERRQ(ierr);
  if (ntags > 0) {
    ierr = PetscCommDuplicate(comm,&comm,&tag[0]);CHKERRQ(ierr);
  }
  for (i=1; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tag[i]);CHKERRQ(ierr);
  }

  /* Perform complete initial rendezvous */
  ierr = PetscCommBuildTwoSided(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);

  ierr = PetscMalloc1(nto*ntags,&sendreq);CHKERRQ(ierr);
  ierr = PetscMalloc1(*nfrom*ntags,&recvreq);CHKERRQ(ierr);

  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) sendreq[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tag,i,toranks[i],((char*)todata)+count*unitbytes*i,sendreq+i*ntags,ctx);CHKERRQ(ierr);
  }
  for (i=0; i<*nfrom; i++) {
    void *header = (*(char**)fromdata) + count*unitbytes*i;
    PetscMPIInt k;
    for (k=0; k<ntags; k++) recvreq[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*recv)(comm,tag,(*fromranks)[i],header,recvreq+i*ntags,ctx);CHKERRQ(ierr);
  }
  ierr = PetscFree(tag);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  *toreqs = sendreq;
  *fromreqs = recvreq;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)

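/* Same NBX loop as PetscCommBuildTwoSided_Ibarrier above, except that the user's payload sends
 * are posted right after the rendezvous sends, and the recv callback is invoked as each
 * rendezvous message arrives, so payload receives are posted without waiting for global
 * completion. */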
static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,*tags,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,*usendreqs,*req,barrier;
  PetscSegBuffer segrank,segdata,segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = PetscMalloc1(ntags,&tags);CHKERRQ(ierr);
  for (i=0; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tags[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  ierr = PetscMalloc1(nto*ntags,&usendreqs);CHKERRQ(ierr);
  /* Post synchronous sends */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx);CHKERRQ(ierr);
  }

  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank,k;
      void        *buf;
      ierr = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      ierr = PetscSegBufferGet(segreq,ntags,&req);CHKERRQ(ierr);
      for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
      ierr = (*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        barrier_started = PETSC_TRUE;
      }
    } else {
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  *toreqs = usendreqs;
  ierr = PetscSegBufferExtractAlloc(segreq,fromreqs);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segreq);CHKERRQ(ierr);
  ierr = PetscFree(sendreqs);CHKERRQ(ierr);
  ierr = PetscFree(tags);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif

/*@C
   PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous

   Collective on MPI_Comm

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Arguments:
+  nfrom - number of ranks from which messages are received
.  fromranks - ranks from which messages are received (length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   The MPI_Ibarrier implementation uses the algorithm in reference 1.

   References:
.  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedFReq(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,
                                       PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                       PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  MPI_Request    *toreqs,*fromreqs;

  PetscFunctionBegin;
  ierr = PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,&toreqs,&fromreqs,send,recv,ctx);CHKERRQ(ierr);
  ierr = MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = MPI_Waitall(*nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = PetscFree(toreqs);CHKERRQ(ierr);
  ierr = PetscFree(fromreqs);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
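
/* Example callback pair (a hypothetical sketch, not part of the library; MyCtx, MySend, and
 * MyRecv are illustrative names): the rendezvous message carries a payload length, and each
 * callback posts one nonblocking transfer on the first shared tag (so ntags == 1).  Receive
 * buffers are drawn from a PetscSegBuffer created with unit size sizeof(PetscScalar) and owned
 * by the context; already-returned segment pointers remain valid across later Get calls.
 *
 *   typedef struct {
 *     PetscScalar    **sendbufs;  // sendbufs[i] holds todata[i] entries for toranks[i]
 *     PetscSegBuffer recvseg;     // accumulates incoming payloads
 *   } MyCtx;
 *
 *   static PetscErrorCode MySend(MPI_Comm comm,const PetscMPIInt tags[],PetscMPIInt i,PetscMPIInt rank,void *header,MPI_Request req[],void *vctx)
 *   {
 *     MyCtx          *ctx = (MyCtx*)vctx;
 *     PetscMPIInt    len  = *(PetscMPIInt*)header;  // length announced during the rendezvous
 *     PetscErrorCode ierr;
 *
 *     PetscFunctionBegin;
 *     ierr = MPI_Isend(ctx->sendbufs[i],len,MPIU_SCALAR,rank,tags[0],comm,&req[0]);CHKERRQ(ierr);
 *     PetscFunctionReturn(0);
 *   }
 *
 *   static PetscErrorCode MyRecv(MPI_Comm comm,const PetscMPIInt tags[],PetscMPIInt rank,void *header,MPI_Request req[],void *vctx)
 *   {
 *     MyCtx          *ctx = (MyCtx*)vctx;
 *     PetscMPIInt    len  = *(PetscMPIInt*)header;
 *     PetscScalar    *buf;
 *     PetscErrorCode ierr;
 *
 *     PetscFunctionBegin;
 *     ierr = PetscSegBufferGet(ctx->recvseg,len,&buf);CHKERRQ(ierr);
 *     ierr = MPI_Irecv(buf,len,MPIU_SCALAR,rank,tags[0],comm,&req[0]);CHKERRQ(ierr);
 *     PetscFunctionReturn(0);
 *   }
 */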

/*@C
   PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests

   Collective on MPI_Comm

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Arguments:
+  nfrom - number of ranks from which messages are received
.  fromranks - ranks from which messages are received (length nfrom; caller should PetscFree())
.  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
.  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
-  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   The MPI_Ibarrier implementation uses the algorithm in reference 1.

   References:
.  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedF(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode         ierr,(*f)(MPI_Comm,PetscMPIInt,MPI_Datatype,PetscMPIInt,const PetscMPIInt[],const void*,
                                   PetscMPIInt*,PetscMPIInt**,void*,PetscMPIInt,MPI_Request**,MPI_Request**,
                                   PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                   PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx);
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
  PetscMPIInt i,size;

  PetscFunctionBegin;
  ierr = PetscSysInitializePackage();CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    if (toranks[i] < 0 || size <= toranks[i]) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"toranks[%d] %d not in comm size %d",i,toranks[i],size);
  }
  ierr = PetscLogEventBegin(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
    f = PetscCommBuildTwoSidedFReq_Ibarrier;
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
    break;
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
  case PETSC_BUILDTWOSIDED_REDSCATTER:
    f = PetscCommBuildTwoSidedFReq_Reference;
    break;
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  ierr = (*f)(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,toreqs,fromreqs,send,recv,ctx);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
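
/* Example usage (a hypothetical sketch continuing the MySend/MyRecv sketch above; comm, nto,
 * toranks, todata, and ctx are assumed to be set up by the caller): obtain the payload requests
 * and overlap other work before waiting.  With ntags == 1 there is one request per rank.
 *
 *   PetscMPIInt    nfrom,*fromranks,*fromdata;
 *   MPI_Request    *toreqs,*fromreqs;
 *   PetscErrorCode ierr;
 *
 *   ierr = PetscCommBuildTwoSidedFReq(comm,1,MPI_INT,nto,toranks,todata,&nfrom,&fromranks,&fromdata,1,&toreqs,&fromreqs,MySend,MyRecv,&ctx);CHKERRQ(ierr);
 *   // ... unrelated computation can overlap the payload transfers here ...
 *   ierr = MPI_Waitall(nto,toreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
 *   ierr = MPI_Waitall(nfrom,fromreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
 *   ierr = PetscFree(toreqs);CHKERRQ(ierr);
 *   ierr = PetscFree(fromreqs);CHKERRQ(ierr);
 *   ierr = PetscFree(fromranks);CHKERRQ(ierr);
 *   ierr = PetscFree(fromdata);CHKERRQ(ierr);
 */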
574