xref: /petsc/src/sys/utils/mpits.c (revision 030f984af8d8bb4c203755d35bded3c05b3d83ce)
1 #include <petscsys.h>        /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
/* Logging events for the two rendezvous entry points; registered during PetscSysInitializePackage() */
PetscLogEvent PETSC_BuildTwoSided;
PetscLogEvent PETSC_BuildTwoSidedF;

/* String table for PetscBuildTwoSidedType, in enum order, followed by the enum type name and
 * value prefix as required by PetscOptionsGetEnum()/PetscEnum parsing, then a NULL terminator. */
const char *const PetscBuildTwoSidedTypes[] = {
  "ALLREDUCE",
  "IBARRIER",
  "REDSCATTER",
  "PetscBuildTwoSidedType",
  "PETSC_BUILDTWOSIDED_",
  NULL
};

/* Process-global algorithm selection (see note in PetscCommBuildTwoSidedSetType(): could be made
 * per-communicator).  NOTSET means "decide lazily in PetscCommBuildTwoSidedGetType()". */
static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;
17 
18 /*@
19    PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication
20 
21    Logically Collective
22 
23    Input Arguments:
24 +  comm - PETSC_COMM_WORLD
25 -  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()
26 
27    Level: developer
28 
29    Note:
30    This option is currently global, but could be made per-communicator.
31 
32 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedGetType()
33 @*/
34 PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm,PetscBuildTwoSidedType twosided)
35 {
36   PetscFunctionBegin;
37   if (PetscDefined(USE_DEBUG)) {                             /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
38     PetscMPIInt ierr;
39     PetscMPIInt b1[2],b2[2];
40     b1[0] = -(PetscMPIInt)twosided;
41     b1[1] = (PetscMPIInt)twosided;
42     ierr  = MPIU_Allreduce(b1,b2,2,MPI_INT,MPI_MAX,comm);CHKERRMPI(ierr);
43     if (-b2[0] != b2[1]) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"Enum value must be same on all processes");
44   }
45   _twosided_type = twosided;
46   PetscFunctionReturn(0);
47 }
48 
49 /*@
50    PetscCommBuildTwoSidedGetType - set algorithm to use when building two-sided communication
51 
52    Logically Collective
53 
54    Output Arguments:
55 +  comm - communicator on which to query algorithm
56 -  twosided - algorithm to use for PetscCommBuildTwoSided()
57 
58    Level: developer
59 
60 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedSetType()
61 @*/
62 PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm,PetscBuildTwoSidedType *twosided)
63 {
64   PetscErrorCode ierr;
65   PetscMPIInt    size;
66 
67   PetscFunctionBegin;
68   *twosided = PETSC_BUILDTWOSIDED_NOTSET;
69   if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
70     ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
71     _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE; /* default for small comms, see https://gitlab.com/petsc/petsc/-/merge_requests/2611 */
72 #if defined(PETSC_HAVE_MPI_IBARRIER)
73     if (size > 1024) _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
74 #endif
75     ierr = PetscOptionsGetEnum(NULL,NULL,"-build_twosided",PetscBuildTwoSidedTypes,(PetscEnum*)&_twosided_type,NULL);CHKERRQ(ierr);
76   }
77   *twosided = _twosided_type;
78   PetscFunctionReturn(0);
79 }
80 
#if defined(PETSC_HAVE_MPI_IBARRIER)
/* Memory-scalable rendezvous via the NBX (nonblocking consensus) algorithm (Hoefler, Siebert &
 * Lumsdaine 2010, cited in the public docs below): post synchronous sends, then probe/receive
 * until an Ibarrier -- entered only after all local sends complete -- has itself completed. */
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  /* comm is rebound to PETSc's inner duplicate here; released by PetscCommDestroy() at the end */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRMPI(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr  = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  /* Synchronous sends: completion of an Issend implies the matching receive has started */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRMPI(ierr);
  }
  /* The number of incoming messages is unknown, so collect ranks/payloads in growable buffers */
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRMPI(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      ierr      = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      /* Receive exactly the probed message by matching its source and tag */
      ierr      = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRMPI(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
      if (sent) {
        /* All local sends have been matched: join the consensus barrier */
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRMPI(ierr);
        barrier_started = PETSC_TRUE;
        ierr = PetscFree(sendreqs);CHKERRQ(ierr);
      }
    } else {
      /* Barrier completion means every rank's sends have been matched, so no message remains */
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRMPI(ierr);
    }
  }
  *nfrom = nrecvs;
  /* Hand ownership of the collected rank list and payload data to the caller (PetscFree'd there) */
  ierr   = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr   = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  ierr   = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
144 
/* Rendezvous by summing per-destination flags with an allreduce: after the in-place MPI_SUM,
 * iflags[rank] holds the number of messages this rank will receive.  Requires O(size) memory. */
static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode   ierr;
  PetscMPIInt      size,rank,*iflags,nrecvs,tag,*franks,i,flg;
  MPI_Aint         lb,unitbytes;
  char             *tdata,*fdata;
  MPI_Request      *reqs,*sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  /* comm is rebound to PETSc's inner duplicate here; released by PetscCommDestroy() below */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Comm_get_attr(comm,Petsc_Counter_keyval,&counter,&flg);CHKERRMPI(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Inner PETSc communicator does not have its tag/name counter attribute set");
  /* Reuse (lazily creating) a size-long scratch array cached on the inner communicator's counter */
  if (!counter->iflags) {
    ierr   = PetscCalloc1(size,&counter->iflags);CHKERRQ(ierr);
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    ierr   = PetscArrayzero(iflags,size);CHKERRQ(ierr);
  }
  /* Mark each destination, then sum across ranks to learn how many messages target each rank */
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr     = MPIU_Allreduce(MPI_IN_PLACE,iflags,size,MPI_INT,MPI_SUM,comm);CHKERRMPI(ierr);
  nrecvs   = iflags[rank];
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRMPI(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  /* Receive requests come first in reqs, send requests follow */
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRMPI(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRMPI(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRMPI(ierr);
  /* The first nrecvs statuses belong to the receives; their MPI_SOURCE gives the sender ranks */
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  /* Caller owns franks and fdata (documented to PetscFree() them) */
  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
194 
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
/* Like the Allreduce variant, but MPI_Reduce_scatter_block delivers each rank only its own
 * receive count instead of replicating the whole size-long flag vector everywhere. */
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i,flg;
  MPI_Aint       lb,unitbytes;
  char           *tdata,*fdata;
  MPI_Request    *reqs,*sendreqs;
  MPI_Status     *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  /* comm is rebound to PETSc's inner duplicate here; released by PetscCommDestroy() below */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Comm_get_attr(comm,Petsc_Counter_keyval,&counter,&flg);CHKERRMPI(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Inner PETSc communicator does not have its tag/name counter attribute set");
  /* Reuse (lazily creating) a size-long scratch array cached on the inner communicator's counter */
  if (!counter->iflags) {
    ierr   = PetscCalloc1(size,&counter->iflags);CHKERRQ(ierr);
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    ierr   = PetscArrayzero(iflags,size);CHKERRQ(ierr);
  }
  /* Mark each destination; the reduce-scatter sums column i and delivers it to rank i */
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr     = MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,comm);CHKERRMPI(ierr);
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRMPI(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  /* Receive requests come first in reqs, send requests follow */
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRMPI(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRMPI(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRMPI(ierr);
  /* The first nrecvs statuses belong to the receives; their MPI_SOURCE gives the sender ranks */
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  /* Caller owns franks and fdata (documented to PetscFree() them) */
  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
#endif
244 
245 /*@C
246    PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)
247 
248    Collective
249 
250    Input Arguments:
251 +  comm - communicator
252 .  count - number of entries to send/receive (must match on all ranks)
253 .  dtype - datatype to send/receive from each rank (must match on all ranks)
254 .  nto - number of ranks to send data to
255 .  toranks - ranks to send to (array of length nto)
256 -  todata - data to send to each rank (packed)
257 
258    Output Arguments:
259 +  nfrom - number of ranks receiving messages from
260 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
261 -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
262 
263    Level: developer
264 
265    Options Database Keys:
266 .  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication. Default is allreduce for communicators with <= 1024 ranks, otherwise ibarrier.
267 
268    Notes:
269    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
270    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.
271 
272    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
273 
274    References:
275 .  1. - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
276    Scalable communication protocols for dynamic sparse data exchange, 2010.
277 
278 .seealso: PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
279 @*/
280 PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
281 {
282   PetscErrorCode         ierr;
283   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
284 
285   PetscFunctionBegin;
286   ierr = PetscSysInitializePackage();CHKERRQ(ierr);
287   ierr = PetscLogEventSync(PETSC_BuildTwoSided,comm);CHKERRQ(ierr);
288   ierr = PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
289   ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
290   switch (buildtype) {
291   case PETSC_BUILDTWOSIDED_IBARRIER:
292 #if defined(PETSC_HAVE_MPI_IBARRIER)
293     ierr = PetscCommBuildTwoSided_Ibarrier(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
294     break;
295 #else
296     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
297 #endif
298   case PETSC_BUILDTWOSIDED_ALLREDUCE:
299     ierr = PetscCommBuildTwoSided_Allreduce(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
300     break;
301   case PETSC_BUILDTWOSIDED_REDSCATTER:
302 #if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
303     ierr = PetscCommBuildTwoSided_RedScatter(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
304     break;
305 #else
306     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
307 #endif
308   default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
309   }
310   ierr = PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
311   PetscFunctionReturn(0);
312 }
313 
/* Reference implementation for the callback variant: do a full PetscCommBuildTwoSided()
 * rendezvous first, then invoke the user's send/recv callbacks with ntags tags per pair,
 * returning the callbacks' requests for the caller to complete. */
static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt i,*tag;
  MPI_Aint    lb,unitbytes;
  MPI_Request *sendreq,*recvreq;

  PetscFunctionBegin;
  ierr = PetscMalloc1(ntags,&tag);CHKERRQ(ierr);
  /* Duplicating the comm yields the first tag; further tags come from the inner comm.
   * NOTE(review): when ntags == 0 the duplicate is skipped but PetscCommDestroy() below is
   * still called on the caller's comm -- confirm that path is intended/unreachable. */
  if (ntags > 0) {
    ierr = PetscCommDuplicate(comm,&comm,&tag[0]);CHKERRQ(ierr);
  }
  for (i=1; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tag[i]);CHKERRQ(ierr);
  }

  /* Perform complete initial rendezvous */
  ierr = PetscCommBuildTwoSided(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);

  /* ntags requests per destination/source pair, owned by the caller on return */
  ierr = PetscMalloc1(nto*ntags,&sendreq);CHKERRQ(ierr);
  ierr = PetscMalloc1(*nfrom*ntags,&recvreq);CHKERRQ(ierr);

  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRMPI(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    /* Pre-null the requests so callbacks may fill in fewer than ntags */
    for (k=0; k<ntags; k++) sendreq[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tag,i,toranks[i],((char*)todata)+count*unitbytes*i,sendreq+i*ntags,ctx);CHKERRQ(ierr);
  }
  for (i=0; i<*nfrom; i++) {
    /* header points at the i-th rendezvous payload received from (*fromranks)[i] */
    void *header = (*(char**)fromdata) + count*unitbytes*i;
    PetscMPIInt k;
    for (k=0; k<ntags; k++) recvreq[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*recv)(comm,tag,(*fromranks)[i],header,recvreq+i*ntags,ctx);CHKERRQ(ierr);
  }
  ierr = PetscFree(tag);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  *toreqs = sendreq;
  *fromreqs = recvreq;
  PetscFunctionReturn(0);
}
358 
#if defined(PETSC_HAVE_MPI_IBARRIER)

/* NBX-based implementation of the callback variant: same consensus loop as
 * PetscCommBuildTwoSided_Ibarrier(), but user send callbacks are posted up front and the recv
 * callback fires as each rendezvous message arrives, so payload transfers overlap discovery. */
static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,*tags,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,*usendreqs,*req,barrier;
  PetscSegBuffer segrank,segdata,segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  /* comm is rebound to PETSc's inner duplicate; tag is reserved for the rendezvous messages,
   * tags[] for the user callbacks' payload messages */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = PetscMalloc1(ntags,&tags);CHKERRQ(ierr);
  for (i=0; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tags[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRMPI(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  ierr = PetscMalloc1(nto*ntags,&usendreqs);CHKERRQ(ierr);
  /* Post synchronous sends */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRMPI(ierr);
  }
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    /* Pre-null the requests so callbacks may fill in fewer than ntags */
    for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx);CHKERRQ(ierr);
  }

  /* Growable buffers: the number of incoming messages (and thus recv requests) is unknown */
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRMPI(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank,k;
      void        *buf;
      ierr = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      /* Receive exactly the probed rendezvous message, then let the user callback post payload receives */
      ierr = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRMPI(ierr);
      ierr = PetscSegBufferGet(segreq,ntags,&req);CHKERRQ(ierr);
      for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
      ierr = (*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
      if (sent) {
        /* All local rendezvous sends have been matched: join the consensus barrier */
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRMPI(ierr);
        barrier_started = PETSC_TRUE;
      }
    } else {
      /* Barrier completion means every rank's rendezvous sends have been matched */
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRMPI(ierr);
    }
  }
  *nfrom = nrecvs;
  /* Hand ownership of rank list, payload data, and callback requests to the caller */
  ierr = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  *toreqs = usendreqs;
  ierr = PetscSegBufferExtractAlloc(segreq,fromreqs);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segreq);CHKERRQ(ierr);
  ierr = PetscFree(sendreqs);CHKERRQ(ierr);
  ierr = PetscFree(tags);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
448 
449 /*@C
450    PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous
451 
452    Collective
453 
454    Input Arguments:
455 +  comm - communicator
456 .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
457 .  dtype - datatype to send/receive from each rank (must match on all ranks)
458 .  nto - number of ranks to send data to
459 .  toranks - ranks to send to (array of length nto)
460 .  todata - data to send to each rank (packed)
461 .  ntags - number of tags needed by send/recv callbacks
462 .  send - callback invoked on sending process when ready to send primary payload
463 .  recv - callback invoked on receiving process after delivery of rendezvous message
464 -  ctx - context for callbacks
465 
466    Output Arguments:
467 +  nfrom - number of ranks receiving messages from
468 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
469 -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
470 
471    Level: developer
472 
473    Notes:
474    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
475    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.
476 
477    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
478 
479    References:
480 .  1. - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
481    Scalable communication protocols for dynamic sparse data exchange, 2010.
482 
483 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedFReq(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
484 @*/
485 PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,
486                                        PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
487                                        PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
488 {
489   PetscErrorCode ierr;
490   MPI_Request    *toreqs,*fromreqs;
491 
492   PetscFunctionBegin;
493   ierr = PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,&toreqs,&fromreqs,send,recv,ctx);CHKERRQ(ierr);
494   ierr = MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
495   ierr = MPI_Waitall(*nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
496   ierr = PetscFree(toreqs);CHKERRQ(ierr);
497   ierr = PetscFree(fromreqs);CHKERRQ(ierr);
498   PetscFunctionReturn(0);
499 }
500 
501 /*@C
502    PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests
503 
504    Collective
505 
506    Input Arguments:
507 +  comm - communicator
508 .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
509 .  dtype - datatype to send/receive from each rank (must match on all ranks)
510 .  nto - number of ranks to send data to
511 .  toranks - ranks to send to (array of length nto)
512 .  todata - data to send to each rank (packed)
513 .  ntags - number of tags needed by send/recv callbacks
514 .  send - callback invoked on sending process when ready to send primary payload
515 .  recv - callback invoked on receiving process after delivery of rendezvous message
516 -  ctx - context for callbacks
517 
518    Output Arguments:
519 +  nfrom - number of ranks receiving messages from
520 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
521 .  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
522 .  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
523 -  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())
524 
525    Level: developer
526 
527    Notes:
528    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
529    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.
530 
531    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
532 
533    References:
534 .  1. - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
535    Scalable communication protocols for dynamic sparse data exchange, 2010.
536 
537 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedF(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
538 @*/
539 PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
540                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
541                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
542                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
543 {
544   PetscErrorCode         ierr,(*f)(MPI_Comm,PetscMPIInt,MPI_Datatype,PetscMPIInt,const PetscMPIInt[],const void*,
545                                    PetscMPIInt*,PetscMPIInt**,void*,PetscMPIInt,MPI_Request**,MPI_Request**,
546                                    PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
547                                    PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx);
548   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
549   PetscMPIInt i,size;
550 
551   PetscFunctionBegin;
552   ierr = PetscSysInitializePackage();CHKERRQ(ierr);
553   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
554   for (i=0; i<nto; i++) {
555     if (toranks[i] < 0 || size <= toranks[i]) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"toranks[%d] %d not in comm size %d",i,toranks[i],size);
556   }
557   ierr = PetscLogEventSync(PETSC_BuildTwoSidedF,comm);CHKERRQ(ierr);
558   ierr = PetscLogEventBegin(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
559   ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
560   switch (buildtype) {
561   case PETSC_BUILDTWOSIDED_IBARRIER:
562 #if defined(PETSC_HAVE_MPI_IBARRIER)
563     f = PetscCommBuildTwoSidedFReq_Ibarrier;
564     break;
565 #else
566     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
567 #endif
568   case PETSC_BUILDTWOSIDED_ALLREDUCE:
569   case PETSC_BUILDTWOSIDED_REDSCATTER:
570     f = PetscCommBuildTwoSidedFReq_Reference;
571     break;
572   default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
573   }
574   ierr = (*f)(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,toreqs,fromreqs,send,recv,ctx);CHKERRQ(ierr);
575   ierr = PetscLogEventEnd(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578