xref: /petsc/src/sys/utils/mpits.c (revision 0e03b746557e2551025fde0294144c0532d12f68)
1 #include <petscsys.h>        /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
/* Logging events for the two-sided setup routines below; registered by PetscSysInitializePackage() */
PetscLogEvent PETSC_BuildTwoSided;
PetscLogEvent PETSC_BuildTwoSidedF;

/* String table for PetscBuildTwoSidedType in the layout PetscOptionsGetEnum() expects:
 * one string per enum value, then the enum type name, the value-name prefix, and a NULL terminator */
const char *const PetscBuildTwoSidedTypes[] = {
  "ALLREDUCE",
  "IBARRIER",
  "REDSCATTER",
  "PetscBuildTwoSidedType",
  "PETSC_BUILDTWOSIDED_",
  NULL
};

/* Process-global algorithm selection; lazily initialized by PetscCommBuildTwoSidedGetType() */
static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;
17 
/*@
   PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

   Logically Collective

   Input Arguments:
+  comm - communicator on which the collectivity of the value is checked (the setting itself is currently global)
-  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()

   Level: developer

   Note:
   This option is currently global, but could be made per-communicator.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedGetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm,PetscBuildTwoSidedType twosided)
{
  PetscFunctionBegin;
#if defined(PETSC_USE_DEBUG)
  {                             /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
    PetscMPIInt ierr;
    PetscMPIInt b1[2],b2[2];
    /* Reduce (-v,v) with MPI_MAX: the result is (-min(v),max(v)), so the pair agrees
     * only when every rank passed the same enum value */
    b1[0] = -(PetscMPIInt)twosided;
    b1[1] = (PetscMPIInt)twosided;
    ierr  = MPIU_Allreduce(b1,b2,2,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (-b2[0] != b2[1]) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"Enum value must be same on all processes");
  }
#endif
  _twosided_type = twosided;
  PetscFunctionReturn(0);
}
50 
/*@
   PetscCommBuildTwoSidedGetType - get algorithm to use when building two-sided communication

   Logically Collective

   Input Arguments:
.  comm - communicator on which to query the algorithm

   Output Arguments:
.  twosided - algorithm to use for PetscCommBuildTwoSided()

   Level: developer

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedSetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm,PetscBuildTwoSidedType *twosided)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
    /* Lazily pick a default based on communicator size, then let the user override via options */
    ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE; /* default for small comms, see https://gitlab.com/petsc/petsc/-/merge_requests/2611 */
#if defined(PETSC_HAVE_MPI_IBARRIER)
    if (size > 1024) _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER; /* memory-scalable choice for large comms */
#endif
    ierr = PetscOptionsGetEnum(NULL,NULL,"-build_twosided",PetscBuildTwoSidedTypes,(PetscEnum*)&_twosided_type,NULL);CHKERRQ(ierr);
  }
  *twosided = _twosided_type;
  PetscFunctionReturn(0);
}
82 
83 #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
84 
/* Nonblocking-consensus (NBX) rendezvous, per Hoefler/Siebert/Lumsdaine 2010: each rank posts
 * synchronous sends (MPI_Issend), and once its own sends have all been matched it enters a
 * nonblocking barrier while continuing to probe for and receive incoming messages.  When the
 * barrier completes, no unmatched sends remain anywhere, so the receive set is complete. */
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  /* Use PETSc's inner communicator and a fresh tag so our traffic cannot collide with user messages */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr  = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  /* Synchronous sends: completion implies the receiver has matched the message */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  /* Receive-side storage grows on demand since the number of incoming messages is unknown */
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      ierr      = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr      = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
        /* All our sends have been matched; join the consensus barrier */
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        barrier_started = PETSC_TRUE;
        ierr = PetscFree(sendreqs);CHKERRQ(ierr);
      }
    } else {
      /* Barrier completion means every rank's sends are matched, i.e. nothing more will arrive */
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  /* Hand ownership of the collected ranks/data to the caller as contiguous arrays */
  ierr   = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr   = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  ierr   = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
150 #endif
151 
/* Rendezvous via a size-length indicator vector: each rank marks its destinations with 1, a
 * summing allreduce makes entry r the number of messages destined for rank r, and each rank
 * reads its own entry to learn how many receives to post.  O(size) memory per rank; the
 * indicator buffer is cached on the inner communicator's counter to avoid repeated allocation. */
static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode   ierr;
  PetscMPIInt      size,rank,*iflags,nrecvs,tag,*franks,i,flg;
  MPI_Aint         lb,unitbytes;
  char             *tdata,*fdata;
  MPI_Request      *reqs,*sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  /* Switch to PETSc's inner communicator so our traffic cannot collide with user messages */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Comm_get_attr(comm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Inner PETSc communicator does not have its tag/name counter attribute set");
  /* Reuse (or create) the cached indicator array attached to the communicator */
  if (!counter->iflags) {
    ierr   = PetscCalloc1(size,&counter->iflags);CHKERRQ(ierr);
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    ierr   = PetscArrayzero(iflags,size);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr     = MPIU_Allreduce(MPI_IN_PLACE,iflags,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nrecvs   = iflags[rank];     /* number of ranks that will send to us */
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;    /* receives occupy reqs[0..nrecvs), sends follow */
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  /* The first nrecvs statuses correspond to the receives; record the source ranks */
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  /* Ownership of franks and fdata transfers to the caller */
  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
201 
202 #if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
/* Like the Allreduce variant, but MPI_Reduce_scatter_block sums the indicator vector and
 * delivers each rank only its own entry (the number of incoming messages) directly,
 * avoiding the full size-length result on every rank. */
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i,flg;
  MPI_Aint       lb,unitbytes;
  char           *tdata,*fdata;
  MPI_Request    *reqs,*sendreqs;
  MPI_Status     *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* Switch to PETSc's inner communicator so our traffic cannot collide with user messages */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Comm_get_attr(comm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Inner PETSc communicator does not have its tag/name counter attribute set");
  /* Reuse (or create) the cached indicator array attached to the communicator */
  if (!counter->iflags) {
    ierr   = PetscCalloc1(size,&counter->iflags);CHKERRQ(ierr);
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    ierr   = PetscArrayzero(iflags,size);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  /* Each rank receives only the sum for its own slot: the count of messages addressed to it */
  ierr     = MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;    /* receives occupy reqs[0..nrecvs), sends follow */
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  /* The first nrecvs statuses correspond to the receives; record the source ranks */
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  /* Ownership of franks and fdata transfers to the caller */
  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
250 #endif
251 
252 /*@C
253    PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)
254 
255    Collective
256 
257    Input Arguments:
258 +  comm - communicator
259 .  count - number of entries to send/receive (must match on all ranks)
260 .  dtype - datatype to send/receive from each rank (must match on all ranks)
261 .  nto - number of ranks to send data to
262 .  toranks - ranks to send to (array of length nto)
263 -  todata - data to send to each rank (packed)
264 
265    Output Arguments:
266 +  nfrom - number of ranks receiving messages from
267 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
268 -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
269 
270    Level: developer
271 
272    Options Database Keys:
273 .  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication. Default is allreduce for communicators with <= 1024 ranks, otherwise ibarrier.
274 
275    Notes:
276    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
277    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.
278 
279    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
280 
281    References:
282 .  1. - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
283    Scalable communication protocols for dynamic sparse data exchange, 2010.
284 
285 .seealso: PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
286 @*/
287 PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
288 {
289   PetscErrorCode         ierr;
290   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
291 
292   PetscFunctionBegin;
293   ierr = PetscSysInitializePackage();CHKERRQ(ierr);
294   ierr = PetscLogEventSync(PETSC_BuildTwoSided,comm);CHKERRQ(ierr);
295   ierr = PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
296   ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
297   switch (buildtype) {
298   case PETSC_BUILDTWOSIDED_IBARRIER:
299 #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
300     ierr = PetscCommBuildTwoSided_Ibarrier(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
301 #else
302     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
303 #endif
304     break;
305   case PETSC_BUILDTWOSIDED_ALLREDUCE:
306     ierr = PetscCommBuildTwoSided_Allreduce(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
307     break;
308   case PETSC_BUILDTWOSIDED_REDSCATTER:
309 #if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
310     ierr = PetscCommBuildTwoSided_RedScatter(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
311 #else
312     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
313 #endif
314     break;
315   default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
316   }
317   ierr = PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
318   PetscFunctionReturn(0);
319 }
320 
/* Reference implementation: performs a complete rendezvous with PetscCommBuildTwoSided(), then
 * invokes the user's send/recv callbacks to post the (typically larger) payload messages, returning
 * the callbacks' requests to the caller.
 * NOTE(review): when ntags == 0 the communicator is never PetscCommDuplicate()d here, yet
 * PetscCommDestroy() is still called at the end -- verify callers always pass ntags > 0. */
static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt i,*tag;
  MPI_Aint    lb,unitbytes;
  MPI_Request *sendreq,*recvreq;

  PetscFunctionBegin;
  /* Reserve ntags distinct tags on the inner communicator for the callbacks' messages */
  ierr = PetscMalloc1(ntags,&tag);CHKERRQ(ierr);
  if (ntags > 0) {
    ierr = PetscCommDuplicate(comm,&comm,&tag[0]);CHKERRQ(ierr);
  }
  for (i=1; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tag[i]);CHKERRQ(ierr);
  }

  /* Perform complete initial rendezvous */
  ierr = PetscCommBuildTwoSided(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);

  ierr = PetscMalloc1(nto*ntags,&sendreq);CHKERRQ(ierr);
  ierr = PetscMalloc1(*nfrom*ntags,&recvreq);CHKERRQ(ierr);

  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  /* Post payload sends via the user callback; requests default to MPI_REQUEST_NULL so the
   * callback may post fewer than ntags messages */
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) sendreq[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tag,i,toranks[i],((char*)todata)+count*unitbytes*i,sendreq+i*ntags,ctx);CHKERRQ(ierr);
  }
  /* Post payload receives via the user callback, one per discovered source rank */
  for (i=0; i<*nfrom; i++) {
    void *header = (*(char**)fromdata) + count*unitbytes*i;
    PetscMPIInt k;
    for (k=0; k<ntags; k++) recvreq[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*recv)(comm,tag,(*fromranks)[i],header,recvreq+i*ntags,ctx);CHKERRQ(ierr);
  }
  ierr = PetscFree(tag);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  /* Caller must wait on and then free both request arrays */
  *toreqs = sendreq;
  *fromreqs = recvreq;
  PetscFunctionReturn(0);
}
365 
366 #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
367 
368 static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
369                                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
370                                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
371                                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
372 {
373   PetscErrorCode ierr;
374   PetscMPIInt    nrecvs,tag,*tags,done,i;
375   MPI_Aint       lb,unitbytes;
376   char           *tdata;
377   MPI_Request    *sendreqs,*usendreqs,*req,barrier;
378   PetscSegBuffer segrank,segdata,segreq;
379   PetscBool      barrier_started;
380 
381   PetscFunctionBegin;
382   ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
383   ierr = PetscMalloc1(ntags,&tags);CHKERRQ(ierr);
384   for (i=0; i<ntags; i++) {
385     ierr = PetscCommGetNewTag(comm,&tags[i]);CHKERRQ(ierr);
386   }
387   ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
388   if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
389   tdata = (char*)todata;
390   ierr = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
391   ierr = PetscMalloc1(nto*ntags,&usendreqs);CHKERRQ(ierr);
392   /* Post synchronous sends */
393   for (i=0; i<nto; i++) {
394     ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
395   }
396   /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
397    * synchronous messages above. */
398   for (i=0; i<nto; i++) {
399     PetscMPIInt k;
400     for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
401     ierr = (*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx);CHKERRQ(ierr);
402   }
403 
404   ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
405   ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);
406   ierr = PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq);CHKERRQ(ierr);
407 
408   nrecvs  = 0;
409   barrier = MPI_REQUEST_NULL;
410   /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
411    * but we need to work around it. */
412   barrier_started = PETSC_FALSE;
413   for (done=0; !done; ) {
414     PetscMPIInt flag;
415     MPI_Status  status;
416     ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
417     if (flag) {                 /* incoming message */
418       PetscMPIInt *recvrank,k;
419       void        *buf;
420       ierr = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
421       ierr = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
422       *recvrank = status.MPI_SOURCE;
423       ierr = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
424       ierr = PetscSegBufferGet(segreq,ntags,&req);CHKERRQ(ierr);
425       for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
426       ierr = (*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx);CHKERRQ(ierr);
427       nrecvs++;
428     }
429     if (!barrier_started) {
430       PetscMPIInt sent,nsends;
431       ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
432       ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
433       if (sent) {
434 #if defined(PETSC_HAVE_MPI_IBARRIER)
435         ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
436 #elif defined(PETSC_HAVE_MPIX_IBARRIER)
437         ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
438 #endif
439         barrier_started = PETSC_TRUE;
440       }
441     } else {
442       ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
443     }
444   }
445   *nfrom = nrecvs;
446   ierr = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
447   ierr = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
448   ierr = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
449   ierr = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
450   *toreqs = usendreqs;
451   ierr = PetscSegBufferExtractAlloc(segreq,fromreqs);CHKERRQ(ierr);
452   ierr = PetscSegBufferDestroy(&segreq);CHKERRQ(ierr);
453   ierr = PetscFree(sendreqs);CHKERRQ(ierr);
454   ierr = PetscFree(tags);CHKERRQ(ierr);
455   ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
456   PetscFunctionReturn(0);
457 }
458 #endif
459 
460 /*@C
461    PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous
462 
463    Collective
464 
465    Input Arguments:
466 +  comm - communicator
467 .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
468 .  dtype - datatype to send/receive from each rank (must match on all ranks)
469 .  nto - number of ranks to send data to
470 .  toranks - ranks to send to (array of length nto)
471 .  todata - data to send to each rank (packed)
472 .  ntags - number of tags needed by send/recv callbacks
473 .  send - callback invoked on sending process when ready to send primary payload
474 .  recv - callback invoked on receiving process after delivery of rendezvous message
475 -  ctx - context for callbacks
476 
477    Output Arguments:
478 +  nfrom - number of ranks receiving messages from
479 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
480 -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
481 
482    Level: developer
483 
484    Notes:
485    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
486    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.
487 
488    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
489 
490    References:
491 .  1. - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
492    Scalable communication protocols for dynamic sparse data exchange, 2010.
493 
494 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedFReq(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
495 @*/
496 PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,
497                                        PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
498                                        PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
499 {
500   PetscErrorCode ierr;
501   MPI_Request    *toreqs,*fromreqs;
502 
503   PetscFunctionBegin;
504   ierr = PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,&toreqs,&fromreqs,send,recv,ctx);CHKERRQ(ierr);
505   ierr = MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
506   ierr = MPI_Waitall(*nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
507   ierr = PetscFree(toreqs);CHKERRQ(ierr);
508   ierr = PetscFree(fromreqs);CHKERRQ(ierr);
509   PetscFunctionReturn(0);
510 }
511 
512 /*@C
513    PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests
514 
515    Collective
516 
517    Input Arguments:
518 +  comm - communicator
519 .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
520 .  dtype - datatype to send/receive from each rank (must match on all ranks)
521 .  nto - number of ranks to send data to
522 .  toranks - ranks to send to (array of length nto)
523 .  todata - data to send to each rank (packed)
524 .  ntags - number of tags needed by send/recv callbacks
525 .  send - callback invoked on sending process when ready to send primary payload
526 .  recv - callback invoked on receiving process after delivery of rendezvous message
527 -  ctx - context for callbacks
528 
529    Output Arguments:
530 +  nfrom - number of ranks receiving messages from
531 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
532 .  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
533 .  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
534 -  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())
535 
536    Level: developer
537 
538    Notes:
539    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
540    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.
541 
542    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
543 
544    References:
545 .  1. - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
546    Scalable communication protocols for dynamic sparse data exchange, 2010.
547 
548 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedF(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
549 @*/
550 PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
551                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
552                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
553                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
554 {
555   PetscErrorCode         ierr,(*f)(MPI_Comm,PetscMPIInt,MPI_Datatype,PetscMPIInt,const PetscMPIInt[],const void*,
556                                    PetscMPIInt*,PetscMPIInt**,void*,PetscMPIInt,MPI_Request**,MPI_Request**,
557                                    PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
558                                    PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx);
559   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
560   PetscMPIInt i,size;
561 
562   PetscFunctionBegin;
563   ierr = PetscSysInitializePackage();CHKERRQ(ierr);
564   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
565   for (i=0; i<nto; i++) {
566     if (toranks[i] < 0 || size <= toranks[i]) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"toranks[%d] %d not in comm size %d",i,toranks[i],size);
567   }
568   ierr = PetscLogEventSync(PETSC_BuildTwoSidedF,comm);CHKERRQ(ierr);
569   ierr = PetscLogEventBegin(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
570   ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
571   switch (buildtype) {
572   case PETSC_BUILDTWOSIDED_IBARRIER:
573 #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
574     f = PetscCommBuildTwoSidedFReq_Ibarrier;
575 #else
576     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
577 #endif
578     break;
579   case PETSC_BUILDTWOSIDED_ALLREDUCE:
580   case PETSC_BUILDTWOSIDED_REDSCATTER:
581     f = PetscCommBuildTwoSidedFReq_Reference;
582     break;
583   default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
584   }
585   ierr = (*f)(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,toreqs,fromreqs,send,recv,ctx);CHKERRQ(ierr);
586   ierr = PetscLogEventEnd(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
587   PetscFunctionReturn(0);
588 }
589