xref: /petsc/src/sys/utils/mpits.c (revision daad07d386296cdcbb87925ef5f1432ee4a24ec4)
1 #include <petscsys.h>        /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
/* Logging events registered by PetscSysInitializePackage() */
PetscLogEvent PETSC_BuildTwoSided;
PetscLogEvent PETSC_BuildTwoSidedF;

/* String names for PetscBuildTwoSidedType, consumed by PetscOptionsGetEnum().
 * Per the PETSc enum-string convention the trailing entries are the enum type
 * name and the common prefix of the enum values, followed by a NULL sentinel. */
const char *const PetscBuildTwoSidedTypes[] = {
  "ALLREDUCE",
  "IBARRIER",
  "REDSCATTER",
  "PetscBuildTwoSidedType",
  "PETSC_BUILDTWOSIDED_",
  NULL
};

/* Process-global algorithm selection; lazily resolved from the -build_twosided
 * option on first use in PetscCommBuildTwoSidedGetType() */
static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;
17 
/*@
   PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

   Logically Collective

   Input Parameters:
+  comm - communicator used for the collective consistency check; the setting is currently global, so pass PETSC_COMM_WORLD
-  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()

   Level: developer

   Note:
   This option is currently global, but could be made per-communicator.

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedGetType()`
@*/
PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm,PetscBuildTwoSidedType twosided)
{
  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {                             /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
    PetscMPIInt b1[2],b2[2];
    /* Reducing (-x,x) with MPI_MAX yields (-min,max) across ranks; the pair agrees iff every rank passed the same value */
    b1[0] = -(PetscMPIInt)twosided;
    b1[1] = (PetscMPIInt)twosided;
    PetscCall(MPIU_Allreduce(b1,b2,2,MPI_INT,MPI_MAX,comm));
    PetscCheck(-b2[0] == b2[1],comm,PETSC_ERR_ARG_WRONG,"Enum value must be same on all processes");
  }
  _twosided_type = twosided;
  PetscFunctionReturn(0);
}
47 
/*@
   PetscCommBuildTwoSidedGetType - get algorithm to use when building two-sided communication

   Logically Collective

   Input Parameter:
.  comm - communicator on which to query algorithm

   Output Parameter:
.  twosided - algorithm to use for PetscCommBuildTwoSided()

   Level: developer

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedSetType()`
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm,PetscBuildTwoSidedType *twosided)
{
  PetscMPIInt    size;

  PetscFunctionBegin;
  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
    /* First query: pick a size-based default, then let the command line override it */
    PetscCallMPI(MPI_Comm_size(comm,&size));
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE; /* default for small comms, see https://gitlab.com/petsc/petsc/-/merge_requests/2611 */
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    if (size > 1024) _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
#endif
    PetscCall(PetscOptionsGetEnum(NULL,NULL,"-build_twosided",PetscBuildTwoSidedTypes,(PetscEnum*)&_twosided_type,NULL));
  }
  *twosided = _twosided_type;
  PetscFunctionReturn(0);
}
78 
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
/* Memory-scalable rendezvous using the nonblocking-consensus (NBX) algorithm of
 * Hoefler, Siebert, and Lumsdaine (2010): each rank posts synchronous sends to
 * the ranks it must reach, services incoming probes, and starts a nonblocking
 * barrier once its own sends have completed; when the barrier completes on every
 * rank, all messages have been delivered.  Requires no O(comm size) storage. */
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscMPIInt    nrecvs,tag,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  /* Work on PETSc's inner communicator so our traffic cannot collide with user messages */
  PetscCall(PetscCommDuplicate(comm,&comm,&tag));
  PetscCallMPI(MPI_Type_get_extent(dtype,&lb,&unitbytes));
  PetscCheck(lb == 0,comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb);
  tdata = (char*)todata;
  PetscCall(PetscMalloc1(nto,&sendreqs));
  /* Synchronous sends: completion implies the matching receive has started on the destination */
  for (i=0; i<nto; i++) {
    PetscCallMPI(MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i));
  }
  /* Segmented buffers grow incrementally since the number of incoming messages is unknown */
  PetscCall(PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank));
  PetscCall(PetscSegBufferCreate(unitbytes,4*count,&segdata));

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status));
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      PetscCall(PetscSegBufferGet(segrank,1,&recvrank));
      PetscCall(PetscSegBufferGet(segdata,count,&buf));
      *recvrank = status.MPI_SOURCE;
      PetscCallMPI(MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE));
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      PetscCall(PetscMPIIntCast(nto,&nsends));
      PetscCallMPI(MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE));
      if (sent) {
        /* All of our sends were received somewhere; join the consensus barrier */
        PetscCallMPI(MPI_Ibarrier(comm,&barrier));
        barrier_started = PETSC_TRUE;
        PetscCall(PetscFree(sendreqs));
      }
    } else {
      PetscCallMPI(MPI_Test(&barrier,&done,MPI_STATUS_IGNORE));
    }
  }
  *nfrom = nrecvs;
  /* Hand ownership of the collected ranks/data to the caller as flat PetscMalloc'd arrays */
  PetscCall(PetscSegBufferExtractAlloc(segrank,fromranks));
  PetscCall(PetscSegBufferDestroy(&segrank));
  PetscCall(PetscSegBufferExtractAlloc(segdata,fromdata));
  PetscCall(PetscSegBufferDestroy(&segdata));
  PetscCall(PetscCommDestroy(&comm));
  PetscFunctionReturn(0);
}
#endif
141 
/* Rendezvous by dense MPI_Allreduce: each rank contributes a 0/1 indicator
 * vector of which ranks it sends to; after summing, entry [rank] is the number
 * of messages this rank will receive.  Uses O(comm size) memory (the scratch
 * vector is cached on the inner communicator) but is typically fastest on
 * small communicators. */
static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscMPIInt      size,rank,*iflags,nrecvs,tag,*franks,i,flg;
  MPI_Aint         lb,unitbytes;
  char             *tdata,*fdata;
  MPI_Request      *reqs,*sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm,&size));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCall(PetscCommDuplicate(comm,&comm,&tag));
  PetscCallMPI(MPI_Comm_get_attr(comm,Petsc_Counter_keyval,&counter,&flg));
  PetscCheck(flg,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Inner PETSc communicator does not have its tag/name counter attribute set");
  /* Reuse (or lazily create) the size-long scratch array cached on the inner communicator */
  if (!counter->iflags) {
    PetscCall(PetscCalloc1(size,&counter->iflags));
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    PetscCall(PetscArrayzero(iflags,size));
  }
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  PetscCall(MPIU_Allreduce(MPI_IN_PLACE,iflags,size,MPI_INT,MPI_SUM,comm));
  nrecvs   = iflags[rank]; /* after the sum, this is how many ranks send to us */
  PetscCallMPI(MPI_Type_get_extent(dtype,&lb,&unitbytes));
  PetscCheck(lb == 0,comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb);
  PetscCall(PetscMalloc(nrecvs*count*unitbytes,&fdata));
  tdata    = (char*)todata;
  PetscCall(PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses));
  sendreqs = reqs + nrecvs; /* receives occupy reqs[0..nrecvs-1], sends follow */
  for (i=0; i<nrecvs; i++) {
    PetscCallMPI(MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i));
  }
  for (i=0; i<nto; i++) {
    PetscCallMPI(MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i));
  }
  PetscCallMPI(MPI_Waitall(nto+nrecvs,reqs,statuses));
  PetscCall(PetscMalloc1(nrecvs,&franks));
  /* The first nrecvs statuses correspond to the receives; record who sent each message */
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  PetscCall(PetscFree2(reqs,statuses));
  PetscCall(PetscCommDestroy(&comm));

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
190 
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
/* Like the Allreduce variant, but uses MPI_Reduce_scatter_block so each rank
 * receives only its own entry of the summed indicator vector (its incoming
 * message count) instead of the full vector. */
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i,flg;
  MPI_Aint       lb,unitbytes;
  char           *tdata,*fdata;
  MPI_Request    *reqs,*sendreqs;
  MPI_Status     *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm,&size));
  PetscCall(PetscCommDuplicate(comm,&comm,&tag));
  PetscCallMPI(MPI_Comm_get_attr(comm,Petsc_Counter_keyval,&counter,&flg));
  PetscCheck(flg,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Inner PETSc communicator does not have its tag/name counter attribute set");
  /* Reuse (or lazily create) the size-long scratch array cached on the inner communicator */
  if (!counter->iflags) {
    PetscCall(PetscCalloc1(size,&counter->iflags));
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    PetscCall(PetscArrayzero(iflags,size));
  }
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  /* Each rank gets just its own summed entry: the number of incoming messages */
  PetscCallMPI(MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,comm));
  PetscCallMPI(MPI_Type_get_extent(dtype,&lb,&unitbytes));
  PetscCheck(lb == 0,comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb);
  PetscCall(PetscMalloc(nrecvs*count*unitbytes,&fdata));
  tdata    = (char*)todata;
  PetscCall(PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses));
  sendreqs = reqs + nrecvs; /* receives occupy reqs[0..nrecvs-1], sends follow */
  for (i=0; i<nrecvs; i++) {
    PetscCallMPI(MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i));
  }
  for (i=0; i<nto; i++) {
    PetscCallMPI(MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i));
  }
  PetscCallMPI(MPI_Waitall(nto+nrecvs,reqs,statuses));
  PetscCall(PetscMalloc1(nrecvs,&franks));
  /* The first nrecvs statuses correspond to the receives; record who sent each message */
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  PetscCall(PetscFree2(reqs,statuses));
  PetscCall(PetscCommDestroy(&comm));

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
#endif
239 
/*@C
   PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

   Collective

   Input Parameters:
+  comm - communicator
.  count - number of entries to send/receive (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
-  todata - data to send to each rank (packed)

   Output Parameters:
+  nfrom - number of ranks receiving messages from
.  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())

   Level: developer

   Options Database Keys:
.  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication. Default is allreduce for communicators with <= 1024 ranks, otherwise ibarrier.

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   References:
.  * - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
   Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
@*/
PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

  PetscFunctionBegin;
  PetscCall(PetscSysInitializePackage());
  PetscCall(PetscLogEventSync(PETSC_BuildTwoSided,comm));
  PetscCall(PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0));
  PetscCall(PetscCommBuildTwoSidedGetType(comm,&buildtype));
  /* Dispatch to the selected implementation.  In the #else branches SETERRQ
   * returns immediately, so there is no fall-through between cases. */
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    PetscCall(PetscCommBuildTwoSided_Ibarrier(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata));
    break;
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
    PetscCall(PetscCommBuildTwoSided_Allreduce(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata));
    break;
  case PETSC_BUILDTWOSIDED_REDSCATTER:
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
    PetscCall(PetscCommBuildTwoSided_RedScatter(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata));
    break;
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
#endif
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  PetscCall(PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0));
  PetscFunctionReturn(0);
}
307 
308 static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
309                                                            PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
310                                                            PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
311                                                            PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
312 {
313   PetscMPIInt i,*tag;
314   MPI_Aint    lb,unitbytes;
315   MPI_Request *sendreq,*recvreq;
316 
317   PetscFunctionBegin;
318   PetscCall(PetscMalloc1(ntags,&tag));
319   if (ntags > 0) {
320     PetscCall(PetscCommDuplicate(comm,&comm,&tag[0]));
321   }
322   for (i=1; i<ntags; i++) {
323     PetscCall(PetscCommGetNewTag(comm,&tag[i]));
324   }
325 
326   /* Perform complete initial rendezvous */
327   PetscCall(PetscCommBuildTwoSided(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata));
328 
329   PetscCall(PetscMalloc1(nto*ntags,&sendreq));
330   PetscCall(PetscMalloc1(*nfrom*ntags,&recvreq));
331 
332   PetscCallMPI(MPI_Type_get_extent(dtype,&lb,&unitbytes));
333   PetscCheck(lb == 0,comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb);
334   for (i=0; i<nto; i++) {
335     PetscMPIInt k;
336     for (k=0; k<ntags; k++) sendreq[i*ntags+k] = MPI_REQUEST_NULL;
337     PetscCall((*send)(comm,tag,i,toranks[i],((char*)todata)+count*unitbytes*i,sendreq+i*ntags,ctx));
338   }
339   for (i=0; i<*nfrom; i++) {
340     void *header = (*(char**)fromdata) + count*unitbytes*i;
341     PetscMPIInt k;
342     for (k=0; k<ntags; k++) recvreq[i*ntags+k] = MPI_REQUEST_NULL;
343     PetscCall((*recv)(comm,tag,(*fromranks)[i],header,recvreq+i*ntags,ctx));
344   }
345   PetscCall(PetscFree(tag));
346   PetscCall(PetscCommDestroy(&comm));
347   *toreqs = sendreq;
348   *fromreqs = recvreq;
349   PetscFunctionReturn(0);
350 }
351 
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)

/* NBX-style rendezvous (see PetscCommBuildTwoSided_Ibarrier) that additionally
 * invokes the user send/recv callbacks as soon as each rendezvous message is
 * posted/received, overlapping payload traffic with the discovery phase.
 * Returns the callbacks' requests in *toreqs/*fromreqs (nto*ntags and
 * nrecvs*ntags entries, respectively); the caller waits on and frees them. */
static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscMPIInt    nrecvs,tag,*tags,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,*usendreqs,*req,barrier;
  PetscSegBuffer segrank,segdata,segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  /* The duplicate's tag drives the rendezvous; tags[] are handed to the user callbacks */
  PetscCall(PetscCommDuplicate(comm,&comm,&tag));
  PetscCall(PetscMalloc1(ntags,&tags));
  for (i=0; i<ntags; i++) {
    PetscCall(PetscCommGetNewTag(comm,&tags[i]));
  }
  PetscCallMPI(MPI_Type_get_extent(dtype,&lb,&unitbytes));
  PetscCheck(lb == 0,comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb);
  tdata = (char*)todata;
  PetscCall(PetscMalloc1(nto,&sendreqs));
  PetscCall(PetscMalloc1(nto*ntags,&usendreqs));
  /* Post synchronous sends */
  for (i=0; i<nto; i++) {
    PetscCallMPI(MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i));
  }
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
    PetscCall((*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx));
  }

  /* Segmented buffers grow incrementally since the number of incoming messages is unknown */
  PetscCall(PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank));
  PetscCall(PetscSegBufferCreate(unitbytes,4*count,&segdata));
  PetscCall(PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq));

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status));
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank,k;
      void        *buf;
      PetscCall(PetscSegBufferGet(segrank,1,&recvrank));
      PetscCall(PetscSegBufferGet(segdata,count,&buf));
      *recvrank = status.MPI_SOURCE;
      PetscCallMPI(MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE));
      /* Immediately let the user post payload receives for this newly-discovered sender */
      PetscCall(PetscSegBufferGet(segreq,ntags,&req));
      for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
      PetscCall((*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx));
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      PetscCall(PetscMPIIntCast(nto,&nsends));
      PetscCallMPI(MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE));
      if (sent) {
        /* All of our rendezvous sends were received somewhere; join the consensus barrier */
        PetscCallMPI(MPI_Ibarrier(comm,&barrier));
        barrier_started = PETSC_TRUE;
      }
    } else {
      PetscCallMPI(MPI_Test(&barrier,&done,MPI_STATUS_IGNORE));
    }
  }
  *nfrom = nrecvs;
  PetscCall(PetscSegBufferExtractAlloc(segrank,fromranks));
  PetscCall(PetscSegBufferDestroy(&segrank));
  PetscCall(PetscSegBufferExtractAlloc(segdata,fromdata));
  PetscCall(PetscSegBufferDestroy(&segdata));
  *toreqs = usendreqs;
  PetscCall(PetscSegBufferExtractAlloc(segreq,fromreqs));
  PetscCall(PetscSegBufferDestroy(&segreq));
  PetscCall(PetscFree(sendreqs));
  PetscCall(PetscFree(tags));
  PetscCall(PetscCommDestroy(&comm));
  PetscFunctionReturn(0);
}
#endif
440 
441 /*@C
442    PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous
443 
444    Collective
445 
446    Input Parameters:
447 +  comm - communicator
448 .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
449 .  dtype - datatype to send/receive from each rank (must match on all ranks)
450 .  nto - number of ranks to send data to
451 .  toranks - ranks to send to (array of length nto)
452 .  todata - data to send to each rank (packed)
453 .  ntags - number of tags needed by send/recv callbacks
454 .  send - callback invoked on sending process when ready to send primary payload
455 .  recv - callback invoked on receiving process after delivery of rendezvous message
456 -  ctx - context for callbacks
457 
458    Output Parameters:
459 +  nfrom - number of ranks receiving messages from
460 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
461 -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
462 
463    Level: developer
464 
465    Notes:
466    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
467    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.
468 
469    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
470 
471    References:
472 .  * - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
473    Scalable communication protocols for dynamic sparse data exchange, 2010.
474 
475 .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedFReq()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
476 @*/
477 PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,
478                                        PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
479                                        PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
480 {
481   MPI_Request    *toreqs,*fromreqs;
482 
483   PetscFunctionBegin;
484   PetscCall(PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,&toreqs,&fromreqs,send,recv,ctx));
485   PetscCallMPI(MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE));
486   PetscCallMPI(MPI_Waitall(*nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE));
487   PetscCall(PetscFree(toreqs));
488   PetscCall(PetscFree(fromreqs));
489   PetscFunctionReturn(0);
490 }
491 
/*@C
   PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests

   Collective

   Input Parameters:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Parameters:
+  nfrom - number of ranks receiving messages from
.  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
.  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
.  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
-  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   References:
.  * - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
   Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedF()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
@*/
PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  /* Function-pointer type matching the _Reference and _Ibarrier implementations */
  PetscErrorCode         (*f)(MPI_Comm,PetscMPIInt,MPI_Datatype,PetscMPIInt,const PetscMPIInt[],const void*,
                              PetscMPIInt*,PetscMPIInt**,void*,PetscMPIInt,MPI_Request**,MPI_Request**,
                              PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                              PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx);
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
  PetscMPIInt i,size;

  PetscFunctionBegin;
  PetscCall(PetscSysInitializePackage());
  /* Validate destination ranks before any communication is attempted */
  PetscCallMPI(MPI_Comm_size(comm,&size));
  for (i=0; i<nto; i++) {
    PetscCheck(toranks[i] >= 0 && size > toranks[i],comm,PETSC_ERR_ARG_OUTOFRANGE,"toranks[%d] %d not in comm size %d",i,toranks[i],size);
  }
  PetscCall(PetscLogEventSync(PETSC_BuildTwoSidedF,comm));
  PetscCall(PetscLogEventBegin(PETSC_BuildTwoSidedF,0,0,0,0));
  PetscCall(PetscCommBuildTwoSidedGetType(comm,&buildtype));
  /* Select implementation; in the #else branch SETERRQ returns immediately, so no fall-through */
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    f = PetscCommBuildTwoSidedFReq_Ibarrier;
    break;
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
  case PETSC_BUILDTWOSIDED_REDSCATTER:
    /* Both dense algorithms share the reference callback-driven implementation */
    f = PetscCommBuildTwoSidedFReq_Reference;
    break;
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  PetscCall((*f)(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,toreqs,fromreqs,send,recv,ctx));
  PetscCall(PetscLogEventEnd(PETSC_BuildTwoSidedF,0,0,0,0));
  PetscFunctionReturn(0);
}
569