xref: /petsc/src/sys/utils/mpits.c (revision 96a0c9949c19278363492ba7462c3ac2a88a3ed1)
1 #include <petscsys.h>        /*I  "petscsys.h"  I*/
2 
/* Profiling events for the rendezvous routines below; presumably registered during
 * package initialization — see the PetscSysInitializePackage() calls in this file */
PetscLogEvent PETSC_BuildTwoSided,PETSC_BuildTwoSidedF;

/* String names for PetscBuildTwoSidedType in the layout consumed by
 * PetscOptionsGetEnum() below: the value names (order must match the enum),
 * then the enum type name, the common value prefix, and a NULL sentinel */
const char *const PetscBuildTwoSidedTypes[] = {
  "ALLREDUCE",
  "IBARRIER",
  "REDSCATTER",
  "PetscBuildTwoSidedType",
  "PETSC_BUILDTWOSIDED_",
  0
};

/* Process-global algorithm selection; stays NOTSET until explicitly set or
 * defaulted by PetscCommBuildTwoSidedGetType() */
static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;
15 
#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSidedSetType"
/*@
   PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

   Logically Collective

   Input Arguments:
+  comm - communicator (used only for the collectivity check; the setting itself is currently global)
-  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()

   Level: developer

   Note:
   This option is currently global, but could be made per-communicator.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedGetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm,PetscBuildTwoSidedType twosided)
{
  PetscFunctionBegin;
#if defined(PETSC_USE_DEBUG)
  {                             /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
    PetscMPIInt ierr;
    PetscMPIInt b1[2],b2[2];
    /* Reduce (-v,v) with MPI_MAX: -max(-v) == max(v) holds iff every rank passed the same value */
    b1[0] = -(PetscMPIInt)twosided;
    b1[1] = (PetscMPIInt)twosided;
    ierr  = MPIU_Allreduce(b1,b2,2,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (-b2[0] != b2[1]) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"Enum value must be same on all processes");
  }
#endif
  _twosided_type = twosided;
  PetscFunctionReturn(0);
}
50 
#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSidedGetType"
/*@
   PetscCommBuildTwoSidedGetType - get algorithm to use when building two-sided communication

   Logically Collective

   Input Arguments:
.  comm - communicator on which to query algorithm

   Output Arguments:
.  twosided - algorithm to use for PetscCommBuildTwoSided()

   Level: developer

   Note:
   If no algorithm has been chosen yet, a default is selected (MPI_Ibarrier-based when
   available and not known to deadlock, otherwise allreduce), and the -build_twosided
   option may override it.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedSetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm,PetscBuildTwoSidedType *twosided)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
#  if defined(PETSC_HAVE_MPICH_CH3_SOCK) && !defined(PETSC_HAVE_MPICH_CH3_SOCK_FIXED_NBC_PROGRESS)
    /* Deadlock in Ibarrier: http://trac.mpich.org/projects/mpich/ticket/1785 */
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE;
#  else
    _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
#  endif
#else
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE;
#endif
    ierr = PetscOptionsGetEnum(NULL,NULL,"-build_twosided",PetscBuildTwoSidedTypes,(PetscEnum*)&_twosided_type,NULL);CHKERRQ(ierr);
  }
  *twosided = _twosided_type;
  PetscFunctionReturn(0);
}
88 
89 #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
90 
#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSided_Ibarrier"
/* Dynamic sparse data exchange using synchronous sends and a nonblocking barrier
 * (the NBX algorithm of Hoefler, Siebert & Lumsdaine 2010): each process Issend()s to
 * the ranks it knows about, probes for incoming messages, and once all of its own sends
 * have completed joins an Ibarrier; completion of the barrier implies no message of this
 * exchange remains in flight. */
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  /* Inner communicator plus a fresh tag so our wildcard probes cannot match unrelated traffic */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr  = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  /* Synchronous sends: completion implies the receiver has started a matching receive */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  /* The number of incoming messages is unknown a priori, so accumulate ranks/payloads in segmented buffers */
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      ierr      = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      /* Receive exactly the message that was probed (matched by source and tag) */
      ierr      = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
        /* All of our messages have been matched; join the nonblocking barrier */
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        barrier_started = PETSC_TRUE;
        ierr = PetscFree(sendreqs);CHKERRQ(ierr);
      }
    } else {
      /* Barrier completion means every rank's sends have been matched: nothing left to probe */
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  /* Hand the accumulated ranks/data to the caller as PetscFree()-able arrays */
  ierr   = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr   = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  ierr   = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
158 #endif
159 
160 #undef __FUNCT__
161 #define __FUNCT__ "PetscCommBuildTwoSided_Allreduce"
162 static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
163 {
164   PetscErrorCode ierr;
165   PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i;
166   MPI_Aint       lb,unitbytes;
167   char           *tdata,*fdata;
168   MPI_Request    *reqs,*sendreqs;
169   MPI_Status     *statuses;
170 
171   PetscFunctionBegin;
172   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
173   ierr = PetscCalloc1(size,&iflags);CHKERRQ(ierr);
174   for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
175   ierr = PetscGatherNumberOfMessages(comm,iflags,NULL,&nrecvs);CHKERRQ(ierr);
176   ierr = PetscFree(iflags);CHKERRQ(ierr);
177 
178   ierr     = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
179   ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
180   if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
181   ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
182   tdata    = (char*)todata;
183   ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
184   sendreqs = reqs + nrecvs;
185   for (i=0; i<nrecvs; i++) {
186     ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
187   }
188   for (i=0; i<nto; i++) {
189     ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
190   }
191   ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
192   ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
193   for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
194   ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
195   ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
196 
197   *nfrom            = nrecvs;
198   *fromranks        = franks;
199   *(void**)fromdata = fdata;
200   PetscFunctionReturn(0);
201 }
202 
203 #if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
204 #undef __FUNCT__
205 #define __FUNCT__ "PetscCommBuildTwoSided_RedScatter"
206 static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
207 {
208   PetscErrorCode ierr;
209   PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i;
210   MPI_Aint       lb,unitbytes;
211   char           *tdata,*fdata;
212   MPI_Request    *reqs,*sendreqs;
213   MPI_Status     *statuses;
214 
215   PetscFunctionBegin;
216   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
217   ierr = PetscMalloc1(size,&iflags);CHKERRQ(ierr);
218   ierr = PetscMemzero(iflags,size*sizeof(*iflags));CHKERRQ(ierr);
219   for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
220   ierr = MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
221   ierr = PetscFree(iflags);CHKERRQ(ierr);
222 
223   ierr     = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
224   ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
225   if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
226   ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
227   tdata    = (char*)todata;
228   ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
229   sendreqs = reqs + nrecvs;
230   for (i=0; i<nrecvs; i++) {
231     ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
232   }
233   for (i=0; i<nto; i++) {
234     ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
235   }
236   ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
237   ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
238   for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
239   ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
240   ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
241 
242   *nfrom            = nrecvs;
243   *fromranks        = franks;
244   *(void**)fromdata = fdata;
245   PetscFunctionReturn(0);
246 }
247 #endif
248 
249 #undef __FUNCT__
250 #define __FUNCT__ "PetscCommBuildTwoSided"
251 /*@C
252    PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)
253 
254    Collective on MPI_Comm
255 
256    Input Arguments:
257 +  comm - communicator
258 .  count - number of entries to send/receive (must match on all ranks)
259 .  dtype - datatype to send/receive from each rank (must match on all ranks)
260 .  nto - number of ranks to send data to
261 .  toranks - ranks to send to (array of length nto)
262 -  todata - data to send to each rank (packed)
263 
264    Output Arguments:
265 +  nfrom - number of ranks receiving messages from
266 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
267 -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
268 
269    Level: developer
270 
271    Options Database Keys:
272 .  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication
273 
274    Notes:
275    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
276    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.
277 
278    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
279 
280    References:
281 .  1. - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
282    Scalable communication protocols for dynamic sparse data exchange, 2010.
283 
284 .seealso: PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
285 @*/
286 PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
287 {
288   PetscErrorCode         ierr;
289   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
290 
291   PetscFunctionBegin;
292   ierr = PetscSysInitializePackage();CHKERRQ(ierr);
293   ierr = PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
294   ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
295   switch (buildtype) {
296   case PETSC_BUILDTWOSIDED_IBARRIER:
297 #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
298     ierr = PetscCommBuildTwoSided_Ibarrier(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
299 #else
300     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
301 #endif
302     break;
303   case PETSC_BUILDTWOSIDED_ALLREDUCE:
304     ierr = PetscCommBuildTwoSided_Allreduce(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
305     break;
306   case PETSC_BUILDTWOSIDED_REDSCATTER:
307 #if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
308     ierr = PetscCommBuildTwoSided_RedScatter(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
309 #else
310     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
311 #endif
312     break;
313   default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
314   }
315   ierr = PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
316   PetscFunctionReturn(0);
317 }
318 
319 #undef __FUNCT__
320 #define __FUNCT__ "PetscCommBuildTwoSidedFReq_Reference"
321 static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
322                                                            PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
323                                                            PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
324                                                            PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
325 {
326   PetscErrorCode ierr;
327   PetscMPIInt i,*tag;
328   MPI_Aint    lb,unitbytes;
329   MPI_Request *sendreq,*recvreq;
330 
331   PetscFunctionBegin;
332   ierr = PetscMalloc1(ntags,&tag);CHKERRQ(ierr);
333   if (ntags > 0) {
334     ierr = PetscCommDuplicate(comm,&comm,&tag[0]);CHKERRQ(ierr);
335   }
336   for (i=1; i<ntags; i++) {
337     ierr = PetscCommGetNewTag(comm,&tag[i]);CHKERRQ(ierr);
338   }
339 
340   /* Perform complete initial rendezvous */
341   ierr = PetscCommBuildTwoSided(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
342 
343   ierr = PetscMalloc1(nto*ntags,&sendreq);CHKERRQ(ierr);
344   ierr = PetscMalloc1(*nfrom*ntags,&recvreq);CHKERRQ(ierr);
345 
346   ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
347   if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
348   for (i=0; i<nto; i++) {
349     PetscMPIInt k;
350     for (k=0; k<ntags; k++) sendreq[i*ntags+k] = MPI_REQUEST_NULL;
351     ierr = (*send)(comm,tag,i,toranks[i],((char*)todata)+count*unitbytes*i,sendreq+i*ntags,ctx);CHKERRQ(ierr);
352   }
353   for (i=0; i<*nfrom; i++) {
354     void *header = (*(char**)fromdata) + count*unitbytes*i;
355     PetscMPIInt k;
356     for (k=0; k<ntags; k++) recvreq[i*ntags+k] = MPI_REQUEST_NULL;
357     ierr = (*recv)(comm,tag,(*fromranks)[i],header,recvreq+i*ntags,ctx);CHKERRQ(ierr);
358   }
359   ierr = PetscFree(tag);CHKERRQ(ierr);
360   ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
361   *toreqs = sendreq;
362   *fromreqs = recvreq;
363   PetscFunctionReturn(0);
364 }
365 
366 #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
367 
#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSidedFReq_Ibarrier"
/* NBX-style rendezvous (as in PetscCommBuildTwoSided_Ibarrier) that additionally invokes
 * the user's send/recv callbacks for each discovered pair, collecting the callback-posted
 * payload requests to be returned to the caller. */
static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,*tags,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,*usendreqs,*req,barrier;
  PetscSegBuffer segrank,segdata,segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  /* Inner communicator: tag is for the rendezvous messages, tags[] are for the callbacks */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = PetscMalloc1(ntags,&tags);CHKERRQ(ierr);
  for (i=0; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tags[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  ierr = PetscMalloc1(nto*ntags,&usendreqs);CHKERRQ(ierr);
  /* Post synchronous sends */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    /* Pre-null so callbacks that post fewer than ntags requests leave a waitable array */
    for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx);CHKERRQ(ierr);
  }

  /* The number of incoming messages is unknown a priori, so accumulate in segmented buffers */
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank,k;
      void        *buf;
      ierr = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      /* Immediately let the user post the payload receives for this sender */
      ierr = PetscSegBufferGet(segreq,ntags,&req);CHKERRQ(ierr);
      for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
      ierr = (*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
        /* All of our synchronous sends have been matched; join the nonblocking barrier */
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        barrier_started = PETSC_TRUE;
      }
    } else {
      /* Barrier completion means every rank's rendezvous sends have been matched */
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  /* Caller owns fromranks, fromdata, toreqs, and fromreqs (PetscFree) */
  ierr = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  *toreqs = usendreqs;
  ierr = PetscSegBufferExtractAlloc(segreq,fromreqs);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segreq);CHKERRQ(ierr);
  ierr = PetscFree(sendreqs);CHKERRQ(ierr);
  ierr = PetscFree(tags);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
460 #endif
461 
462 #undef __FUNCT__
463 #define __FUNCT__ "PetscCommBuildTwoSidedF"
464 /*@C
465    PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous
466 
467    Collective on MPI_Comm
468 
469    Input Arguments:
470 +  comm - communicator
471 .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
472 .  dtype - datatype to send/receive from each rank (must match on all ranks)
473 .  nto - number of ranks to send data to
474 .  toranks - ranks to send to (array of length nto)
475 .  todata - data to send to each rank (packed)
476 .  ntags - number of tags needed by send/recv callbacks
477 .  send - callback invoked on sending process when ready to send primary payload
478 .  recv - callback invoked on receiving process after delivery of rendezvous message
479 -  ctx - context for callbacks
480 
481    Output Arguments:
482 +  nfrom - number of ranks receiving messages from
483 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
484 -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
485 
486    Level: developer
487 
488    Notes:
489    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
490    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.
491 
492    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
493 
494    References:
495 .  1. - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
496    Scalable communication protocols for dynamic sparse data exchange, 2010.
497 
498 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedFReq(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
499 @*/
500 PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,
501                                        PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
502                                        PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
503 {
504   PetscErrorCode ierr;
505   MPI_Request    *toreqs,*fromreqs;
506 
507   PetscFunctionBegin;
508   ierr = PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,&toreqs,&fromreqs,send,recv,ctx);CHKERRQ(ierr);
509   ierr = MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
510   ierr = MPI_Waitall(*nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
511   ierr = PetscFree(toreqs);CHKERRQ(ierr);
512   ierr = PetscFree(fromreqs);CHKERRQ(ierr);
513   PetscFunctionReturn(0);
514 }
515 
#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSidedFReq"
/*@C
   PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests

   Collective on MPI_Comm

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Arguments:
+  nfrom - number of ranks receiving messages from
.  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
.  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
.  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
-  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   References:
.  1. - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
   Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedF(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  /* f points to the implementation selected below; its signature mirrors this function's */
  PetscErrorCode         ierr,(*f)(MPI_Comm,PetscMPIInt,MPI_Datatype,PetscMPIInt,const PetscMPIInt[],const void*,
                                   PetscMPIInt*,PetscMPIInt**,void*,PetscMPIInt,MPI_Request**,MPI_Request**,
                                   PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                   PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx);
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
  PetscMPIInt i,size;

  PetscFunctionBegin;
  ierr = PetscSysInitializePackage();CHKERRQ(ierr);
  /* Validate destination ranks before any communication is attempted */
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    if (toranks[i] < 0 || size <= toranks[i]) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"toranks[%d] %d not in comm size %d",i,toranks[i],size);
  }
  ierr = PetscLogEventBegin(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
    f = PetscCommBuildTwoSidedFReq_Ibarrier;
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
    break;
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
  case PETSC_BUILDTWOSIDED_REDSCATTER:
    /* The reference implementation handles both collective rendezvous variants */
    f = PetscCommBuildTwoSidedFReq_Reference;
    break;
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  ierr = (*f)(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,toreqs,fromreqs,send,recv,ctx);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
594