xref: /petsc/src/sys/utils/mpits.c (revision 493de4004d8a6b5b16751267168cdab7e3bbd94a)
1 #include <petscsys.h>        /*I  "petscsys.h"  I*/
2 
/* Logging events for the rendezvous routines below */
PetscLogEvent PETSC_BuildTwoSided,PETSC_BuildTwoSidedF;

/* String names for PetscBuildTwoSidedType, in enum order, followed by the enum
 * type name and value prefix; consumed by PetscOptionsGetEnum() for the
 * -build_twosided option (see PetscCommBuildTwoSidedGetType below). */
const char *const PetscBuildTwoSidedTypes[] = {
  "ALLREDUCE",
  "IBARRIER",
  "REDSCATTER",
  "PetscBuildTwoSidedType",
  "PETSC_BUILDTWOSIDED_",
  0
};

/* Process-global algorithm selection; NOTSET until chosen explicitly via
 * PetscCommBuildTwoSidedSetType() or lazily in PetscCommBuildTwoSidedGetType(). */
static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;
15 
16 #undef __FUNCT__
17 #define __FUNCT__ "PetscCommBuildTwoSidedSetType"
18 /*@
19    PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication
20 
21    Logically Collective
22 
23    Input Arguments:
24 +  comm - PETSC_COMM_WORLD
25 -  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()
26 
27    Level: developer
28 
29    Note:
30    This option is currently global, but could be made per-communicator.
31 
32 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedGetType()
33 @*/
34 PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm,PetscBuildTwoSidedType twosided)
35 {
36   PetscFunctionBegin;
37 #if defined(PETSC_USE_DEBUG)
38   {                             /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
39     PetscMPIInt ierr;
40     PetscMPIInt b1[2],b2[2];
41     b1[0] = -(PetscMPIInt)twosided;
42     b1[1] = (PetscMPIInt)twosided;
43     ierr  = MPI_Allreduce(b1,b2,2,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
44     if (-b2[0] != b2[1]) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"Enum value must be same on all processes");
45   }
46 #endif
47   _twosided_type = twosided;
48   PetscFunctionReturn(0);
49 }
50 
#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSidedGetType"
/*@
   PetscCommBuildTwoSidedGetType - get algorithm to use when building two-sided communication

   Logically Collective

   Input Arguments:
.  comm - communicator on which to query algorithm

   Output Arguments:
.  twosided - algorithm to use for PetscCommBuildTwoSided()

   Level: developer

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedSetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm,PetscBuildTwoSidedType *twosided)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  /* Lazily pick a default on first query: Ibarrier when MPI-3 MPI_Ibarrier is
   * available (and not affected by the MPICH ch3:sock progress bug), otherwise
   * the allreduce-based rendezvous. */
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
#  if defined(PETSC_HAVE_MPICH_CH3_SOCK) && !defined(PETSC_HAVE_MPICH_CH3_SOCK_FIXED_NBC_PROGRESS)
    /* Deadlock in Ibarrier: http://trac.mpich.org/projects/mpich/ticket/1785 */
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE;
#  else
    _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
#  endif
#else
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE;
#endif
    /* Allow the user to override the default from the options database */
    ierr = PetscOptionsGetEnum(NULL,"-build_twosided",PetscBuildTwoSidedTypes,(PetscEnum*)&_twosided_type,NULL);CHKERRQ(ierr);
  }
  *twosided = _twosided_type;
  PetscFunctionReturn(0);
}
88 
#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)

#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSided_Ibarrier"
/* Nonblocking-consensus (NBX) rendezvous: post synchronous sends, then probe for
 * incoming messages until a nonblocking barrier -- entered only after all local
 * sends have been matched -- completes on every rank.  Algorithm from Hoefler,
 * Siebert & Lumsdaine (2010); see the PetscCommBuildTwoSided() man page. */
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;

  PetscFunctionBegin;
  /* Use a PETSc inner communicator and fresh tag so we cannot collide with user messages */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr  = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  /* Synchronous sends: completion implies the receiver has matched the message */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  /* The number of receives is unknown in advance, so accumulate into segmented buffers */
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      ierr      = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr      = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      nrecvs++;
    }
    if (barrier == MPI_REQUEST_NULL) {
      /* Enter the barrier once all of our synchronous sends have been matched */
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        ierr = PetscFree(sendreqs);CHKERRQ(ierr);
      }
    } else {
      /* Barrier completion means every rank's sends have been matched: no more messages will arrive */
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr   = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr   = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  ierr   = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
154 
155 #undef __FUNCT__
156 #define __FUNCT__ "PetscCommBuildTwoSided_Allreduce"
157 static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
158 {
159   PetscErrorCode ierr;
160   PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i;
161   MPI_Aint       lb,unitbytes;
162   char           *tdata,*fdata;
163   MPI_Request    *reqs,*sendreqs;
164   MPI_Status     *statuses;
165 
166   PetscFunctionBegin;
167   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
168   ierr = PetscCalloc1(size,&iflags);CHKERRQ(ierr);
169   for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
170   ierr = PetscGatherNumberOfMessages(comm,iflags,NULL,&nrecvs);CHKERRQ(ierr);
171   ierr = PetscFree(iflags);CHKERRQ(ierr);
172 
173   ierr     = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
174   ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
175   if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
176   ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
177   tdata    = (char*)todata;
178   ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
179   sendreqs = reqs + nrecvs;
180   for (i=0; i<nrecvs; i++) {
181     ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
182   }
183   for (i=0; i<nto; i++) {
184     ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
185   }
186   ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
187   ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
188   for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
189   ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
190   ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
191 
192   *nfrom            = nrecvs;
193   *fromranks        = franks;
194   *(void**)fromdata = fdata;
195   PetscFunctionReturn(0);
196 }
197 
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSided_RedScatter"
/* Rendezvous via MPI_Reduce_scatter_block() to learn the receive count, then a
 * plain nonblocking point-to-point exchange of the fixed-size payloads. */
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    size,*iflags,nrecvs,tag,*franks,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata,*fdata;
  MPI_Request    *reqs,*sendreqs;
  MPI_Status     *statuses;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* PetscCalloc1 zero-initializes, matching the Allreduce variant and replacing
   * the former PetscMalloc1 + PetscMemzero pair */
  ierr = PetscCalloc1(size,&iflags);CHKERRQ(ierr);
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  /* Each rank contributes a 0/1 flag per destination; the scattered sum is the
   * number of messages this rank will receive */
  ierr = MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  ierr = PetscFree(iflags);CHKERRQ(ierr);

  /* Use a PETSc inner communicator and fresh tag for the data exchange */
  ierr     = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;     /* receives occupy reqs[0..nrecvs-1], sends follow */
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  /* The first nrecvs statuses belong to the receives; they identify the senders */
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
#endif
243 
244 #undef __FUNCT__
245 #define __FUNCT__ "PetscCommBuildTwoSided"
246 /*@C
247    PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)
248 
249    Collective on MPI_Comm
250 
251    Input Arguments:
252 +  comm - communicator
253 .  count - number of entries to send/receive (must match on all ranks)
254 .  dtype - datatype to send/receive from each rank (must match on all ranks)
255 .  nto - number of ranks to send data to
256 .  toranks - ranks to send to (array of length nto)
257 -  todata - data to send to each rank (packed)
258 
259    Output Arguments:
260 +  nfrom - number of ranks receiving messages from
261 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
262 -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
263 
264    Level: developer
265 
266    Options Database Keys:
267 .  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication
268 
269    Notes:
270    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
271    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.
272 
273    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
274 
275    References:
276    The MPI_Ibarrier implementation uses the algorithm in
277    Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
278 
279 .seealso: PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
280 @*/
281 PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
282 {
283   PetscErrorCode         ierr;
284   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
285 
286   PetscFunctionBegin;
287   ierr = PetscSysInitializePackage();CHKERRQ(ierr);
288   ierr = PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
289   ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
290   switch (buildtype) {
291   case PETSC_BUILDTWOSIDED_IBARRIER:
292 #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
293     ierr = PetscCommBuildTwoSided_Ibarrier(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
294 #else
295     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
296 #endif
297     break;
298   case PETSC_BUILDTWOSIDED_ALLREDUCE:
299     ierr = PetscCommBuildTwoSided_Allreduce(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
300     break;
301   case PETSC_BUILDTWOSIDED_REDSCATTER:
302 #if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
303     ierr = PetscCommBuildTwoSided_RedScatter(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
304 #else
305     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
306 #endif
307     break;
308   default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
309   }
310   ierr = PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
311   PetscFunctionReturn(0);
312 }
313 
314 #undef __FUNCT__
315 #define __FUNCT__ "PetscCommBuildTwoSidedFReq_Reference"
316 static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
317                                                            PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
318                                                            PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
319                                                            PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
320 {
321   PetscErrorCode ierr;
322   PetscMPIInt i,*tag;
323   MPI_Aint    lb,unitbytes;
324   MPI_Request *sendreq,*recvreq;
325 
326   PetscFunctionBegin;
327   ierr = PetscMalloc1(ntags,&tag);CHKERRQ(ierr);
328   if (ntags > 0) {
329     ierr = PetscCommDuplicate(comm,&comm,&tag[0]);CHKERRQ(ierr);
330   }
331   for (i=1; i<ntags; i++) {
332     ierr = PetscCommGetNewTag(comm,&tag[i]);CHKERRQ(ierr);
333   }
334 
335   /* Perform complete initial rendezvous */
336   ierr = PetscCommBuildTwoSided(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
337 
338   ierr = PetscMalloc1(nto*ntags,&sendreq);CHKERRQ(ierr);
339   ierr = PetscMalloc1(*nfrom*ntags,&recvreq);CHKERRQ(ierr);
340 
341   ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
342   if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
343   for (i=0; i<nto; i++) {
344     PetscMPIInt k;
345     for (k=0; k<ntags; k++) sendreq[i*ntags+k] = MPI_REQUEST_NULL;
346     ierr = (*send)(comm,tag,i,toranks[i],((char*)todata)+count*unitbytes*i,sendreq+i*ntags,ctx);CHKERRQ(ierr);
347   }
348   for (i=0; i<*nfrom; i++) {
349     void *header = (*(char**)fromdata) + count*unitbytes*i;
350     PetscMPIInt k;
351     for (k=0; k<ntags; k++) recvreq[i*ntags+k] = MPI_REQUEST_NULL;
352     ierr = (*recv)(comm,tag,(*fromranks)[i],header,recvreq+i*ntags,ctx);CHKERRQ(ierr);
353   }
354   ierr = PetscFree(tag);CHKERRQ(ierr);
355   ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
356   *toreqs = sendreq;
357   *fromreqs = recvreq;
358   PetscFunctionReturn(0);
359 }
360 
#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)

#undef __FUNCT__
#define __FUNCT__ "PetscCommBuildTwoSidedFReq_Ibarrier"
/* NBX rendezvous (as in PetscCommBuildTwoSided_Ibarrier) with user callbacks: the
 * send callback posts payload requests alongside the synchronous header sends, and
 * the recv callback posts matching requests as each header arrives.  Payload
 * requests are returned to the caller via toreqs/fromreqs. */
static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,*tags,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,*usendreqs,*req,barrier;
  PetscSegBuffer segrank,segdata,segreq;

  PetscFunctionBegin;
  /* tag is used for the rendezvous headers; tags[0..ntags-1] are for the callbacks */
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = PetscMalloc1(ntags,&tags);CHKERRQ(ierr);
  for (i=0; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tags[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  ierr = PetscMalloc1(nto*ntags,&usendreqs);CHKERRQ(ierr);
  /* Post synchronous sends */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx);CHKERRQ(ierr);
  }

  /* The number of receives is unknown in advance, so accumulate into segmented buffers */
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  for (done=0; !done; ) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank,k;
      void        *buf;
      ierr = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      /* Let the user post receives for the payload now that the header has arrived */
      ierr = PetscSegBufferGet(segreq,ntags,&req);CHKERRQ(ierr);
      for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
      ierr = (*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx);CHKERRQ(ierr);
      nrecvs++;
    }
    if (barrier == MPI_REQUEST_NULL) {
      /* Enter the barrier once all of our synchronous header sends have been matched */
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
      }
    } else {
      /* Barrier completion means every rank's headers have been matched: no more messages */
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  *toreqs = usendreqs;
  ierr = PetscSegBufferExtractAlloc(segreq,fromreqs);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segreq);CHKERRQ(ierr);
  ierr = PetscFree(sendreqs);CHKERRQ(ierr);
  ierr = PetscFree(tags);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
451 
452 #undef __FUNCT__
453 #define __FUNCT__ "PetscCommBuildTwoSidedF"
454 /*@C
455    PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous
456 
457    Collective on MPI_Comm
458 
459    Input Arguments:
460 +  comm - communicator
461 .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
462 .  dtype - datatype to send/receive from each rank (must match on all ranks)
463 .  nto - number of ranks to send data to
464 .  toranks - ranks to send to (array of length nto)
465 .  todata - data to send to each rank (packed)
466 .  ntags - number of tags needed by send/recv callbacks
467 .  send - callback invoked on sending process when ready to send primary payload
468 .  recv - callback invoked on receiving process after delivery of rendezvous message
469 -  ctx - context for callbacks
470 
471    Output Arguments:
472 +  nfrom - number of ranks receiving messages from
473 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
474 -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
475 
476    Level: developer
477 
478    Notes:
479    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
480    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.
481 
482    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
483 
484    References:
485    The MPI_Ibarrier implementation uses the algorithm in
486    Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
487 
488 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedFReq(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
489 @*/
490 PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,
491                                        PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
492                                        PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
493 {
494   PetscErrorCode ierr;
495   MPI_Request    *toreqs,*fromreqs;
496 
497   PetscFunctionBegin;
498   ierr = PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,&toreqs,&fromreqs,send,recv,ctx);CHKERRQ(ierr);
499   ierr = MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
500   ierr = MPI_Waitall(*nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
501   ierr = PetscFree(toreqs);CHKERRQ(ierr);
502   ierr = PetscFree(fromreqs);CHKERRQ(ierr);
503   PetscFunctionReturn(0);
504 }
505 
506 #undef __FUNCT__
507 #define __FUNCT__ "PetscCommBuildTwoSidedFReq"
508 /*@C
509    PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests
510 
511    Collective on MPI_Comm
512 
513    Input Arguments:
514 +  comm - communicator
515 .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
516 .  dtype - datatype to send/receive from each rank (must match on all ranks)
517 .  nto - number of ranks to send data to
518 .  toranks - ranks to send to (array of length nto)
519 .  todata - data to send to each rank (packed)
520 .  ntags - number of tags needed by send/recv callbacks
521 .  send - callback invoked on sending process when ready to send primary payload
522 .  recv - callback invoked on receiving process after delivery of rendezvous message
523 -  ctx - context for callbacks
524 
525    Output Arguments:
526 +  nfrom - number of ranks receiving messages from
527 .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
528 .  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
529 .  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
530 -  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())
531 
532    Level: developer
533 
534    Notes:
535    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
536    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.
537 
538    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
539 
540    References:
541    The MPI_Ibarrier implementation uses the algorithm in
542    Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
543 
544 .seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedF(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
545 @*/
546 PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
547                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
548                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
549                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
550 {
551   PetscErrorCode         ierr,(*f)(MPI_Comm,PetscMPIInt,MPI_Datatype,PetscMPIInt,const PetscMPIInt[],const void*,
552                                    PetscMPIInt*,PetscMPIInt**,void*,PetscMPIInt,MPI_Request**,MPI_Request**,
553                                    PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
554                                    PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx);
555   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
556   PetscMPIInt i,size;
557 
558   PetscFunctionBegin;
559   ierr = PetscSysInitializePackage();CHKERRQ(ierr);
560   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
561   for (i=0; i<nto; i++) {
562     if (toranks[i] < 0 || size <= toranks[i]) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"toranks[%d] %d not in comm size %d",i,toranks[i],size);
563   }
564   ierr = PetscLogEventBegin(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
565   ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
566   switch (buildtype) {
567   case PETSC_BUILDTWOSIDED_IBARRIER:
568 #if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
569     f = PetscCommBuildTwoSidedFReq_Ibarrier;
570 #else
571     SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
572 #endif
573     break;
574   case PETSC_BUILDTWOSIDED_ALLREDUCE:
575   case PETSC_BUILDTWOSIDED_REDSCATTER:
576     f = PetscCommBuildTwoSidedFReq_Reference;
577     break;
578   default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
579   }
580   ierr = (*f)(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,toreqs,fromreqs,send,recv,ctx);CHKERRQ(ierr);
581   ierr = PetscLogEventEnd(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584