xref: /petsc/src/sys/utils/mpits.c (revision 4dfa11a44d5adf2389f1d3acbc8f3c1116dc6c3a)
1 #include <petscsys.h> /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
PetscLogEvent PETSC_BuildTwoSided;  /* logging event for PetscCommBuildTwoSided() */
PetscLogEvent PETSC_BuildTwoSidedF; /* logging event for PetscCommBuildTwoSidedF()/PetscCommBuildTwoSidedFReq() */

/* String names for PetscBuildTwoSidedType; the trailing entries (enum type name, option prefix, NULL sentinel) follow the layout PetscOptionsGetEnum() expects */
const char *const PetscBuildTwoSidedTypes[] = {"ALLREDUCE", "IBARRIER", "REDSCATTER", "PetscBuildTwoSidedType", "PETSC_BUILDTWOSIDED_", NULL};

/* Process-global algorithm choice; stays NOTSET until set explicitly or defaulted in PetscCommBuildTwoSidedGetType().
 * NOTE(review): a leading-underscore identifier at file scope is reserved in C; consider renaming. */
static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;
10 
/*@
   PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

   Logically Collective

   Input Parameters:
+  comm - communicator (the setting is currently stored globally, so typically `PETSC_COMM_WORLD`)
-  twosided - algorithm to use in subsequent calls to `PetscCommBuildTwoSided()`

   Level: developer

   Note:
   This option is currently global, but could be made per-communicator.

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedGetType()`, `PetscBuildTwoSidedType`
@*/
PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm, PetscBuildTwoSidedType twosided) {
  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) { /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
    PetscMPIInt b1[2], b2[2];
    /* Reducing (-v, v) with MPI_MAX yields (-min, max); -b2[0] == b2[1] iff all ranks passed the same value */
    b1[0] = -(PetscMPIInt)twosided;
    b1[1] = (PetscMPIInt)twosided;
    PetscCall(MPIU_Allreduce(b1, b2, 2, MPI_INT, MPI_MAX, comm));
    PetscCheck(-b2[0] == b2[1], comm, PETSC_ERR_ARG_WRONG, "Enum value must be same on all processes");
  }
  _twosided_type = twosided;
  PetscFunctionReturn(0);
}
39 
/*@
   PetscCommBuildTwoSidedGetType - get algorithm used when building two-sided communication

   Logically Collective

   Input Parameter:
.  comm - communicator on which to query algorithm

   Output Parameter:
.  twosided - algorithm to use for `PetscCommBuildTwoSided()`

   Level: developer

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedSetType()`, `PetscBuildTwoSidedType`
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm, PetscBuildTwoSidedType *twosided) {
  PetscMPIInt size;

  PetscFunctionBegin;
  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) { /* first query: choose a default, then allow command-line override */
    PetscCallMPI(MPI_Comm_size(comm, &size));
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE; /* default for small comms, see https://gitlab.com/petsc/petsc/-/merge_requests/2611 */
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    if (size > 1024) _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER; /* memory-scalable choice for large communicators */
#endif
    PetscCall(PetscOptionsGetEnum(NULL, NULL, "-build_twosided", PetscBuildTwoSidedTypes, (PetscEnum *)&_twosided_type, NULL));
  }
  *twosided = _twosided_type;
  PetscFunctionReturn(0);
}
69 
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
/* Rendezvous via the NBX (nonblocking consensus) algorithm (Hoefler, Siebert & Lumsdaine 2010, cited
 * in the PetscCommBuildTwoSided() manual page): post synchronous sends, poll for incoming messages
 * with MPI_Iprobe, and once all local sends have been matched enter an MPI_Ibarrier; when that barrier
 * completes, every rank's messages have been delivered.  Arguments are as for PetscCommBuildTwoSided(). */
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata) {
  PetscMPIInt    nrecvs, tag, done, i;
  MPI_Aint       lb, unitbytes;
  char          *tdata;
  MPI_Request   *sendreqs, barrier;
  PetscSegBuffer segrank, segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  PetscCall(PetscCommDuplicate(comm, &comm, &tag)); /* private inner comm + tag so probes cannot match unrelated traffic */
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  tdata = (char *)todata;
  PetscCall(PetscMalloc1(nto, &sendreqs));
  /* Synchronous sends: completion implies the receiver has matched the message, which NBX relies on */
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Issend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  /* Growable buffers because the number of incoming messages is unknown a priori */
  PetscCall(PetscSegBufferCreate(sizeof(PetscMPIInt), 4, &segrank));
  PetscCall(PetscSegBufferCreate(unitbytes, 4 * count, &segdata));

  nrecvs          = 0;
  barrier         = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done = 0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &flag, &status));
    if (flag) { /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      PetscCall(PetscSegBufferGet(segrank, 1, &recvrank));
      PetscCall(PetscSegBufferGet(segdata, count, &buf));
      *recvrank = status.MPI_SOURCE;
      PetscCallMPI(MPI_Recv(buf, count, dtype, status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE));
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent, nsends;
      PetscCall(PetscMPIIntCast(nto, &nsends));
      PetscCallMPI(MPI_Testall(nsends, sendreqs, &sent, MPI_STATUSES_IGNORE));
      if (sent) {
        /* All our sends were matched; join the consensus barrier but keep probing until it completes */
        PetscCallMPI(MPI_Ibarrier(comm, &barrier));
        barrier_started = PETSC_TRUE;
        PetscCall(PetscFree(sendreqs));
      }
    } else {
      PetscCallMPI(MPI_Test(&barrier, &done, MPI_STATUS_IGNORE));
    }
  }
  /* Hand the collected ranks/data to the caller; caller owns and frees both arrays */
  *nfrom = nrecvs;
  PetscCall(PetscSegBufferExtractAlloc(segrank, fromranks));
  PetscCall(PetscSegBufferDestroy(&segrank));
  PetscCall(PetscSegBufferExtractAlloc(segdata, fromdata));
  PetscCall(PetscSegBufferDestroy(&segdata));
  PetscCall(PetscCommDestroy(&comm));
  PetscFunctionReturn(0);
}
#endif
129 
/* Rendezvous via a dense MPI_Allreduce of per-rank message counts: each rank marks its destinations
 * in an array of length size, the SUM reduction tells every rank how many messages it will receive,
 * then point-to-point Irecv/Isend moves the payload.  O(size) memory per rank, so best for small
 * communicators.  Arguments are as for PetscCommBuildTwoSided(). */
static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata) {
  PetscMPIInt       size, rank, *iflags, nrecvs, tag, *franks, i, flg;
  MPI_Aint          lb, unitbytes;
  char             *tdata, *fdata;
  MPI_Request      *reqs, *sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Counter_keyval, &counter, &flg));
  PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inner PETSc communicator does not have its tag/name counter attribute set");
  /* Reuse (or lazily create) the size-length scratch array cached on the inner communicator's counter */
  if (!counter->iflags) {
    PetscCall(PetscCalloc1(size, &counter->iflags));
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    PetscCall(PetscArrayzero(iflags, size));
  }
  for (i = 0; i < nto; i++) iflags[toranks[i]] = 1;
  /* After the reduction, iflags[r] is the number of messages rank r will receive */
  PetscCall(MPIU_Allreduce(MPI_IN_PLACE, iflags, size, MPI_INT, MPI_SUM, comm));
  nrecvs = iflags[rank];
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  PetscCall(PetscMalloc(nrecvs * count * unitbytes, &fdata));
  tdata = (char *)todata;
  PetscCall(PetscMalloc2(nto + nrecvs, &reqs, nto + nrecvs, &statuses));
  sendreqs = reqs + nrecvs; /* recv requests occupy reqs[0..nrecvs), send requests follow */
  for (i = 0; i < nrecvs; i++) PetscCallMPI(MPI_Irecv((void *)(fdata + count * unitbytes * i), count, dtype, MPI_ANY_SOURCE, tag, comm, reqs + i));
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Isend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  PetscCallMPI(MPI_Waitall(nto + nrecvs, reqs, statuses));
  PetscCall(PetscMalloc1(nrecvs, &franks));
  /* statuses[0..nrecvs) correspond to the MPI_ANY_SOURCE receives; record who sent each message */
  for (i = 0; i < nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  PetscCall(PetscFree2(reqs, statuses));
  PetscCall(PetscCommDestroy(&comm));

  *nfrom             = nrecvs;
  *fromranks         = franks;
  *(void **)fromdata = fdata;
  PetscFunctionReturn(0);
}
173 
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
/* Rendezvous via MPI_Reduce_scatter_block: like the Allreduce variant, but each rank receives only
 * its own element of the summed flag vector (its incoming message count) rather than the full array.
 * Arguments are as for PetscCommBuildTwoSided(). */
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata) {
  PetscMPIInt       size, *iflags, nrecvs, tag, *franks, i, flg;
  MPI_Aint          lb, unitbytes;
  char             *tdata, *fdata;
  MPI_Request      *reqs, *sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Counter_keyval, &counter, &flg));
  PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inner PETSc communicator does not have its tag/name counter attribute set");
  /* Reuse (or lazily create) the size-length scratch array cached on the inner communicator's counter */
  if (!counter->iflags) {
    PetscCall(PetscCalloc1(size, &counter->iflags));
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    PetscCall(PetscArrayzero(iflags, size));
  }
  for (i = 0; i < nto; i++) iflags[toranks[i]] = 1;
  /* Each rank gets only its own summed entry: the number of messages it will receive */
  PetscCallMPI(MPI_Reduce_scatter_block(iflags, &nrecvs, 1, MPI_INT, MPI_SUM, comm));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  PetscCall(PetscMalloc(nrecvs * count * unitbytes, &fdata));
  tdata = (char *)todata;
  PetscCall(PetscMalloc2(nto + nrecvs, &reqs, nto + nrecvs, &statuses));
  sendreqs = reqs + nrecvs; /* recv requests occupy reqs[0..nrecvs), send requests follow */
  for (i = 0; i < nrecvs; i++) PetscCallMPI(MPI_Irecv((void *)(fdata + count * unitbytes * i), count, dtype, MPI_ANY_SOURCE, tag, comm, reqs + i));
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Isend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  PetscCallMPI(MPI_Waitall(nto + nrecvs, reqs, statuses));
  PetscCall(PetscMalloc1(nrecvs, &franks));
  /* statuses[0..nrecvs) correspond to the MPI_ANY_SOURCE receives; record who sent each message */
  for (i = 0; i < nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  PetscCall(PetscFree2(reqs, statuses));
  PetscCall(PetscCommDestroy(&comm));

  *nfrom             = nrecvs;
  *fromranks         = franks;
  *(void **)fromdata = fdata;
  PetscFunctionReturn(0);
}
#endif
217 
/*@C
   PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

   Collective

   Input Parameters:
+  comm - communicator
.  count - number of entries to send/receive (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
-  todata - data to send to each rank (packed)

   Output Parameters:
+  nfrom - number of ranks receiving messages from
.  fromranks - ranks receiving messages from (length nfrom; caller should `PetscFree()`)
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for `PetscFree()`)

   Level: developer

   Options Database Key:
.  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication. Default is allreduce for communicators with <= 1024 ranks, otherwise ibarrier.

   Notes:
   This memory-scalable interface is an alternative to calling `PetscGatherNumberOfMessages()` and
   `PetscGatherMessageLengths()`, possibly with a subsequent round of communication to send other constant-size data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   References:
.  * - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
   Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`, `PetscCommBuildTwoSidedSetType()`, `PetscCommBuildTwoSidedType`
@*/
PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata) {
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

  PetscFunctionBegin;
  PetscCall(PetscSysInitializePackage());
  PetscCall(PetscLogEventSync(PETSC_BuildTwoSided, comm));
  PetscCall(PetscLogEventBegin(PETSC_BuildTwoSided, 0, 0, 0, 0));
  PetscCall(PetscCommBuildTwoSidedGetType(comm, &buildtype));
  /* Dispatch to the selected rendezvous implementation; when an algorithm was selected but the MPI
   * feature it needs was not detected at configure time, error out rather than silently substituting */
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    PetscCall(PetscCommBuildTwoSided_Ibarrier(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
    break;
#else
    /* SETERRQ returns from the function, so no break is needed and no fall-through can occur */
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE: PetscCall(PetscCommBuildTwoSided_Allreduce(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata)); break;
  case PETSC_BUILDTWOSIDED_REDSCATTER:
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
    PetscCall(PetscCommBuildTwoSided_RedScatter(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
    break;
#else
    /* SETERRQ returns from the function, so no break is needed and no fall-through can occur */
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
#endif
  default: SETERRQ(comm, PETSC_ERR_PLIB, "Unknown method for building two-sided communication");
  }
  PetscCall(PetscLogEventEnd(PETSC_BuildTwoSided, 0, 0, 0, 0));
  PetscFunctionReturn(0);
}
282 
283 static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx) {
284   PetscMPIInt  i, *tag;
285   MPI_Aint     lb, unitbytes;
286   MPI_Request *sendreq, *recvreq;
287 
288   PetscFunctionBegin;
289   PetscCall(PetscMalloc1(ntags, &tag));
290   if (ntags > 0) PetscCall(PetscCommDuplicate(comm, &comm, &tag[0]));
291   for (i = 1; i < ntags; i++) PetscCall(PetscCommGetNewTag(comm, &tag[i]));
292 
293   /* Perform complete initial rendezvous */
294   PetscCall(PetscCommBuildTwoSided(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
295 
296   PetscCall(PetscMalloc1(nto * ntags, &sendreq));
297   PetscCall(PetscMalloc1(*nfrom * ntags, &recvreq));
298 
299   PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
300   PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
301   for (i = 0; i < nto; i++) {
302     PetscMPIInt k;
303     for (k = 0; k < ntags; k++) sendreq[i * ntags + k] = MPI_REQUEST_NULL;
304     PetscCall((*send)(comm, tag, i, toranks[i], ((char *)todata) + count * unitbytes * i, sendreq + i * ntags, ctx));
305   }
306   for (i = 0; i < *nfrom; i++) {
307     void       *header = (*(char **)fromdata) + count * unitbytes * i;
308     PetscMPIInt k;
309     for (k = 0; k < ntags; k++) recvreq[i * ntags + k] = MPI_REQUEST_NULL;
310     PetscCall((*recv)(comm, tag, (*fromranks)[i], header, recvreq + i * ntags, ctx));
311   }
312   PetscCall(PetscFree(tag));
313   PetscCall(PetscCommDestroy(&comm));
314   *toreqs   = sendreq;
315   *fromreqs = recvreq;
316   PetscFunctionReturn(0);
317 }
318 
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)

/* NBX-based implementation of PetscCommBuildTwoSidedFReq(): overlaps the rendezvous (synchronous
 * sends + Iprobe + Ibarrier, as in PetscCommBuildTwoSided_Ibarrier()) with posting the user payload
 * callbacks, instead of completing the rendezvous first as the reference implementation does.
 * Arguments are as for PetscCommBuildTwoSidedFReq(). */
static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx) {
  PetscMPIInt    nrecvs, tag, *tags, done, i;
  MPI_Aint       lb, unitbytes;
  char          *tdata;
  MPI_Request   *sendreqs, *usendreqs, *req, barrier;
  PetscSegBuffer segrank, segdata, segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  PetscCall(PetscCommDuplicate(comm, &comm, &tag)); /* tag is for the rendezvous messages; tags[] are for user payload */
  PetscCall(PetscMalloc1(ntags, &tags));
  for (i = 0; i < ntags; i++) PetscCall(PetscCommGetNewTag(comm, &tags[i]));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  tdata = (char *)todata;
  PetscCall(PetscMalloc1(nto, &sendreqs));       /* rendezvous (synchronous) send requests, freed here */
  PetscCall(PetscMalloc1(nto * ntags, &usendreqs)); /* user payload send requests, returned to caller */
  /* Post synchronous sends */
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Issend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i = 0; i < nto; i++) {
    PetscMPIInt k;
    /* Pre-fill with MPI_REQUEST_NULL so callbacks may post fewer than ntags requests */
    for (k = 0; k < ntags; k++) usendreqs[i * ntags + k] = MPI_REQUEST_NULL;
    PetscCall((*send)(comm, tags, i, toranks[i], tdata + count * unitbytes * i, usendreqs + i * ntags, ctx));
  }

  /* Growable buffers because the number of incoming messages is unknown a priori */
  PetscCall(PetscSegBufferCreate(sizeof(PetscMPIInt), 4, &segrank));
  PetscCall(PetscSegBufferCreate(unitbytes, 4 * count, &segdata));
  PetscCall(PetscSegBufferCreate(sizeof(MPI_Request), 4, &segreq));

  nrecvs          = 0;
  barrier         = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done = 0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &flag, &status));
    if (flag) { /* incoming message */
      PetscMPIInt *recvrank, k;
      void        *buf;
      PetscCall(PetscSegBufferGet(segrank, 1, &recvrank));
      PetscCall(PetscSegBufferGet(segdata, count, &buf));
      *recvrank = status.MPI_SOURCE;
      PetscCallMPI(MPI_Recv(buf, count, dtype, status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE));
      /* Immediately let the user post payload receives for this newly-discovered sender */
      PetscCall(PetscSegBufferGet(segreq, ntags, &req));
      for (k = 0; k < ntags; k++) req[k] = MPI_REQUEST_NULL;
      PetscCall((*recv)(comm, tags, status.MPI_SOURCE, buf, req, ctx));
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent, nsends;
      PetscCall(PetscMPIIntCast(nto, &nsends));
      PetscCallMPI(MPI_Testall(nsends, sendreqs, &sent, MPI_STATUSES_IGNORE));
      if (sent) {
        /* All rendezvous sends were matched; join the consensus barrier but keep probing until it completes */
        PetscCallMPI(MPI_Ibarrier(comm, &barrier));
        barrier_started = PETSC_TRUE;
      }
    } else {
      PetscCallMPI(MPI_Test(&barrier, &done, MPI_STATUS_IGNORE));
    }
  }
  /* Hand results to the caller, who waits on and frees the request arrays (see PetscCommBuildTwoSidedF()) */
  *nfrom = nrecvs;
  PetscCall(PetscSegBufferExtractAlloc(segrank, fromranks));
  PetscCall(PetscSegBufferDestroy(&segrank));
  PetscCall(PetscSegBufferExtractAlloc(segdata, fromdata));
  PetscCall(PetscSegBufferDestroy(&segdata));
  *toreqs = usendreqs;
  PetscCall(PetscSegBufferExtractAlloc(segreq, fromreqs));
  PetscCall(PetscSegBufferDestroy(&segreq));
  PetscCall(PetscFree(sendreqs));
  PetscCall(PetscFree(tags));
  PetscCall(PetscCommDestroy(&comm));
  PetscFunctionReturn(0);
}
#endif
399 
400 /*@C
401    PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous
402 
403    Collective
404 
405    Input Parameters:
406 +  comm - communicator
407 .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
408 .  dtype - datatype to send/receive from each rank (must match on all ranks)
409 .  nto - number of ranks to send data to
410 .  toranks - ranks to send to (array of length nto)
411 .  todata - data to send to each rank (packed)
412 .  ntags - number of tags needed by send/recv callbacks
413 .  send - callback invoked on sending process when ready to send primary payload
414 .  recv - callback invoked on receiving process after delivery of rendezvous message
415 -  ctx - context for callbacks
416 
417    Output Parameters:
418 +  nfrom - number of ranks receiving messages from
419 .  fromranks - ranks receiving messages from (length nfrom; caller should `PetscFree()`)
420 -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for `PetscFree()`)
421 
422    Level: developer
423 
424    Notes:
425    This memory-scalable interface is an alternative to calling `PetscGatherNumberOfMessages()` and
426    `PetscGatherMessageLengths()`, possibly with a subsequent round of communication to send other data.
427 
428    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
429 
430    References:
431 .  * - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
432    Scalable communication protocols for dynamic sparse data exchange, 2010.
433 
434 .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedFReq()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
435 @*/
436 PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx) {
437   MPI_Request *toreqs, *fromreqs;
438 
439   PetscFunctionBegin;
440   PetscCall(PetscCommBuildTwoSidedFReq(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata, ntags, &toreqs, &fromreqs, send, recv, ctx));
441   PetscCallMPI(MPI_Waitall(nto * ntags, toreqs, MPI_STATUSES_IGNORE));
442   PetscCallMPI(MPI_Waitall(*nfrom * ntags, fromreqs, MPI_STATUSES_IGNORE));
443   PetscCall(PetscFree(toreqs));
444   PetscCall(PetscFree(fromreqs));
445   PetscFunctionReturn(0);
446 }
447 
/*@C
   PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests

   Collective

   Input Parameters:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Parameters:
+  nfrom - number of ranks receiving messages from
.  fromranks - ranks receiving messages from (length nfrom; caller should `PetscFree()`)
.  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for `PetscFree()`)
.  toreqs - array of nto*ntags sender requests (caller must wait on these, then `PetscFree()`)
-  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then `PetscFree()`)

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling `PetscGatherNumberOfMessages()` and
   `PetscGatherMessageLengths()`, possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   References:
.  * - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
   Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedF()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
@*/
PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx) {
  /* f points at the implementation with the same signature as this function (minus the log wrapping) */
  PetscErrorCode (*f)(MPI_Comm, PetscMPIInt, MPI_Datatype, PetscMPIInt, const PetscMPIInt[], const void *, PetscMPIInt *, PetscMPIInt **, void *, PetscMPIInt, MPI_Request **, MPI_Request **, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx);
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
  PetscMPIInt            i, size;

  PetscFunctionBegin;
  PetscCall(PetscSysInitializePackage());
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* Validate destination ranks before any communication is posted */
  for (i = 0; i < nto; i++) PetscCheck(toranks[i] >= 0 && size > toranks[i], comm, PETSC_ERR_ARG_OUTOFRANGE, "toranks[%d] %d not in comm size %d", i, toranks[i], size);
  PetscCall(PetscLogEventSync(PETSC_BuildTwoSidedF, comm));
  PetscCall(PetscLogEventBegin(PETSC_BuildTwoSidedF, 0, 0, 0, 0));
  PetscCall(PetscCommBuildTwoSidedGetType(comm, &buildtype));
  /* Only the Ibarrier algorithm has a specialized callback implementation; the dense algorithms
   * share the reference implementation, which rendezvouses fully before posting payload callbacks */
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    f = PetscCommBuildTwoSidedFReq_Ibarrier;
    break;
#else
    /* SETERRQ returns from the function, so no break is needed and no fall-through can occur */
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
  case PETSC_BUILDTWOSIDED_REDSCATTER: f = PetscCommBuildTwoSidedFReq_Reference; break;
  default: SETERRQ(comm, PETSC_ERR_PLIB, "Unknown method for building two-sided communication");
  }
  PetscCall((*f)(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata, ntags, toreqs, fromreqs, send, recv, ctx));
  PetscCall(PetscLogEventEnd(PETSC_BuildTwoSidedF, 0, 0, 0, 0));
  PetscFunctionReturn(0);
}
514