xref: /petsc/src/sys/utils/mpits.c (revision 0619917b5a674bb687c64e7daba2ab22be99af31)
1 #include <petscsys.h> /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
/* Logging events for PetscCommBuildTwoSided() and PetscCommBuildTwoSidedF() below */
PetscLogEvent PETSC_BuildTwoSided;
PetscLogEvent PETSC_BuildTwoSidedF;

/* String names for PetscBuildTwoSidedType, consumed by PetscOptionsGetEnum() in
   PetscCommBuildTwoSidedGetType(); the trailing entries are the enum type name,
   value prefix, and NULL terminator that routine expects */
const char *const PetscBuildTwoSidedTypes[] = {"ALLREDUCE", "IBARRIER", "REDSCATTER", "PetscBuildTwoSidedType", "PETSC_BUILDTWOSIDED_", NULL};

/* Process-global algorithm choice cached by PetscCommBuildTwoSidedSetType()/GetType().
   NOTE(review): a file-scope identifier beginning with an underscore is reserved by the
   C standard — consider renaming (would require touching every use in this file). */
static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;
10 
11 /*@
12   PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication
13 
14   Logically Collective
15 
16   Input Parameters:
17 + comm     - communicator on which to set the algorithm (the setting is currently global, so `PETSC_COMM_WORLD` is typical; see Note)
18 - twosided - algorithm to use in subsequent calls to `PetscCommBuildTwoSided()`
19 
20   Level: developer
21 
22   Note:
23   This option is currently global, but could be made per-communicator.
24 
25 .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedGetType()`, `PetscBuildTwoSidedType`
26 @*/
27 PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm, PetscBuildTwoSidedType twosided)
28 {
29   PetscFunctionBegin;
30   if (PetscDefined(USE_DEBUG)) { /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
31     PetscMPIInt b1[2], b2[2];
32     b1[0] = -(PetscMPIInt)twosided;
33     b1[1] = (PetscMPIInt)twosided;
34     PetscCall(MPIU_Allreduce(b1, b2, 2, MPI_INT, MPI_MAX, comm));
35     PetscCheck(-b2[0] == b2[1], comm, PETSC_ERR_ARG_WRONG, "Enum value must be same on all processes");
36   }
37   _twosided_type = twosided;
38   PetscFunctionReturn(PETSC_SUCCESS);
39 }
40 
41 /*@
42   PetscCommBuildTwoSidedGetType - get algorithm used when building two-sided communication
43 
44   Logically Collective
45 
46   Input Parameter:
47 . comm - communicator on which to query the algorithm
48 
49   Output Parameter:
50 . twosided - algorithm to use for `PetscCommBuildTwoSided()`
49 
50   Level: developer
51 
52 .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedSetType()`, `PetscBuildTwoSidedType`
53 @*/
54 PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm, PetscBuildTwoSidedType *twosided)
55 {
56   PetscMPIInt size;
57 
58   PetscFunctionBegin;
59   *twosided = PETSC_BUILDTWOSIDED_NOTSET;
60   if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
61     PetscCallMPI(MPI_Comm_size(comm, &size));
62     _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE; /* default for small comms, see https://gitlab.com/petsc/petsc/-/merge_requests/2611 */
63 #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
64     if (size > 1024) _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
65 #endif
66     PetscCall(PetscOptionsGetEnum(NULL, NULL, "-build_twosided", PetscBuildTwoSidedTypes, (PetscEnum *)&_twosided_type, NULL));
67   }
68   *twosided = _twosided_type;
69   PetscFunctionReturn(PETSC_SUCCESS);
70 }
71 
72 #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
/*
  PetscCommBuildTwoSided_Ibarrier - dynamic sparse data exchange via a
  nonblocking-consensus algorithm (Hoefler, Siebert & Lumsdaine 2010).

  Each rank posts synchronous sends (MPI_Issend) of its rendezvous payload, then
  loops: receiving any probe-matched incoming message, and once all of its own
  sends have completed, entering a nonblocking barrier.  When the barrier
  completes, every rank's messages have been matched, so receiving can stop.
  Memory-scalable: no O(comm size) storage is used.
*/
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata)
{
  PetscMPIInt    nrecvs, tag, done, i;
  MPI_Aint       lb, unitbytes;
  char          *tdata;
  MPI_Request   *sendreqs, barrier;
  PetscSegBuffer segrank, segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  /* Work on PETSc's inner communicator with a fresh tag so probes cannot match user messages */
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  tdata = (char *)todata;
  PetscCall(PetscMalloc1(nto, &sendreqs));
  /* Synchronous sends: completion implies the receiver has begun matching the message */
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Issend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  /* The number of incoming messages is unknown, so collect them in growable segmented buffers */
  PetscCall(PetscSegBufferCreate(sizeof(PetscMPIInt), 4, &segrank));
  PetscCall(PetscSegBufferCreate(unitbytes, 4 * count, &segdata));

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done = 0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &flag, &status));
    if (flag) { /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      PetscCall(PetscSegBufferGet(segrank, 1, &recvrank));
      PetscCall(PetscSegBufferGet(segdata, count, &buf));
      *recvrank = status.MPI_SOURCE;
      PetscCallMPI(MPI_Recv(buf, count, dtype, status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE));
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent, nsends;
      PetscCall(PetscMPIIntCast(nto, &nsends));
      PetscCallMPI(MPI_Testall(nsends, sendreqs, &sent, MPI_STATUSES_IGNORE));
      if (sent) {
        /* All of our sends were matched; announce completion via the nonblocking barrier */
        PetscCallMPI(MPI_Ibarrier(comm, &barrier));
        barrier_started = PETSC_TRUE;
        PetscCall(PetscFree(sendreqs));
      }
    } else {
      /* Keep draining messages until every rank has entered the barrier */
      PetscCallMPI(MPI_Test(&barrier, &done, MPI_STATUS_IGNORE));
    }
  }
  *nfrom = nrecvs;
  /* Hand ownership of the accumulated ranks/data to the caller as PetscMalloc'd arrays */
  PetscCall(PetscSegBufferExtractAlloc(segrank, fromranks));
  PetscCall(PetscSegBufferDestroy(&segrank));
  PetscCall(PetscSegBufferExtractAlloc(segdata, fromdata));
  PetscCall(PetscSegBufferDestroy(&segdata));
  PetscCall(PetscCommDestroy(&comm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
131 #endif
132 
133 static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata)
134 {
135   PetscMPIInt       size, rank, *iflags, nrecvs, tag, *franks, i, flg;
136   MPI_Aint          lb, unitbytes;
137   char             *tdata, *fdata;
138   MPI_Request      *reqs, *sendreqs;
139   MPI_Status       *statuses;
140   PetscCommCounter *counter;
141 
142   PetscFunctionBegin;
143   PetscCallMPI(MPI_Comm_size(comm, &size));
144   PetscCallMPI(MPI_Comm_rank(comm, &rank));
145   PetscCall(PetscCommDuplicate(comm, &comm, &tag));
146   PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Counter_keyval, &counter, &flg));
147   PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inner PETSc communicator does not have its tag/name counter attribute set");
148   if (!counter->iflags) {
149     PetscCall(PetscCalloc1(size, &counter->iflags));
150     iflags = counter->iflags;
151   } else {
152     iflags = counter->iflags;
153     PetscCall(PetscArrayzero(iflags, size));
154   }
155   for (i = 0; i < nto; i++) iflags[toranks[i]] = 1;
156   PetscCall(MPIU_Allreduce(MPI_IN_PLACE, iflags, size, MPI_INT, MPI_SUM, comm));
157   nrecvs = iflags[rank];
158   PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
159   PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
160   PetscCall(PetscMalloc(nrecvs * count * unitbytes, &fdata));
161   tdata = (char *)todata;
162   PetscCall(PetscMalloc2(nto + nrecvs, &reqs, nto + nrecvs, &statuses));
163   sendreqs = reqs + nrecvs;
164   for (i = 0; i < nrecvs; i++) PetscCallMPI(MPI_Irecv((void *)(fdata + count * unitbytes * i), count, dtype, MPI_ANY_SOURCE, tag, comm, reqs + i));
165   for (i = 0; i < nto; i++) PetscCallMPI(MPI_Isend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
166   PetscCallMPI(MPI_Waitall(nto + nrecvs, reqs, statuses));
167   PetscCall(PetscMalloc1(nrecvs, &franks));
168   for (i = 0; i < nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
169   PetscCall(PetscFree2(reqs, statuses));
170   PetscCall(PetscCommDestroy(&comm));
171 
172   *nfrom             = nrecvs;
173   *fromranks         = franks;
174   *(void **)fromdata = fdata;
175   PetscFunctionReturn(PETSC_SUCCESS);
176 }
177 
178 #if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
179 static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata)
180 {
181   PetscMPIInt       size, *iflags, nrecvs, tag, *franks, i, flg;
182   MPI_Aint          lb, unitbytes;
183   char             *tdata, *fdata;
184   MPI_Request      *reqs, *sendreqs;
185   MPI_Status       *statuses;
186   PetscCommCounter *counter;
187 
188   PetscFunctionBegin;
189   PetscCallMPI(MPI_Comm_size(comm, &size));
190   PetscCall(PetscCommDuplicate(comm, &comm, &tag));
191   PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Counter_keyval, &counter, &flg));
192   PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inner PETSc communicator does not have its tag/name counter attribute set");
193   if (!counter->iflags) {
194     PetscCall(PetscCalloc1(size, &counter->iflags));
195     iflags = counter->iflags;
196   } else {
197     iflags = counter->iflags;
198     PetscCall(PetscArrayzero(iflags, size));
199   }
200   for (i = 0; i < nto; i++) iflags[toranks[i]] = 1;
201   PetscCallMPI(MPI_Reduce_scatter_block(iflags, &nrecvs, 1, MPI_INT, MPI_SUM, comm));
202   PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
203   PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
204   PetscCall(PetscMalloc(nrecvs * count * unitbytes, &fdata));
205   tdata = (char *)todata;
206   PetscCall(PetscMalloc2(nto + nrecvs, &reqs, nto + nrecvs, &statuses));
207   sendreqs = reqs + nrecvs;
208   for (i = 0; i < nrecvs; i++) PetscCallMPI(MPI_Irecv((void *)(fdata + count * unitbytes * i), count, dtype, MPI_ANY_SOURCE, tag, comm, reqs + i));
209   for (i = 0; i < nto; i++) PetscCallMPI(MPI_Isend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
210   PetscCallMPI(MPI_Waitall(nto + nrecvs, reqs, statuses));
211   PetscCall(PetscMalloc1(nrecvs, &franks));
212   for (i = 0; i < nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
213   PetscCall(PetscFree2(reqs, statuses));
214   PetscCall(PetscCommDestroy(&comm));
215 
216   *nfrom             = nrecvs;
217   *fromranks         = franks;
218   *(void **)fromdata = fdata;
219   PetscFunctionReturn(PETSC_SUCCESS);
220 }
221 #endif
222 
223 /*@C
224   PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)
225 
226   Collective
227 
228   Input Parameters:
229 + comm    - communicator
230 . count   - number of entries to send/receive (must match on all ranks)
231 . dtype   - datatype to send/receive from each rank (must match on all ranks)
232 . nto     - number of ranks to send data to
233 . toranks - ranks to send to (array of length nto)
234 - todata  - data to send to each rank (packed)
235 
236   Output Parameters:
237 + nfrom     - number of ranks receiving messages from
238 . fromranks - ranks receiving messages from (length `nfrom`, caller should `PetscFree()`)
239 - fromdata  - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for `PetscFree()`)
240 
241   Options Database Key:
242 . -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication. Default is allreduce for communicators with <= 1024 ranks,
243                    otherwise ibarrier.
244 
245   Level: developer
246 
247   Notes:
248   This memory-scalable interface is an alternative to calling `PetscGatherNumberOfMessages()` and
249   `PetscGatherMessageLengths()`, possibly with a subsequent round of communication to send other constant-size data.
250 
251   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
252 
253   References:
254 .  * - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
255   Scalable communication protocols for dynamic sparse data exchange, 2010.
256 
257 .seealso: `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`, `PetscCommBuildTwoSidedSetType()`, `PetscCommBuildTwoSidedType`
258 @*/
PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata)
{
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

  PetscFunctionBegin;
  PetscCall(PetscSysInitializePackage());
  PetscCall(PetscLogEventSync(PETSC_BuildTwoSided, comm));
  PetscCall(PetscLogEventBegin(PETSC_BuildTwoSided, 0, 0, 0, 0));
  /* Resolve the algorithm choice (lazily defaulted on first call, -build_twosided overrides) */
  PetscCall(PetscCommBuildTwoSidedGetType(comm, &buildtype));
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    PetscCall(PetscCommBuildTwoSided_Ibarrier(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
    break;
#else
    /* SETERRQ returns immediately, so there is no fallthrough into the next case */
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
    PetscCall(PetscCommBuildTwoSided_Allreduce(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
    break;
  case PETSC_BUILDTWOSIDED_REDSCATTER:
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
    PetscCall(PetscCommBuildTwoSided_RedScatter(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
    break;
#else
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
#endif
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unknown method for building two-sided communication");
  }
  PetscCall(PetscLogEventEnd(PETSC_BuildTwoSided, 0, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
292 
293 static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx)
294 {
295   PetscMPIInt  i, *tag;
296   MPI_Aint     lb, unitbytes;
297   MPI_Request *sendreq, *recvreq;
298 
299   PetscFunctionBegin;
300   PetscCall(PetscMalloc1(ntags, &tag));
301   if (ntags > 0) PetscCall(PetscCommDuplicate(comm, &comm, &tag[0]));
302   for (i = 1; i < ntags; i++) PetscCall(PetscCommGetNewTag(comm, &tag[i]));
303 
304   /* Perform complete initial rendezvous */
305   PetscCall(PetscCommBuildTwoSided(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
306 
307   PetscCall(PetscMalloc1(nto * ntags, &sendreq));
308   PetscCall(PetscMalloc1(*nfrom * ntags, &recvreq));
309 
310   PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
311   PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
312   for (i = 0; i < nto; i++) {
313     PetscMPIInt k;
314     for (k = 0; k < ntags; k++) sendreq[i * ntags + k] = MPI_REQUEST_NULL;
315     PetscCall((*send)(comm, tag, i, toranks[i], ((char *)todata) + count * unitbytes * i, sendreq + i * ntags, ctx));
316   }
317   for (i = 0; i < *nfrom; i++) {
318     void       *header = (*(char **)fromdata) + count * unitbytes * i;
319     PetscMPIInt k;
320     for (k = 0; k < ntags; k++) recvreq[i * ntags + k] = MPI_REQUEST_NULL;
321     PetscCall((*recv)(comm, tag, (*fromranks)[i], header, recvreq + i * ntags, ctx));
322   }
323   PetscCall(PetscFree(tag));
324   PetscCall(PetscCommDestroy(&comm));
325   *toreqs   = sendreq;
326   *fromreqs = recvreq;
327   PetscFunctionReturn(PETSC_SUCCESS);
328 }
329 
330 #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
331 
/*
  PetscCommBuildTwoSidedFReq_Ibarrier - nonblocking-consensus rendezvous (as in
  PetscCommBuildTwoSided_Ibarrier) that additionally invokes the user send/recv
  callbacks to post payload transfers, returning their requests to the caller.

  NOTE(review): the buffer handed to the recv callback lives in segdata, which is
  copied into a fresh allocation by PetscSegBufferExtractAlloc() below — the
  callback presumably must not post receives *into* that buffer; confirm against
  callers.
*/
static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx)
{
  PetscMPIInt    nrecvs, tag, *tags, done, i;
  MPI_Aint       lb, unitbytes;
  char          *tdata;
  MPI_Request   *sendreqs, *usendreqs, *req, barrier;
  PetscSegBuffer segrank, segdata, segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  /* Inner communicator: tag for the rendezvous, tags[] for the payload callbacks */
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCall(PetscMalloc1(ntags, &tags));
  for (i = 0; i < ntags; i++) PetscCall(PetscCommGetNewTag(comm, &tags[i]));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  tdata = (char *)todata;
  PetscCall(PetscMalloc1(nto, &sendreqs));
  PetscCall(PetscMalloc1(nto * ntags, &usendreqs));
  /* Post synchronous sends */
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Issend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i = 0; i < nto; i++) {
    PetscMPIInt k;
    /* Preset to MPI_REQUEST_NULL so request slots the callback does not fill are harmless to wait on */
    for (k = 0; k < ntags; k++) usendreqs[i * ntags + k] = MPI_REQUEST_NULL;
    PetscCall((*send)(comm, tags, i, toranks[i], tdata + count * unitbytes * i, usendreqs + i * ntags, ctx));
  }

  /* The number of incoming messages is unknown, so collect ranks/data/requests in growable segmented buffers */
  PetscCall(PetscSegBufferCreate(sizeof(PetscMPIInt), 4, &segrank));
  PetscCall(PetscSegBufferCreate(unitbytes, 4 * count, &segdata));
  PetscCall(PetscSegBufferCreate(sizeof(MPI_Request), 4, &segreq));

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done = 0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &flag, &status));
    if (flag) { /* incoming message */
      PetscMPIInt *recvrank, k;
      void        *buf;
      PetscCall(PetscSegBufferGet(segrank, 1, &recvrank));
      PetscCall(PetscSegBufferGet(segdata, count, &buf));
      *recvrank = status.MPI_SOURCE;
      PetscCallMPI(MPI_Recv(buf, count, dtype, status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE));
      PetscCall(PetscSegBufferGet(segreq, ntags, &req));
      for (k = 0; k < ntags; k++) req[k] = MPI_REQUEST_NULL;
      /* Let the user post payload receives for this newly discovered sender */
      PetscCall((*recv)(comm, tags, status.MPI_SOURCE, buf, req, ctx));
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent, nsends;
      PetscCall(PetscMPIIntCast(nto, &nsends));
      PetscCallMPI(MPI_Testall(nsends, sendreqs, &sent, MPI_STATUSES_IGNORE));
      if (sent) {
        /* All of our synchronous sends were matched; announce completion via the nonblocking barrier */
        PetscCallMPI(MPI_Ibarrier(comm, &barrier));
        barrier_started = PETSC_TRUE;
      }
    } else {
      /* Keep draining messages until every rank has entered the barrier */
      PetscCallMPI(MPI_Test(&barrier, &done, MPI_STATUS_IGNORE));
    }
  }
  *nfrom = nrecvs;
  /* Hand ownership of the accumulated ranks/data/requests to the caller as PetscMalloc'd arrays */
  PetscCall(PetscSegBufferExtractAlloc(segrank, fromranks));
  PetscCall(PetscSegBufferDestroy(&segrank));
  PetscCall(PetscSegBufferExtractAlloc(segdata, fromdata));
  PetscCall(PetscSegBufferDestroy(&segdata));
  *toreqs = usendreqs;
  PetscCall(PetscSegBufferExtractAlloc(segreq, fromreqs));
  PetscCall(PetscSegBufferDestroy(&segreq));
  PetscCall(PetscFree(sendreqs));
  PetscCall(PetscFree(tags));
  PetscCall(PetscCommDestroy(&comm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
410 #endif
411 
412 /*@C
413   PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous
414 
415   Collective
416 
417   Input Parameters:
418 + comm    - communicator
419 . count   - number of entries to send/receive in initial rendezvous (must match on all ranks)
420 . dtype   - datatype to send/receive from each rank (must match on all ranks)
421 . nto     - number of ranks to send data to
422 . toranks - ranks to send to (array of length nto)
423 . todata  - data to send to each rank (packed)
424 . ntags   - number of tags needed by send/recv callbacks
425 . send    - callback invoked on sending process when ready to send primary payload
426 . recv    - callback invoked on receiving process after delivery of rendezvous message
427 - ctx     - context for callbacks
428 
429   Output Parameters:
430 + nfrom     - number of ranks receiving messages from
431 . fromranks - ranks receiving messages from (length nfrom; caller should `PetscFree()`)
432 - fromdata  - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for `PetscFree()`)
433 
434   Level: developer
435 
436   Notes:
437   This memory-scalable interface is an alternative to calling `PetscGatherNumberOfMessages()` and
438   `PetscGatherMessageLengths()`, possibly with a subsequent round of communication to send other data.
439 
440   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
441 
442   References:
443 .  * - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
444   Scalable communication protocols for dynamic sparse data exchange, 2010.
445 
446 .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedFReq()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
447 @*/
448 PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx)
449 {
450   MPI_Request *toreqs, *fromreqs;
451 
452   PetscFunctionBegin;
453   PetscCall(PetscCommBuildTwoSidedFReq(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata, ntags, &toreqs, &fromreqs, send, recv, ctx));
454   PetscCallMPI(MPI_Waitall(nto * ntags, toreqs, MPI_STATUSES_IGNORE));
455   PetscCallMPI(MPI_Waitall(*nfrom * ntags, fromreqs, MPI_STATUSES_IGNORE));
456   PetscCall(PetscFree(toreqs));
457   PetscCall(PetscFree(fromreqs));
458   PetscFunctionReturn(PETSC_SUCCESS);
459 }
460 
461 /*@C
462   PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests
463 
464   Collective
465 
466   Input Parameters:
467 + comm    - communicator
468 . count   - number of entries to send/receive in initial rendezvous (must match on all ranks)
469 . dtype   - datatype to send/receive from each rank (must match on all ranks)
470 . nto     - number of ranks to send data to
471 . toranks - ranks to send to (array of length nto)
472 . todata  - data to send to each rank (packed)
473 . ntags   - number of tags needed by send/recv callbacks
474 . send    - callback invoked on sending process when ready to send primary payload
475 . recv    - callback invoked on receiving process after delivery of rendezvous message
476 - ctx     - context for callbacks
477 
478   Output Parameters:
479 + nfrom     - number of ranks receiving messages from
480 . fromranks - ranks receiving messages from (length nfrom; caller should `PetscFree()`)
481 . fromdata  - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for `PetscFree()`)
482 . toreqs    - array of nto*ntags sender requests (caller must wait on these, then `PetscFree()`)
483 - fromreqs  - array of nfrom*ntags receiver requests (caller must wait on these, then `PetscFree()`)
484 
485   Level: developer
486 
487   Notes:
488   This memory-scalable interface is an alternative to calling `PetscGatherNumberOfMessages()` and
489   `PetscGatherMessageLengths()`, possibly with a subsequent round of communication to send other data.
490 
491   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
492 
493   References:
494 .  * - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
495   Scalable communication protocols for dynamic sparse data exchange, 2010.
496 
497 .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedF()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
498 @*/
PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx)
{
  /* f is set below to the implementation matching the selected algorithm */
  PetscErrorCode (*f)(MPI_Comm, PetscMPIInt, MPI_Datatype, PetscMPIInt, const PetscMPIInt[], const void *, PetscMPIInt *, PetscMPIInt **, void *, PetscMPIInt, MPI_Request **, MPI_Request **, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx);
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
  PetscMPIInt            i, size;

  PetscFunctionBegin;
  PetscCall(PetscSysInitializePackage());
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* Validate target ranks up front so errors surface here rather than deep inside MPI */
  for (i = 0; i < nto; i++) PetscCheck(toranks[i] >= 0 && size > toranks[i], comm, PETSC_ERR_ARG_OUTOFRANGE, "toranks[%d] %d not in comm size %d", i, toranks[i], size);
  PetscCall(PetscLogEventSync(PETSC_BuildTwoSidedF, comm));
  PetscCall(PetscLogEventBegin(PETSC_BuildTwoSidedF, 0, 0, 0, 0));
  PetscCall(PetscCommBuildTwoSidedGetType(comm, &buildtype));
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    f = PetscCommBuildTwoSidedFReq_Ibarrier;
    break;
#else
    /* SETERRQ returns immediately, so there is no fallthrough into the next case */
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
  case PETSC_BUILDTWOSIDED_REDSCATTER:
    /* Both collective-based types share the reference implementation built on PetscCommBuildTwoSided() */
    f = PetscCommBuildTwoSidedFReq_Reference;
    break;
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unknown method for building two-sided communication");
  }
  PetscCall((*f)(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata, ntags, toreqs, fromreqs, send, recv, ctx));
  PetscCall(PetscLogEventEnd(PETSC_BuildTwoSidedF, 0, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
531