xref: /petsc/src/sys/utils/mpits.c (revision 48a46eb9bd028bec07ec0f396b1a3abb43f14558)
1f6ced4a3SJed Brown #include <petscsys.h> /*I  "petscsys.h"  I*/
295c0884eSLisandro Dalcin #include <petsc/private/petscimpl.h>
3f6ced4a3SJed Brown 
/* Profiling events for the rendezvous (basic and callback-based variants); registered by PetscSysInitializePackage() */
PetscLogEvent PETSC_BuildTwoSided;
PetscLogEvent PETSC_BuildTwoSidedF;

/* Names for PetscBuildTwoSidedType in the format expected by PetscOptionsGetEnum() (trailing prefix strings + NULL sentinel) */
const char *const PetscBuildTwoSidedTypes[] = {"ALLREDUCE", "IBARRIER", "REDSCATTER", "PetscBuildTwoSidedType", "PETSC_BUILDTWOSIDED_", NULL};

/* Process-global algorithm selection; NOTSET means a default is chosen lazily in PetscCommBuildTwoSidedGetType() */
static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;
10f6ced4a3SJed Brown 
116145cd65SJed Brown /*@
126145cd65SJed Brown    PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication
136145cd65SJed Brown 
146145cd65SJed Brown    Logically Collective
156145cd65SJed Brown 
164165533cSJose E. Roman    Input Parameters:
176145cd65SJed Brown +  comm - PETSC_COMM_WORLD
186145cd65SJed Brown -  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()
196145cd65SJed Brown 
206145cd65SJed Brown    Level: developer
216145cd65SJed Brown 
226145cd65SJed Brown    Note:
236145cd65SJed Brown    This option is currently global, but could be made per-communicator.
246145cd65SJed Brown 
25db781477SPatrick Sanan .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedGetType()`
266145cd65SJed Brown @*/
279371c9d4SSatish Balay PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm, PetscBuildTwoSidedType twosided) {
286145cd65SJed Brown   PetscFunctionBegin;
2976bd3646SJed Brown   if (PetscDefined(USE_DEBUG)) { /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
306145cd65SJed Brown     PetscMPIInt b1[2], b2[2];
316145cd65SJed Brown     b1[0] = -(PetscMPIInt)twosided;
326145cd65SJed Brown     b1[1] = (PetscMPIInt)twosided;
331c2dc1cbSBarry Smith     PetscCall(MPIU_Allreduce(b1, b2, 2, MPI_INT, MPI_MAX, comm));
3408401ef6SPierre Jolivet     PetscCheck(-b2[0] == b2[1], comm, PETSC_ERR_ARG_WRONG, "Enum value must be same on all processes");
356145cd65SJed Brown   }
366145cd65SJed Brown   _twosided_type = twosided;
376145cd65SJed Brown   PetscFunctionReturn(0);
386145cd65SJed Brown }
396145cd65SJed Brown 
406145cd65SJed Brown /*@
416145cd65SJed Brown    PetscCommBuildTwoSidedGetType - set algorithm to use when building two-sided communication
426145cd65SJed Brown 
436145cd65SJed Brown    Logically Collective
446145cd65SJed Brown 
454165533cSJose E. Roman    Output Parameters:
466145cd65SJed Brown +  comm - communicator on which to query algorithm
476145cd65SJed Brown -  twosided - algorithm to use for PetscCommBuildTwoSided()
486145cd65SJed Brown 
496145cd65SJed Brown    Level: developer
506145cd65SJed Brown 
51db781477SPatrick Sanan .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedSetType()`
526145cd65SJed Brown @*/
539371c9d4SSatish Balay PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm, PetscBuildTwoSidedType *twosided) {
5405342407SJunchao Zhang   PetscMPIInt size;
55f6ced4a3SJed Brown 
56f6ced4a3SJed Brown   PetscFunctionBegin;
576145cd65SJed Brown   *twosided = PETSC_BUILDTWOSIDED_NOTSET;
586145cd65SJed Brown   if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
599566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Comm_size(comm, &size));
6005342407SJunchao Zhang     _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE; /* default for small comms, see https://gitlab.com/petsc/petsc/-/merge_requests/2611 */
618ee3e7ecSJunchao Zhang #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
6205342407SJunchao Zhang     if (size > 1024) _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
636145cd65SJed Brown #endif
649566063dSJacob Faibussowitsch     PetscCall(PetscOptionsGetEnum(NULL, NULL, "-build_twosided", PetscBuildTwoSidedTypes, (PetscEnum *)&_twosided_type, NULL));
65f6ced4a3SJed Brown   }
66f6ced4a3SJed Brown   *twosided = _twosided_type;
67f6ced4a3SJed Brown   PetscFunctionReturn(0);
68f6ced4a3SJed Brown }
69f6ced4a3SJed Brown 
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
/* Dynamic sparse data exchange via the NBX algorithm (Hoefler, Siebert, Lumsdaine 2010):
   post synchronous sends to every destination, drain arrivals with Iprobe/Recv, and once
   our sends have all been matched enter a nonblocking barrier whose completion signals
   global termination.  Memory use is independent of communicator size. */
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata) {
  PetscMPIInt    nrecvs, tag, done, i;
  MPI_Aint       lb, unitbytes;
  char          *tdata;
  MPI_Request   *sendreqs, barrier;
  PetscSegBuffer segrank, segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  /* 'comm' is rebound to PETSc's inner communicator so the tag cannot collide with user traffic */
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  tdata = (char *)todata;
  PetscCall(PetscMalloc1(nto, &sendreqs));
  /* Synchronous sends: completion implies the receiver has matched the message */
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Issend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  /* Growable buffers since the number of incoming messages is unknown a priori */
  PetscCall(PetscSegBufferCreate(sizeof(PetscMPIInt), 4, &segrank));
  PetscCall(PetscSegBufferCreate(unitbytes, 4 * count, &segdata));

  nrecvs          = 0;
  barrier         = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done = 0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &flag, &status));
    if (flag) { /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      PetscCall(PetscSegBufferGet(segrank, 1, &recvrank));
      PetscCall(PetscSegBufferGet(segdata, count, &buf));
      *recvrank = status.MPI_SOURCE;
      PetscCallMPI(MPI_Recv(buf, count, dtype, status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE));
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent, nsends;
      PetscCall(PetscMPIIntCast(nto, &nsends));
      PetscCallMPI(MPI_Testall(nsends, sendreqs, &sent, MPI_STATUSES_IGNORE));
      if (sent) {
        /* all our sends were matched; once every rank reaches the barrier, no messages remain in flight */
        PetscCallMPI(MPI_Ibarrier(comm, &barrier));
        barrier_started = PETSC_TRUE;
        PetscCall(PetscFree(sendreqs));
      }
    } else {
      PetscCallMPI(MPI_Test(&barrier, &done, MPI_STATUS_IGNORE));
    }
  }
  *nfrom = nrecvs;
  /* hand ownership of the collected ranks/data to the caller (freed with PetscFree()) */
  PetscCall(PetscSegBufferExtractAlloc(segrank, fromranks));
  PetscCall(PetscSegBufferDestroy(&segrank));
  PetscCall(PetscSegBufferExtractAlloc(segdata, fromdata));
  PetscCall(PetscSegBufferDestroy(&segdata));
  PetscCall(PetscCommDestroy(&comm));
  PetscFunctionReturn(0);
}
#endif
129f6ced4a3SJed Brown 
/* Flag-array rendezvous: each rank marks its destinations in an array of length 'size';
   a summing allreduce then tells every rank how many messages it will receive.
   Memory use is O(size) per rank, so this is the default only for modest communicators. */
static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata) {
  PetscMPIInt       size, rank, *iflags, nrecvs, tag, *franks, i, flg;
  MPI_Aint          lb, unitbytes;
  char             *tdata, *fdata;
  MPI_Request      *reqs, *sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  /* 'comm' is rebound to PETSc's inner communicator */
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Counter_keyval, &counter, &flg));
  PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inner PETSc communicator does not have its tag/name counter attribute set");
  /* the flag array is cached on the inner communicator to avoid reallocating on every call */
  if (!counter->iflags) {
    PetscCall(PetscCalloc1(size, &counter->iflags));
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    PetscCall(PetscArrayzero(iflags, size));
  }
  for (i = 0; i < nto; i++) iflags[toranks[i]] = 1;
  /* after the in-place sum, iflags[rank] is the number of ranks that will send to us */
  PetscCall(MPIU_Allreduce(MPI_IN_PLACE, iflags, size, MPI_INT, MPI_SUM, comm));
  nrecvs = iflags[rank];
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  PetscCall(PetscMalloc(nrecvs * count * unitbytes, &fdata));
  tdata = (char *)todata;
  PetscCall(PetscMalloc2(nto + nrecvs, &reqs, nto + nrecvs, &statuses));
  sendreqs = reqs + nrecvs;
  /* receives occupy reqs[0..nrecvs-1], so statuses[0..nrecvs-1] below carry the source ranks */
  for (i = 0; i < nrecvs; i++) PetscCallMPI(MPI_Irecv((void *)(fdata + count * unitbytes * i), count, dtype, MPI_ANY_SOURCE, tag, comm, reqs + i));
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Isend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  PetscCallMPI(MPI_Waitall(nto + nrecvs, reqs, statuses));
  PetscCall(PetscMalloc1(nrecvs, &franks));
  for (i = 0; i < nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  PetscCall(PetscFree2(reqs, statuses));
  PetscCall(PetscCommDestroy(&comm));

  /* ownership of franks and fdata passes to the caller (freed with PetscFree()) */
  *nfrom             = nrecvs;
  *fromranks         = franks;
  *(void **)fromdata = fdata;
  PetscFunctionReturn(0);
}
173f6ced4a3SJed Brown 
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
/* Like the Allreduce variant, but uses MPI_Reduce_scatter_block so each rank receives only
   its own element of the summed flag array (its incoming-message count) instead of all 'size' sums. */
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata) {
  PetscMPIInt       size, *iflags, nrecvs, tag, *franks, i, flg;
  MPI_Aint          lb, unitbytes;
  char             *tdata, *fdata;
  MPI_Request      *reqs, *sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* 'comm' is rebound to PETSc's inner communicator */
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Counter_keyval, &counter, &flg));
  PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inner PETSc communicator does not have its tag/name counter attribute set");
  /* the flag array is cached on the inner communicator to avoid reallocating on every call */
  if (!counter->iflags) {
    PetscCall(PetscCalloc1(size, &counter->iflags));
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    PetscCall(PetscArrayzero(iflags, size));
  }
  for (i = 0; i < nto; i++) iflags[toranks[i]] = 1;
  /* nrecvs becomes the sum of this rank's column: the number of ranks that will send to us */
  PetscCallMPI(MPI_Reduce_scatter_block(iflags, &nrecvs, 1, MPI_INT, MPI_SUM, comm));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  PetscCall(PetscMalloc(nrecvs * count * unitbytes, &fdata));
  tdata = (char *)todata;
  PetscCall(PetscMalloc2(nto + nrecvs, &reqs, nto + nrecvs, &statuses));
  sendreqs = reqs + nrecvs;
  /* receives occupy reqs[0..nrecvs-1], so statuses[0..nrecvs-1] below carry the source ranks */
  for (i = 0; i < nrecvs; i++) PetscCallMPI(MPI_Irecv((void *)(fdata + count * unitbytes * i), count, dtype, MPI_ANY_SOURCE, tag, comm, reqs + i));
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Isend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  PetscCallMPI(MPI_Waitall(nto + nrecvs, reqs, statuses));
  PetscCall(PetscMalloc1(nrecvs, &franks));
  for (i = 0; i < nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  PetscCall(PetscFree2(reqs, statuses));
  PetscCall(PetscCommDestroy(&comm));

  /* ownership of franks and fdata passes to the caller (freed with PetscFree()) */
  *nfrom             = nrecvs;
  *fromranks         = franks;
  *(void **)fromdata = fdata;
  PetscFunctionReturn(0);
}
#endif
2171654bf6bSJed Brown 
/*@C
   PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

   Collective

   Input Parameters:
+  comm - communicator
.  count - number of entries to send/receive (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
-  todata - data to send to each rank (packed)

   Output Parameters:
+  nfrom - number of ranks receiving messages from
.  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())

   Level: developer

   Options Database Keys:
.  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication. Default is allreduce for communicators with <= 1024 ranks, otherwise ibarrier.

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   References:
.  * - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
   The ibarrier implementation uses the algorithm from this paper.

.seealso: `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
@*/
PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata) {
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

  PetscFunctionBegin;
  PetscCall(PetscSysInitializePackage());
  PetscCall(PetscLogEventSync(PETSC_BuildTwoSided, comm));
  PetscCall(PetscLogEventBegin(PETSC_BuildTwoSided, 0, 0, 0, 0));
  /* resolve the (possibly lazily defaulted) algorithm choice, then dispatch to the implementation */
  PetscCall(PetscCommBuildTwoSidedGetType(comm, &buildtype));
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    PetscCall(PetscCommBuildTwoSided_Ibarrier(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
    break;
#else
    /* when the feature is unavailable SETERRQ returns, so no fallthrough occurs */
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE: PetscCall(PetscCommBuildTwoSided_Allreduce(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata)); break;
  case PETSC_BUILDTWOSIDED_REDSCATTER:
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
    PetscCall(PetscCommBuildTwoSided_RedScatter(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
    break;
#else
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
#endif
  default: SETERRQ(comm, PETSC_ERR_PLIB, "Unknown method for building two-sided communication");
  }
  PetscCall(PetscLogEventEnd(PETSC_BuildTwoSided, 0, 0, 0, 0));
  PetscFunctionReturn(0);
}
282d815da10SJed Brown 
/* Reference implementation of the callback-based rendezvous: performs a complete
   PetscCommBuildTwoSided() first, then invokes the user's send/recv callbacks for the
   (typically larger) payload messages on separate tags.  The callback-created requests
   are returned to the caller via toreqs/fromreqs (caller waits, then PetscFree()s). */
static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx) {
  PetscMPIInt  i, *tag;
  MPI_Aint     lb, unitbytes;
  MPI_Request *sendreq, *recvreq;

  PetscFunctionBegin;
  PetscCall(PetscMalloc1(ntags, &tag));
  /* 'comm' is rebound to PETSc's inner communicator, which supplies the distinct tags */
  if (ntags > 0) PetscCall(PetscCommDuplicate(comm, &comm, &tag[0]));
  for (i = 1; i < ntags; i++) PetscCall(PetscCommGetNewTag(comm, &tag[i]));
  /* NOTE(review): if ntags == 0, 'comm' is never duplicated yet PetscCommDestroy() below still
     runs on the caller's communicator -- verify reference counting stays balanced in that case */

  /* Perform complete initial rendezvous */
  PetscCall(PetscCommBuildTwoSided(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));

  PetscCall(PetscMalloc1(nto * ntags, &sendreq));
  PetscCall(PetscMalloc1(*nfrom * ntags, &recvreq));

  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  for (i = 0; i < nto; i++) {
    PetscMPIInt k;
    /* pre-fill with null requests so a callback posting fewer than ntags requests is still waitable */
    for (k = 0; k < ntags; k++) sendreq[i * ntags + k] = MPI_REQUEST_NULL;
    PetscCall((*send)(comm, tag, i, toranks[i], ((char *)todata) + count * unitbytes * i, sendreq + i * ntags, ctx));
  }
  for (i = 0; i < *nfrom; i++) {
    void       *header = (*(char **)fromdata) + count * unitbytes * i;
    PetscMPIInt k;
    for (k = 0; k < ntags; k++) recvreq[i * ntags + k] = MPI_REQUEST_NULL;
    PetscCall((*recv)(comm, tag, (*fromranks)[i], header, recvreq + i * ntags, ctx));
  }
  PetscCall(PetscFree(tag));
  PetscCall(PetscCommDestroy(&comm));
  *toreqs   = sendreq;
  *fromreqs = recvreq;
  PetscFunctionReturn(0);
}
318d815da10SJed Brown 
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)

/* NBX-style callback rendezvous: like PetscCommBuildTwoSided_Ibarrier(), but the user's
   send/recv callbacks post the payload messages (on separate tags) within the same sweep,
   overlapping payload transfer with rank discovery. */
static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx) {
  PetscMPIInt    nrecvs, tag, *tags, done, i;
  MPI_Aint       lb, unitbytes;
  char          *tdata;
  MPI_Request   *sendreqs, *usendreqs, *req, barrier;
  PetscSegBuffer segrank, segdata, segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  /* 'comm' is rebound to PETSc's inner communicator; 'tag' carries the rendezvous, 'tags' the payloads */
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCall(PetscMalloc1(ntags, &tags));
  for (i = 0; i < ntags; i++) PetscCall(PetscCommGetNewTag(comm, &tags[i]));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  tdata = (char *)todata;
  PetscCall(PetscMalloc1(nto, &sendreqs));
  PetscCall(PetscMalloc1(nto * ntags, &usendreqs));
  /* Post synchronous sends */
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Issend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i = 0; i < nto; i++) {
    PetscMPIInt k;
    /* pre-fill with null requests so a callback posting fewer than ntags requests is still waitable */
    for (k = 0; k < ntags; k++) usendreqs[i * ntags + k] = MPI_REQUEST_NULL;
    PetscCall((*send)(comm, tags, i, toranks[i], tdata + count * unitbytes * i, usendreqs + i * ntags, ctx));
  }

  /* Growable buffers since the number of incoming messages is unknown a priori */
  PetscCall(PetscSegBufferCreate(sizeof(PetscMPIInt), 4, &segrank));
  PetscCall(PetscSegBufferCreate(unitbytes, 4 * count, &segdata));
  PetscCall(PetscSegBufferCreate(sizeof(MPI_Request), 4, &segreq));

  nrecvs          = 0;
  barrier         = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done = 0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &flag, &status));
    if (flag) { /* incoming message */
      PetscMPIInt *recvrank, k;
      void        *buf;
      PetscCall(PetscSegBufferGet(segrank, 1, &recvrank));
      PetscCall(PetscSegBufferGet(segdata, count, &buf));
      *recvrank = status.MPI_SOURCE;
      PetscCallMPI(MPI_Recv(buf, count, dtype, status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE));
      /* hand the rendezvous payload to the user's recv callback immediately */
      PetscCall(PetscSegBufferGet(segreq, ntags, &req));
      for (k = 0; k < ntags; k++) req[k] = MPI_REQUEST_NULL;
      PetscCall((*recv)(comm, tags, status.MPI_SOURCE, buf, req, ctx));
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent, nsends;
      PetscCall(PetscMPIIntCast(nto, &nsends));
      PetscCallMPI(MPI_Testall(nsends, sendreqs, &sent, MPI_STATUSES_IGNORE));
      if (sent) {
        /* all our sends were matched; once every rank reaches the barrier, no messages remain in flight */
        PetscCallMPI(MPI_Ibarrier(comm, &barrier));
        barrier_started = PETSC_TRUE;
      }
    } else {
      PetscCallMPI(MPI_Test(&barrier, &done, MPI_STATUS_IGNORE));
    }
  }
  *nfrom = nrecvs;
  /* hand ownership of ranks, data, and outstanding payload requests to the caller */
  PetscCall(PetscSegBufferExtractAlloc(segrank, fromranks));
  PetscCall(PetscSegBufferDestroy(&segrank));
  PetscCall(PetscSegBufferExtractAlloc(segdata, fromdata));
  PetscCall(PetscSegBufferDestroy(&segdata));
  *toreqs = usendreqs;
  PetscCall(PetscSegBufferExtractAlloc(segreq, fromreqs));
  PetscCall(PetscSegBufferDestroy(&segreq));
  PetscCall(PetscFree(sendreqs));
  PetscCall(PetscFree(tags));
  PetscCall(PetscCommDestroy(&comm));
  PetscFunctionReturn(0);
}
#endif
399afb254cdSJed Brown 
400d815da10SJed Brown /*@C
401d815da10SJed Brown    PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous
402d815da10SJed Brown 
403d083f849SBarry Smith    Collective
404d815da10SJed Brown 
4054165533cSJose E. Roman    Input Parameters:
406d815da10SJed Brown +  comm - communicator
407d815da10SJed Brown .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
408d815da10SJed Brown .  dtype - datatype to send/receive from each rank (must match on all ranks)
409d815da10SJed Brown .  nto - number of ranks to send data to
410d815da10SJed Brown .  toranks - ranks to send to (array of length nto)
411d815da10SJed Brown .  todata - data to send to each rank (packed)
412d815da10SJed Brown .  ntags - number of tags needed by send/recv callbacks
413d815da10SJed Brown .  send - callback invoked on sending process when ready to send primary payload
414d815da10SJed Brown .  recv - callback invoked on receiving process after delivery of rendezvous message
415d815da10SJed Brown -  ctx - context for callbacks
416d815da10SJed Brown 
4174165533cSJose E. Roman    Output Parameters:
418d815da10SJed Brown +  nfrom - number of ranks receiving messages from
419d815da10SJed Brown .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
420d815da10SJed Brown -  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
421d815da10SJed Brown 
422d815da10SJed Brown    Level: developer
423d815da10SJed Brown 
424d815da10SJed Brown    Notes:
425d815da10SJed Brown    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
426d815da10SJed Brown    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.
427d815da10SJed Brown 
428d815da10SJed Brown    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
429d815da10SJed Brown 
430d815da10SJed Brown    References:
431606c0280SSatish Balay .  * - Hoefler, Siebert and Lumsdaine, The MPI_Ibarrier implementation uses the algorithm in
4324f02bc6aSBarry Smith    Scalable communication protocols for dynamic sparse data exchange, 2010.
433d815da10SJed Brown 
434db781477SPatrick Sanan .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedFReq()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
435d815da10SJed Brown @*/
4369371c9d4SSatish Balay PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx) {
4376afbe7ddSJed Brown   MPI_Request *toreqs, *fromreqs;
4386afbe7ddSJed Brown 
4396afbe7ddSJed Brown   PetscFunctionBegin;
4409566063dSJacob Faibussowitsch   PetscCall(PetscCommBuildTwoSidedFReq(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata, ntags, &toreqs, &fromreqs, send, recv, ctx));
4419566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Waitall(nto * ntags, toreqs, MPI_STATUSES_IGNORE));
4429566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Waitall(*nfrom * ntags, fromreqs, MPI_STATUSES_IGNORE));
4439566063dSJacob Faibussowitsch   PetscCall(PetscFree(toreqs));
4449566063dSJacob Faibussowitsch   PetscCall(PetscFree(fromreqs));
4456afbe7ddSJed Brown   PetscFunctionReturn(0);
4466afbe7ddSJed Brown }
4476afbe7ddSJed Brown 
4486afbe7ddSJed Brown /*@C
4496afbe7ddSJed Brown    PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests
4506afbe7ddSJed Brown 
451d083f849SBarry Smith    Collective
4526afbe7ddSJed Brown 
4534165533cSJose E. Roman    Input Parameters:
4546afbe7ddSJed Brown +  comm - communicator
4556afbe7ddSJed Brown .  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
4566afbe7ddSJed Brown .  dtype - datatype to send/receive from each rank (must match on all ranks)
4576afbe7ddSJed Brown .  nto - number of ranks to send data to
4586afbe7ddSJed Brown .  toranks - ranks to send to (array of length nto)
4596afbe7ddSJed Brown .  todata - data to send to each rank (packed)
4606afbe7ddSJed Brown .  ntags - number of tags needed by send/recv callbacks
4616afbe7ddSJed Brown .  send - callback invoked on sending process when ready to send primary payload
4626afbe7ddSJed Brown .  recv - callback invoked on receiving process after delivery of rendezvous message
4636afbe7ddSJed Brown -  ctx - context for callbacks
4646afbe7ddSJed Brown 
4654165533cSJose E. Roman    Output Parameters:
4666afbe7ddSJed Brown +  nfrom - number of ranks receiving messages from
4676afbe7ddSJed Brown .  fromranks - ranks receiving messages from (length nfrom; caller should PetscFree())
4686afbe7ddSJed Brown .  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
4696afbe7ddSJed Brown .  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
4706afbe7ddSJed Brown -  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())
4716afbe7ddSJed Brown 
4726afbe7ddSJed Brown    Level: developer
4736afbe7ddSJed Brown 
4746afbe7ddSJed Brown    Notes:
4756afbe7ddSJed Brown    This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
4766afbe7ddSJed Brown    PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.
4776afbe7ddSJed Brown 
4786afbe7ddSJed Brown    Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.
4796afbe7ddSJed Brown 
4806afbe7ddSJed Brown    References:
.  * - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
   (The MPI_Ibarrier-based implementation uses the algorithm from this paper.)
4836afbe7ddSJed Brown 
484db781477SPatrick Sanan .seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedF()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
4856afbe7ddSJed Brown @*/
4869371c9d4SSatish Balay PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx) {
4879371c9d4SSatish Balay   PetscErrorCode (*f)(MPI_Comm, PetscMPIInt, MPI_Datatype, PetscMPIInt, const PetscMPIInt[], const void *, PetscMPIInt *, PetscMPIInt **, void *, PetscMPIInt, MPI_Request **, MPI_Request **, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx);
488d815da10SJed Brown   PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
4893461502dSJed Brown   PetscMPIInt            i, size;
490d815da10SJed Brown 
491d815da10SJed Brown   PetscFunctionBegin;
4929566063dSJacob Faibussowitsch   PetscCall(PetscSysInitializePackage());
4939566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
4949371c9d4SSatish Balay   for (i = 0; i < nto; i++) { PetscCheck(toranks[i] >= 0 && size > toranks[i], comm, PETSC_ERR_ARG_OUTOFRANGE, "toranks[%d] %d not in comm size %d", i, toranks[i], size); }
4959566063dSJacob Faibussowitsch   PetscCall(PetscLogEventSync(PETSC_BuildTwoSidedF, comm));
4969566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSC_BuildTwoSidedF, 0, 0, 0, 0));
4979566063dSJacob Faibussowitsch   PetscCall(PetscCommBuildTwoSidedGetType(comm, &buildtype));
498d815da10SJed Brown   switch (buildtype) {
499d815da10SJed Brown   case PETSC_BUILDTWOSIDED_IBARRIER:
5008ee3e7ecSJunchao Zhang #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
5016afbe7ddSJed Brown     f = PetscCommBuildTwoSidedFReq_Ibarrier;
502b458e8f1SJose E. Roman     break;
503afb254cdSJed Brown #else
504afb254cdSJed Brown     SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
505afb254cdSJed Brown #endif
506d815da10SJed Brown   case PETSC_BUILDTWOSIDED_ALLREDUCE:
5079371c9d4SSatish Balay   case PETSC_BUILDTWOSIDED_REDSCATTER: f = PetscCommBuildTwoSidedFReq_Reference; break;
508d815da10SJed Brown   default: SETERRQ(comm, PETSC_ERR_PLIB, "Unknown method for building two-sided communication");
509d815da10SJed Brown   }
5109566063dSJacob Faibussowitsch   PetscCall((*f)(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata, ntags, toreqs, fromreqs, send, recv, ctx));
5119566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSC_BuildTwoSidedF, 0, 0, 0, 0));
512d815da10SJed Brown   PetscFunctionReturn(0);
513d815da10SJed Brown }
514