#include <petscsys.h> /*I "petscsys.h" I*/
#include <petsc/private/petscimpl.h>

PetscLogEvent PETSC_BuildTwoSided;
PetscLogEvent PETSC_BuildTwoSidedF;

const char *const PetscBuildTwoSidedTypes[] = {"ALLREDUCE", "IBARRIER", "REDSCATTER", "PetscBuildTwoSidedType", "PETSC_BUILDTWOSIDED_", NULL};

static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;

/*@
  PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

  Logically Collective

  Input Parameters:
+ comm - PETSC_COMM_WORLD
- twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()

  Level: developer

  Note:
  This option is currently global, but could be made per-communicator.

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedGetType()`
@*/
PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm, PetscBuildTwoSidedType twosided) {
  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) { /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
    PetscMPIInt b1[2], b2[2];
    b1[0] = -(PetscMPIInt)twosided;
    b1[1] = (PetscMPIInt)twosided;
    PetscCall(MPIU_Allreduce(b1, b2, 2, MPI_INT, MPI_MAX, comm));
    PetscCheck(-b2[0] == b2[1], comm, PETSC_ERR_ARG_WRONG, "Enum value must be same on all processes");
  }
  _twosided_type = twosided;
  PetscFunctionReturn(0);
}
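/*
  Example (a minimal sketch, not taken from the PETSc test suite): select the nonblocking-consensus
  algorithm for all subsequent rendezvous, assuming the MPI implementation provides MPI_Ibarrier.

    PetscCall(PetscCommBuildTwoSidedSetType(PETSC_COMM_WORLD, PETSC_BUILDTWOSIDED_IBARRIER));

  The same choice can be made at runtime with the option -build_twosided ibarrier.
*/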
/*@
  PetscCommBuildTwoSidedGetType - get algorithm to use when building two-sided communication

  Logically Collective

  Input Parameter:
. comm - communicator on which to query algorithm

  Output Parameter:
. twosided - algorithm to use for PetscCommBuildTwoSided()

  Level: developer

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedSetType()`
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm, PetscBuildTwoSidedType *twosided) {
  PetscMPIInt size;

  PetscFunctionBegin;
  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
    PetscCallMPI(MPI_Comm_size(comm, &size));
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE; /* default for small comms, see https://gitlab.com/petsc/petsc/-/merge_requests/2611 */
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    if (size > 1024) _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
#endif
    PetscCall(PetscOptionsGetEnum(NULL, NULL, "-build_twosided", PetscBuildTwoSidedTypes, (PetscEnum *)&_twosided_type, NULL));
  }
  *twosided = _twosided_type;
  PetscFunctionReturn(0);
}
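/*
  The Ibarrier implementation below follows the "nonblocking consensus" (NBX) pattern from the
  Hoefler, Siebert and Lumsdaine (2010) reference cited in the man pages: each rank posts synchronous
  sends (MPI_Issend) of its outgoing entries and then loops probing for incoming messages. Once all of
  its own sends have been matched it enters a nonblocking barrier (MPI_Ibarrier) and keeps receiving
  until the barrier completes, at which point every message in the exchange has been delivered.
*/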
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata) {
  PetscMPIInt    nrecvs, tag, done, i;
  MPI_Aint       lb, unitbytes;
  char          *tdata;
  MPI_Request   *sendreqs, barrier;
  PetscSegBuffer segrank, segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  tdata = (char *)todata;
  PetscCall(PetscMalloc1(nto, &sendreqs));
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Issend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  PetscCall(PetscSegBufferCreate(sizeof(PetscMPIInt), 4, &segrank));
  PetscCall(PetscSegBufferCreate(unitbytes, 4 * count, &segdata));

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases. This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done = 0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &flag, &status));
    if (flag) { /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      PetscCall(PetscSegBufferGet(segrank, 1, &recvrank));
      PetscCall(PetscSegBufferGet(segdata, count, &buf));
      *recvrank = status.MPI_SOURCE;
      PetscCallMPI(MPI_Recv(buf, count, dtype, status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE));
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent, nsends;
      PetscCall(PetscMPIIntCast(nto, &nsends));
      PetscCallMPI(MPI_Testall(nsends, sendreqs, &sent, MPI_STATUSES_IGNORE));
      if (sent) {
        PetscCallMPI(MPI_Ibarrier(comm, &barrier));
        barrier_started = PETSC_TRUE;
        PetscCall(PetscFree(sendreqs));
      }
    } else {
      PetscCallMPI(MPI_Test(&barrier, &done, MPI_STATUS_IGNORE));
    }
  }
  *nfrom = nrecvs;
  PetscCall(PetscSegBufferExtractAlloc(segrank, fromranks));
  PetscCall(PetscSegBufferDestroy(&segrank));
  PetscCall(PetscSegBufferExtractAlloc(segdata, fromdata));
  PetscCall(PetscSegBufferDestroy(&segdata));
  PetscCall(PetscCommDestroy(&comm));
  PetscFunctionReturn(0);
}
#endif
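/*
  The Allreduce implementation counts incoming messages collectively: each rank marks its destination
  ranks in an integer flag array, the arrays are summed across the communicator with MPIU_Allreduce,
  and entry [rank] of the result is the number of messages this rank will receive. The payloads then
  move with ordinary nonblocking point-to-point sends and receives.
*/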
static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata) {
  PetscMPIInt       size, rank, *iflags, nrecvs, tag, *franks, i, flg;
  MPI_Aint          lb, unitbytes;
  char             *tdata, *fdata;
  MPI_Request      *reqs, *sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Counter_keyval, &counter, &flg));
  PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inner PETSc communicator does not have its tag/name counter attribute set");
  if (!counter->iflags) {
    PetscCall(PetscCalloc1(size, &counter->iflags));
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    PetscCall(PetscArrayzero(iflags, size));
  }
  for (i = 0; i < nto; i++) iflags[toranks[i]] = 1;
  PetscCall(MPIU_Allreduce(MPI_IN_PLACE, iflags, size, MPI_INT, MPI_SUM, comm));
  nrecvs = iflags[rank];
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  PetscCall(PetscMalloc(nrecvs * count * unitbytes, &fdata));
  tdata = (char *)todata;
  PetscCall(PetscMalloc2(nto + nrecvs, &reqs, nto + nrecvs, &statuses));
  sendreqs = reqs + nrecvs;
  for (i = 0; i < nrecvs; i++) PetscCallMPI(MPI_Irecv((void *)(fdata + count * unitbytes * i), count, dtype, MPI_ANY_SOURCE, tag, comm, reqs + i));
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Isend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  PetscCallMPI(MPI_Waitall(nto + nrecvs, reqs, statuses));
  PetscCall(PetscMalloc1(nrecvs, &franks));
  for (i = 0; i < nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  PetscCall(PetscFree2(reqs, statuses));
  PetscCall(PetscCommDestroy(&comm));

  *nfrom             = nrecvs;
  *fromranks         = franks;
  *(void **)fromdata = fdata;
  PetscFunctionReturn(0);
}
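/*
  The RedScatter implementation uses the same flag-counting idea as the Allreduce implementation, but
  calls MPI_Reduce_scatter_block so each rank receives only its own entry of the summed flag array
  rather than the entire vector.
*/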
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata) {
  PetscMPIInt       size, *iflags, nrecvs, tag, *franks, i, flg;
  MPI_Aint          lb, unitbytes;
  char             *tdata, *fdata;
  MPI_Request      *reqs, *sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Counter_keyval, &counter, &flg));
  PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inner PETSc communicator does not have its tag/name counter attribute set");
  if (!counter->iflags) {
    PetscCall(PetscCalloc1(size, &counter->iflags));
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    PetscCall(PetscArrayzero(iflags, size));
  }
  for (i = 0; i < nto; i++) iflags[toranks[i]] = 1;
  PetscCallMPI(MPI_Reduce_scatter_block(iflags, &nrecvs, 1, MPI_INT, MPI_SUM, comm));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  PetscCall(PetscMalloc(nrecvs * count * unitbytes, &fdata));
  tdata = (char *)todata;
  PetscCall(PetscMalloc2(nto + nrecvs, &reqs, nto + nrecvs, &statuses));
  sendreqs = reqs + nrecvs;
  for (i = 0; i < nrecvs; i++) PetscCallMPI(MPI_Irecv((void *)(fdata + count * unitbytes * i), count, dtype, MPI_ANY_SOURCE, tag, comm, reqs + i));
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Isend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  PetscCallMPI(MPI_Waitall(nto + nrecvs, reqs, statuses));
  PetscCall(PetscMalloc1(nrecvs, &franks));
  for (i = 0; i < nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  PetscCall(PetscFree2(reqs, statuses));
  PetscCall(PetscCommDestroy(&comm));

  *nfrom             = nrecvs;
  *fromranks         = franks;
  *(void **)fromdata = fdata;
  PetscFunctionReturn(0);
}
#endif
/*@C
  PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

  Collective

  Input Parameters:
+ comm - communicator
. count - number of entries to send/receive (must match on all ranks)
. dtype - datatype to send/receive from each rank (must match on all ranks)
. nto - number of ranks to send data to
. toranks - ranks to send to (array of length nto)
- todata - data to send to each rank (packed)

  Output Parameters:
+ nfrom - number of ranks from which messages will be received
. fromranks - ranks from which messages are received (length nfrom; caller should PetscFree())
- fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())

  Level: developer

  Options Database Key:
. -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication. Default is allreduce for communicators with <= 1024 ranks, otherwise ibarrier.

  Notes:
  This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
  PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.

  Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

  References:
. * - The MPI_Ibarrier implementation uses the algorithm in Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
@*/
PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata) {
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

  PetscFunctionBegin;
  PetscCall(PetscSysInitializePackage());
  PetscCall(PetscLogEventSync(PETSC_BuildTwoSided, comm));
  PetscCall(PetscLogEventBegin(PETSC_BuildTwoSided, 0, 0, 0, 0));
  PetscCall(PetscCommBuildTwoSidedGetType(comm, &buildtype));
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    PetscCall(PetscCommBuildTwoSided_Ibarrier(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
    break;
#else
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE: PetscCall(PetscCommBuildTwoSided_Allreduce(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata)); break;
  case PETSC_BUILDTWOSIDED_REDSCATTER:
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
    PetscCall(PetscCommBuildTwoSided_RedScatter(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
    break;
#else
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
#endif
  default: SETERRQ(comm, PETSC_ERR_PLIB, "Unknown method for building two-sided communication");
  }
  PetscCall(PetscLogEventEnd(PETSC_BuildTwoSided, 0, 0, 0, 0));
  PetscFunctionReturn(0);
}
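/*
  Example (a minimal sketch of hypothetical caller code, not taken from the PETSc test suite):
  exchanging one message length with each destination rank, which is the typical use of this routine.
  The comm variable is assumed to be an MPI communicator; the rank and length values are purely
  illustrative.

    PetscMPIInt nto       = 2;
    PetscMPIInt toranks[] = {1, 3};
    PetscMPIInt lengths[] = {10, 20}; // one entry per destination rank
    PetscMPIInt nfrom, *fromranks, *fromlengths;

    PetscCall(PetscCommBuildTwoSided(comm, 1, MPI_INT, nto, toranks, lengths, &nfrom, &fromranks, &fromlengths));
    // fromranks[i] sent us fromlengths[i]; both arrays must eventually be freed with PetscFree()
    PetscCall(PetscFree(fromranks));
    PetscCall(PetscFree(fromlengths));
*/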
static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx) {
  PetscMPIInt  i, *tag;
  MPI_Aint     lb, unitbytes;
  MPI_Request *sendreq, *recvreq;

  PetscFunctionBegin;
  PetscCall(PetscMalloc1(ntags, &tag));
  if (ntags > 0) PetscCall(PetscCommDuplicate(comm, &comm, &tag[0]));
  for (i = 1; i < ntags; i++) PetscCall(PetscCommGetNewTag(comm, &tag[i]));

  /* Perform complete initial rendezvous */
  PetscCall(PetscCommBuildTwoSided(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));

  PetscCall(PetscMalloc1(nto * ntags, &sendreq));
  PetscCall(PetscMalloc1(*nfrom * ntags, &recvreq));

  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  for (i = 0; i < nto; i++) {
    PetscMPIInt k;
    for (k = 0; k < ntags; k++) sendreq[i * ntags + k] = MPI_REQUEST_NULL;
    PetscCall((*send)(comm, tag, i, toranks[i], ((char *)todata) + count * unitbytes * i, sendreq + i * ntags, ctx));
  }
  for (i = 0; i < *nfrom; i++) {
    void       *header = (*(char **)fromdata) + count * unitbytes * i;
    PetscMPIInt k;
    for (k = 0; k < ntags; k++) recvreq[i * ntags + k] = MPI_REQUEST_NULL;
    PetscCall((*recv)(comm, tag, (*fromranks)[i], header, recvreq + i * ntags, ctx));
  }
  PetscCall(PetscFree(tag));
  PetscCall(PetscCommDestroy(&comm));
  *toreqs   = sendreq;
  *fromreqs = recvreq;
  PetscFunctionReturn(0);
}
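/*
  The Ibarrier variant of the callback interface folds the user payload into the same nonblocking-consensus
  loop used by PetscCommBuildTwoSided_Ibarrier(): the small rendezvous messages are posted with MPI_Issend,
  the user's send callback posts the (typically larger) payload requests up front, and the receive callback
  is invoked from the probe loop as each rendezvous message arrives, so no second pass over the discovered
  ranks is needed.
*/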
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)

static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx) {
  PetscMPIInt    nrecvs, tag, *tags, done, i;
  MPI_Aint       lb, unitbytes;
  char          *tdata;
  MPI_Request   *sendreqs, *usendreqs, *req, barrier;
  PetscSegBuffer segrank, segdata, segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCall(PetscMalloc1(ntags, &tags));
  for (i = 0; i < ntags; i++) PetscCall(PetscCommGetNewTag(comm, &tags[i]));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  tdata = (char *)todata;
  PetscCall(PetscMalloc1(nto, &sendreqs));
  PetscCall(PetscMalloc1(nto * ntags, &usendreqs));
  /* Post synchronous sends */
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Issend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  /* Post actual payloads. These are typically larger messages. Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i = 0; i < nto; i++) {
    PetscMPIInt k;
    for (k = 0; k < ntags; k++) usendreqs[i * ntags + k] = MPI_REQUEST_NULL;
    PetscCall((*send)(comm, tags, i, toranks[i], tdata + count * unitbytes * i, usendreqs + i * ntags, ctx));
  }

  PetscCall(PetscSegBufferCreate(sizeof(PetscMPIInt), 4, &segrank));
  PetscCall(PetscSegBufferCreate(unitbytes, 4 * count, &segdata));
  PetscCall(PetscSegBufferCreate(sizeof(MPI_Request), 4, &segreq));

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases. This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done = 0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &flag, &status));
    if (flag) { /* incoming message */
      PetscMPIInt *recvrank, k;
      void        *buf;
      PetscCall(PetscSegBufferGet(segrank, 1, &recvrank));
      PetscCall(PetscSegBufferGet(segdata, count, &buf));
      *recvrank = status.MPI_SOURCE;
      PetscCallMPI(MPI_Recv(buf, count, dtype, status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE));
      PetscCall(PetscSegBufferGet(segreq, ntags, &req));
      for (k = 0; k < ntags; k++) req[k] = MPI_REQUEST_NULL;
      PetscCall((*recv)(comm, tags, status.MPI_SOURCE, buf, req, ctx));
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent, nsends;
      PetscCall(PetscMPIIntCast(nto, &nsends));
      PetscCallMPI(MPI_Testall(nsends, sendreqs, &sent, MPI_STATUSES_IGNORE));
      if (sent) {
        PetscCallMPI(MPI_Ibarrier(comm, &barrier));
        barrier_started = PETSC_TRUE;
      }
    } else {
      PetscCallMPI(MPI_Test(&barrier, &done, MPI_STATUS_IGNORE));
    }
  }
  *nfrom = nrecvs;
  PetscCall(PetscSegBufferExtractAlloc(segrank, fromranks));
  PetscCall(PetscSegBufferDestroy(&segrank));
  PetscCall(PetscSegBufferExtractAlloc(segdata, fromdata));
  PetscCall(PetscSegBufferDestroy(&segdata));
  *toreqs = usendreqs;
  PetscCall(PetscSegBufferExtractAlloc(segreq, fromreqs));
  PetscCall(PetscSegBufferDestroy(&segreq));
  PetscCall(PetscFree(sendreqs));
  PetscCall(PetscFree(tags));
  PetscCall(PetscCommDestroy(&comm));
  PetscFunctionReturn(0);
}
#endif

/*@C
  PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous

  Collective

  Input Parameters:
+ comm - communicator
. count - number of entries to send/receive in initial rendezvous (must match on all ranks)
. dtype - datatype to send/receive from each rank (must match on all ranks)
. nto - number of ranks to send data to
. toranks - ranks to send to (array of length nto)
. todata - data to send to each rank (packed)
. ntags - number of tags needed by send/recv callbacks
. send - callback invoked on sending process when ready to send primary payload
. recv - callback invoked on receiving process after delivery of rendezvous message
- ctx - context for callbacks

  Output Parameters:
+ nfrom - number of ranks from which messages will be received
. fromranks - ranks from which messages are received (length nfrom; caller should PetscFree())
- fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())

  Level: developer

  Notes:
  This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
  PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

  Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

  References:
. * - The MPI_Ibarrier implementation uses the algorithm in Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedFReq()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
@*/
PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx) {
  MPI_Request *toreqs, *fromreqs;

  PetscFunctionBegin;
  PetscCall(PetscCommBuildTwoSidedFReq(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata, ntags, &toreqs, &fromreqs, send, recv, ctx));
  PetscCallMPI(MPI_Waitall(nto * ntags, toreqs, MPI_STATUSES_IGNORE));
  PetscCallMPI(MPI_Waitall(*nfrom * ntags, fromreqs, MPI_STATUSES_IGNORE));
  PetscCall(PetscFree(toreqs));
  PetscCall(PetscFree(fromreqs));
  PetscFunctionReturn(0);
}
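/*
  Example (a minimal sketch with hypothetical callbacks, not taken from the PETSc sources): using the
  callback interface with ntags = 1 and a rendezvous header that carries the payload length. The
  ExampleCtx type, the buffer layout, and the callback names are invented for illustration; recvbufs
  is assumed preallocated by the caller with one slot per possible source (e.g., the communicator size),
  since the callbacks run before nfrom is known.

    typedef struct {
      PetscScalar **sendbufs; // one payload buffer per destination, indexed like toranks
      PetscScalar **recvbufs; // filled in arrival order, matching fromranks
      PetscMPIInt   nrecv;
    } ExampleCtx;

    static PetscErrorCode ExampleSend(MPI_Comm comm, const PetscMPIInt tags[], PetscMPIInt tonum, PetscMPIInt rank, void *header, MPI_Request req[], void *ctx)
    {
      ExampleCtx *ec  = (ExampleCtx *)ctx;
      PetscMPIInt len = *(PetscMPIInt *)header; // header holds the payload length we advertised

      PetscFunctionBegin;
      PetscCallMPI(MPI_Isend(ec->sendbufs[tonum], len, MPIU_SCALAR, rank, tags[0], comm, &req[0]));
      PetscFunctionReturn(0);
    }

    static PetscErrorCode ExampleRecv(MPI_Comm comm, const PetscMPIInt tags[], PetscMPIInt rank, void *header, MPI_Request req[], void *ctx)
    {
      ExampleCtx  *ec  = (ExampleCtx *)ctx;
      PetscMPIInt  len = *(PetscMPIInt *)header; // sender's advertised payload length
      PetscScalar *buf;

      PetscFunctionBegin;
      PetscCall(PetscMalloc1(len, &buf));
      PetscCallMPI(MPI_Irecv(buf, len, MPIU_SCALAR, rank, tags[0], comm, &req[0]));
      ec->recvbufs[ec->nrecv++] = buf; // caller retrieves the buffers after PetscCommBuildTwoSidedF() returns
      PetscFunctionReturn(0);
    }
*/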
/*@C
  PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests

  Collective

  Input Parameters:
+ comm - communicator
. count - number of entries to send/receive in initial rendezvous (must match on all ranks)
. dtype - datatype to send/receive from each rank (must match on all ranks)
. nto - number of ranks to send data to
. toranks - ranks to send to (array of length nto)
. todata - data to send to each rank (packed)
. ntags - number of tags needed by send/recv callbacks
. send - callback invoked on sending process when ready to send primary payload
. recv - callback invoked on receiving process after delivery of rendezvous message
- ctx - context for callbacks

  Output Parameters:
+ nfrom - number of ranks from which messages will be received
. fromranks - ranks from which messages are received (length nfrom; caller should PetscFree())
. fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
. toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
- fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())

  Level: developer

  Notes:
  This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
  PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

  Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

  References:
. * - The MPI_Ibarrier implementation uses the algorithm in Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedF()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
@*/
PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx) {
  PetscErrorCode (*f)(MPI_Comm, PetscMPIInt, MPI_Datatype, PetscMPIInt, const PetscMPIInt[], const void *, PetscMPIInt *, PetscMPIInt **, void *, PetscMPIInt, MPI_Request **, MPI_Request **, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx);
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
  PetscMPIInt            i, size;

  PetscFunctionBegin;
  PetscCall(PetscSysInitializePackage());
  PetscCallMPI(MPI_Comm_size(comm, &size));
  for (i = 0; i < nto; i++) { PetscCheck(toranks[i] >= 0 && size > toranks[i], comm, PETSC_ERR_ARG_OUTOFRANGE, "toranks[%d] %d not in comm size %d", i, toranks[i], size); }
  PetscCall(PetscLogEventSync(PETSC_BuildTwoSidedF, comm));
  PetscCall(PetscLogEventBegin(PETSC_BuildTwoSidedF, 0, 0, 0, 0));
  PetscCall(PetscCommBuildTwoSidedGetType(comm, &buildtype));
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    f = PetscCommBuildTwoSidedFReq_Ibarrier;
    break;
#else
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
  case PETSC_BUILDTWOSIDED_REDSCATTER: f = PetscCommBuildTwoSidedFReq_Reference; break;
  default: SETERRQ(comm, PETSC_ERR_PLIB, "Unknown method for building two-sided communication");
  }
  PetscCall((*f)(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata, ntags, toreqs, fromreqs, send, recv, ctx));
  PetscCall(PetscLogEventEnd(PETSC_BuildTwoSidedF, 0, 0, 0, 0));
  PetscFunctionReturn(0);
}