#include <../src/vec/is/sf/impls/basic/sfpack.h>

// Though there is no default mechanism to start a communication, we have a
// default to finish communication, which is just waiting on the requests.
// It works for both non-blocking and persistent send/recvs and collectives.
static PetscErrorCode PetscSFLinkFinishCommunication_Default(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic      *bas            = (PetscSF_Basic *)sf->data;
  const PetscMemType  rootmtype_mpi  = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt      rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  if (bas->nrootreqs) PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
  if (sf->nleafreqs) PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
  } else {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  The routine creates a communication link for the given operation. It first looks up its link cache. If
  there is a free & suitable one, it uses it. Otherwise it creates a new one.

  A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
  root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
  can be directly passed to MPI, we won't allocate them. Even when we do allocate buffers, we only allocate
  those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

  The routine also allocates buffers on the host when data is on the GPU but gpu-aware MPI is not used.

  In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

  The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and need to
  pack/unpack data.
*/
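/*
  A rough sketch of how an SF implementation is expected to drive a link returned by this routine, e.g.,
  in its Bcast Begin/End methods. This is illustrative only; the helper names are recalled from sfpack.h
  and may differ slightly from the current sources, so see sfbasic.c for the authoritative call sequence:

    PetscSFLink link;
    // Begin: get a cached (or new) link holding buffers and MPI requests for this bcast
    PetscCall(PetscSFLinkCreate(sf, unit, rootmtype, rootdata, leafmtype, leafdata, MPI_REPLACE, PETSCSF_BCAST, &link));
    PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_REMOTE, rootdata)); // no-op when rootdata is used directly
    PetscCall(PetscSFLinkStartCommunication(sf, link, PETSCSF_ROOT2LEAF));  // e.g., post or start the sends/recvs

    // End: look up the link with root/leafdata as keys, wait, unpack, and recycle it
    PetscCall(PetscSFLinkGetInUse(sf, unit, rootdata, leafdata, PETSC_OWN_POINTER, &link));
    PetscCall((*link->FinishCommunication)(sf, link, PETSCSF_ROOT2LEAF)); // the default above does MPI_Waitall()
    PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_REMOTE, leafdata, MPI_REPLACE));
    PetscCall(PetscSFLinkReclaim(sf, &link)); // return the link to the avail cache
*/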
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it as a subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                   /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                       /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }

  // NEVER use root/leafdirect[] for persistent collectives. Otherwise, suppose that on the first call all ranks build
  // a persistent MPI request in a collective call. Then in a second call to PetscSFBcast, one rank uses root/leafdirect
  // but with new rootdata/leafdata pointers, while the other ranks keep using the same rootdata/leafdata pointers as last time.
  // Only that rank will try to rebuild the request with a collective call, resulting in a hang. We could call
  // MPI_Allreduce() every time to detect changes in root/leafdata, but that is too expensive for sparse communication.
  // So we always set root/leafdirect[] to false and allocate additional root/leaf buffers for persistent collectives.
  if (sf->persistent && sf->collective) {
    rootdirect[PETSCSF_REMOTE] = PETSC_FALSE;
    leafdirect[PETSCSF_REMOTE] = PETSC_FALSE;
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on the host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for a free link in the cache */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
           If not, free the old requests. New requests will be lazily init'ed when PetscSFLinkGetMPIBuffersAndRequests() is called, with the same tag.
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from the available list */
        goto found;
      }
    }
  }

  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to MPI_REQUEST_NULL so that we know which requests need to be freed in Destroy */

  if (nreqs)
    for (i = 0; i < 2; i++) {     /* Two communication directions */
      for (j = 0; j < 2; j++) {   /* Two memory types */
        for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
          link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
          link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
        }
      }
    }
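  /*
     A worked example of the indexing above (a sketch of the layout, not additional logic): link->reqs holds
     8*nrootreqs root requests followed by 8*nleafreqs leaf requests. Within each half, the slice for
     (direction i, memtype j, direct k) starts at slice number 4*i + 2*j + k. For instance, taking
     i = PETSCSF_LEAF2ROOT as subscript 1, j = 1 (device, per the 0/1 mtype conversion above) and k = 1
     (data passed to MPI directly), the root requests live at link->reqs + nrootreqs * (4*1 + 2*1 + 1) =
     link->reqs + 7 * nrootreqs, and the matching leaf requests at link->reqs + 8 * nrootreqs + 7 * nleafreqs.
  */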
  link->FinishCommunication = PetscSFLinkFinishCommunication_Default;
  // Each SF type can customize its communication by setting function pointers in the link.
  // Currently only BASIC and NEIGHBOR use this abstraction.
  PetscTryTypeMethod(sf, SetCommunicationOps, link);

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
  #endif
  }
#endif

  /* Allocate buffers for root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* We have to use a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on the host for staging device data when gpu-aware MPI is not used */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif
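  /*
     How these host buffers are used (a sketch, following the staging path noted earlier in this routine):
     with device root/leafdata and gpu-aware MPI disabled, rootmtype/leafmtype are DEVICE while
     rootmtype_mpi/leafmtype_mpi are HOST. Data is packed on the device into the device buffer, copied to
     the host buffer allocated above before being handed to MPI, and on the receive side copied back from
     the host buffer to the device (see PetscSFLinkFinishCommunication_Default() above) before unpacking.
  */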
  /* Set the `current` state of the link. It may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are the keys used to look up links in PetscSFXxxEnd() */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(PETSC_SUCCESS);
}