/* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc. have a persistent version,
   we can also do abstractions like Prepare/StartCommunication.
*/

#include <../src/vec/is/sf/impls/basic/sfpack.h>

/* Start MPI requests. If not using GPU-aware MPI, we might need to copy data from device buffers to host buffers */
static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscMPIInt    nreqs;
  MPI_Request   *reqs = NULL;
  PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
  PetscInt       buflen;

  PetscFunctionBegin;
  buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
    } else { /* leaf to root */
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
    }
    PetscCallMPI(MPI_Startall_irecv(buflen, link->unit, nreqs, reqs));
  }

  buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
    } else { /* leaf to root */
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
    }
    PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction));
    PetscCallMPI(MPI_Startall_isend(buflen, link->unit, nreqs, reqs));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas            = (PetscSF_Basic *)sf->data;
  const PetscMemType rootmtype_mpi  = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
  PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
  } else {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

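/* The two routines above form the default host-launched path; they are installed as link->StartCommunication and
   link->FinishCommunication in PetscSFLinkCreate_MPI() below. When PETSc is built with MPIX stream support
   (PETSC_HAVE_MPIX_STREAM) and stream-aware MPI is enabled with device buffers, the MPIX_Isend/Irecv/Waitall_enqueue()
   variants below are installed instead; they enqueue the sends/recvs/waits on the GPU stream, so no host-side
   stream synchronization is needed before calling MPI.
*/
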
#if defined(PETSC_HAVE_MPIX_STREAM)
// issue MPIX_Isend/Irecv_enqueue()
static PetscErrorCode PetscSFLinkStartEnqueue_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt    *rootoffset, *leafoffset;
  MPI_Aint           disp;
  MPI_Comm           stream_comm = sf->stream_comm;
  MPI_Datatype       unit        = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  if (bas->rootbuflen[PETSCSF_REMOTE]) {
    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
    if (direction == PETSCSF_LEAF2ROOT) {
      for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
        cnt  = rootoffset[i + 1] - rootoffset[i];
        PetscCallMPI(MPIX_Irecv_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
      }
    } else { // PETSCSF_ROOT2LEAF
      for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
        cnt  = rootoffset[i + 1] - rootoffset[i];
        // no need to sync the gpu stream!
        PetscCallMPI(MPIX_Isend_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
      }
    }
  }

  if (sf->leafbuflen[PETSCSF_REMOTE]) {
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
    if (direction == PETSCSF_LEAF2ROOT) {
      for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
        cnt  = leafoffset[i + 1] - leafoffset[i];
        // no need to sync the gpu stream!
        PetscCallMPI(MPIX_Isend_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
      }
    } else { // PETSCSF_ROOT2LEAF
      for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
        cnt  = leafoffset[i + 1] - leafoffset[i];
        PetscCallMPI(MPIX_Irecv_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFLinkWaitEnqueue_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas            = (PetscSF_Basic *)sf->data;
  const PetscMemType rootmtype_mpi  = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  PetscCallMPI(MPIX_Waitall_enqueue(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
  PetscCallMPI(MPIX_Waitall_enqueue(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

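/* A rough, illustrative sketch of how the link created below is typically driven (the real call sites are the
   SFBASIC Begin/End routines and may differ in details):

     PetscSFLink link;
     PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, MPI_REPLACE, PETSCSF_BCAST, &link));
     // ... pack rootdata into link->rootbuf (skipped when rootdirect, i.e., rootdata itself is used as the buffer) ...
     PetscCall((*link->StartCommunication)(sf, link, PETSCSF_ROOT2LEAF));  // post recvs and start sends
     // ... overlap local (intra-process) work here ...
     PetscCall((*link->FinishCommunication)(sf, link, PETSCSF_ROOT2LEAF)); // wait; copy host->device if needed
     // ... unpack link->leafbuf into leafdata (skipped when leafdirect and op is MPI_REPLACE) ...
*/
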
/*
   This routine creates a communication link for the given operation. It first looks in its link cache. If
   there is a free & suitable link, it reuses it; otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we do not allocate them. Even when we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when one does not use GPU-aware MPI but the data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and need to
   pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscripts */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;  /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                       /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in the cache */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
           If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from available list */
        goto found;
      }
    }
  }

  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  if (nreqs)
    for (i = 0; i < 2; i++) {     /* Two communication directions */
      for (j = 0; j < 2; j++) {   /* Two memory types */
        for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
          link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
          link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
        }
      }
    }

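  /* Layout of the flat link->reqs array set up above: the first 8*nrootreqs entries are root requests, the next
     8*nleafreqs entries are leaf requests. Within each group, a block is addressed by
     [direction][memtype seen by MPI][whether data is passed to MPI directly], i.e., at offset nrootreqs*(4*i + 2*j + k)
     (resp. nleafreqs*(4*i + 2*j + k)). All entries stay MPI_REQUEST_NULL until init'ed, e.g., lazily in
     PetscSFLinkGetMPIBuffersAndRequests() on the persistent-request path. */
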
  link->StartCommunication  = PetscSFLinkStartRequests_MPI;
  link->FinishCommunication = PetscSFLinkWaitRequests_MPI;
#if defined(PETSC_HAVE_MPIX_STREAM)
  if (sf->use_stream_aware_mpi && (PetscMemTypeDevice(rootmtype_mpi) || PetscMemTypeDevice(leafmtype_mpi))) {
    link->StartCommunication  = PetscSFLinkStartEnqueue_MPIX_Stream;
    link->FinishCommunication = PetscSFLinkWaitEnqueue_MPIX_Stream;
  }
#endif

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc. */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc. */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
  #endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate host buffers for staging device data in case use_gpu_aware_mpi is false */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

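  /* Note: when MPI is not GPU-aware (root/leafmtype is DEVICE but root/leafmtype_mpi is HOST), the REMOTE buffers
     get the extra host mirrors allocated above. PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI() and
     PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(), called from the Start/Wait routines at the top of this file,
     move the packed data between the device buffer and the host mirror around the MPI calls. */
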
  /* Set the `current` state of the link. It may change between different SF invocations that reuse the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison in future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(PETSC_SUCCESS);
}