1 /* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGHATERV etc have a persistent version, 2 we can also do abstractions like Prepare/StartCommunication. 3 */ 4 5 #include <../src/vec/is/sf/impls/basic/sfpack.h> 6 7 /* Start MPI requests. If use non-GPU aware MPI, we might need to copy data from device buf to host buf */ 8 static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 9 { 10 PetscMPIInt nreqs; 11 MPI_Request *reqs = NULL; 12 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 13 PetscInt buflen; 14 15 PetscFunctionBegin; 16 buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE]; 17 if (buflen) { 18 if (direction == PETSCSF_ROOT2LEAF) { 19 nreqs = sf->nleafreqs; 20 PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs)); 21 } else { /* leaf to root */ 22 nreqs = bas->nrootreqs; 23 PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL)); 24 } 25 PetscCallMPI(MPI_Startall_irecv(buflen, link->unit, nreqs, reqs)); 26 } 27 28 buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE]; 29 if (buflen) { 30 if (direction == PETSCSF_ROOT2LEAF) { 31 nreqs = bas->nrootreqs; 32 PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /*device2host before sending */)); 33 PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL)); 34 } else { /* leaf to root */ 35 nreqs = sf->nleafreqs; 36 PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE)); 37 PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs)); 38 } 39 PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction)); 40 PetscCallMPI(MPI_Startall_isend(buflen, link->unit, nreqs, reqs)); 41 } 42 PetscFunctionReturn(PETSC_SUCCESS); 43 } 44 45 static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 46 { 47 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 48 const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; 49 const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi; 50 51 PetscFunctionBegin; 52 PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE)); 53 PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE)); 54 if (direction == PETSCSF_ROOT2LEAF) { 55 PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */)); 56 } else { 57 PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE)); 58 } 59 PetscFunctionReturn(PETSC_SUCCESS); 60 } 61 62 #if defined(PETSC_HAVE_MPIX_STREAM) 63 // issue MPIX_Isend/Irecv_enqueue() 64 static PetscErrorCode PetscSFLinkStartEnqueue_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 65 { 66 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 67 PetscInt i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks; 68 const PetscInt *rootoffset, *leafoffset; 69 MPI_Aint disp; 70 MPI_Comm stream_comm = sf->stream_comm; 71 MPI_Datatype unit = link->unit; 72 const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 73 const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi; 74 75 PetscFunctionBegin; 76 if (bas->rootbuflen[PETSCSF_REMOTE]) { 77 PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL)); 78 if (direction == PETSCSF_LEAF2ROOT) { 79 for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) { 80 disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; 81 cnt = rootoffset[i + 1] - rootoffset[i]; 82 PetscCallMPI(MPIX_Irecv_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j)); 83 } 84 } else { // PETSCSF_ROOT2LEAF 85 for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) { 86 disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; 87 cnt = rootoffset[i + 1] - rootoffset[i]; 88 // no need to sync the gpu stream! 89 PetscCallMPI(MPIX_Isend_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j)); 90 } 91 } 92 } 93 94 if (sf->leafbuflen[PETSCSF_REMOTE]) { 95 PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL)); 96 if (direction == PETSCSF_LEAF2ROOT) { 97 for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) { 98 disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes; 99 cnt = leafoffset[i + 1] - leafoffset[i]; 100 // no need to sync the gpu stream! 101 PetscCallMPI(MPIX_Isend_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j)); 102 } 103 } else { // PETSCSF_ROOT2LEAF 104 for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) { 105 disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes; 106 cnt = leafoffset[i + 1] - leafoffset[i]; 107 PetscCallMPI(MPIX_Irecv_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j)); 108 } 109 } 110 } 111 PetscFunctionReturn(PETSC_SUCCESS); 112 } 113 114 static PetscErrorCode PetscSFLinkWaitEnqueue_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 115 { 116 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 117 const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; 118 const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi; 119 120 PetscFunctionBegin; 121 PetscCallMPI(MPIX_Waitall_enqueue(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE)); 122 PetscCallMPI(MPIX_Waitall_enqueue(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE)); 123 PetscFunctionReturn(PETSC_SUCCESS); 124 } 125 #endif 126 127 /* 128 The routine Creates a communication link for the given operation. It first looks up its link cache. If 129 there is a free & suitable one, it uses it. Otherwise it creates a new one. 130 131 A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack 132 root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata 133 can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate 134 those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation. 135 136 The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU. 137 138 In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link. 139 140 The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and 141 need pack/unpack data. 142 */ 143 PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink) 144 { 145 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 146 PetscInt i, j, k, nrootreqs, nleafreqs, nreqs; 147 PetscSFLink *p, link; 148 PetscSFDirection direction; 149 MPI_Request *reqs = NULL; 150 PetscBool match, rootdirect[2], leafdirect[2]; 151 PetscMemType rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */ 152 PetscMemType leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; 153 PetscMemType rootmtype_mpi, leafmtype_mpi; /* mtypes seen by MPI */ 154 PetscInt rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI*/ 155 156 PetscFunctionBegin; 157 158 /* Can we directly use root/leafdirect with the given sf, sfop and op? */ 159 for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 160 if (sfop == PETSCSF_BCAST) { 161 rootdirect[i] = bas->rootcontig[i]; /* Pack roots */ 162 leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */ 163 } else if (sfop == PETSCSF_REDUCE) { 164 leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */ 165 rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */ 166 } else { /* PETSCSF_FETCH */ 167 rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */ 168 leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */ 169 } 170 } 171 172 if (sf->use_gpu_aware_mpi) { 173 rootmtype_mpi = rootmtype; 174 leafmtype_mpi = leafmtype; 175 } else { 176 rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST; 177 } 178 /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */ 179 rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0; 180 leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0; 181 182 direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT; 183 nrootreqs = bas->nrootreqs; 184 nleafreqs = sf->nleafreqs; 185 186 /* Look for free links in cache */ 187 for (p = &bas->avail; (link = *p); p = &link->next) { 188 if (!link->use_nvshmem) { /* Only check with MPI links */ 189 PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match)); 190 if (match) { 191 /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with the current. 192 If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests(). 193 */ 194 if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) { 195 reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */ 196 for (i = 0; i < nrootreqs; i++) { 197 if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i])); 198 } 199 link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE; 200 } 201 if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) { 202 reqs = link->leafreqs[direction][leafmtype][1]; 203 for (i = 0; i < nleafreqs; i++) { 204 if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i])); 205 } 206 link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE; 207 } 208 *p = link->next; /* Remove from available list */ 209 goto found; 210 } 211 } 212 } 213 214 PetscCall(PetscNew(&link)); 215 PetscCall(PetscSFLinkSetUp_Host(sf, link, unit)); 216 PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */ 217 218 nreqs = (nrootreqs + nleafreqs) * 8; 219 PetscCall(PetscMalloc1(nreqs, &link->reqs)); 220 for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */ 221 222 for (i = 0; i < 2; i++) { /* Two communication directions */ 223 for (j = 0; j < 2; j++) { /* Two memory types */ 224 for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */ 225 link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k); 226 link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k); 227 } 228 } 229 } 230 link->StartCommunication = PetscSFLinkStartRequests_MPI; 231 link->FinishCommunication = PetscSFLinkWaitRequests_MPI; 232 #if defined(PETSC_HAVE_MPIX_STREAM) 233 if (sf->use_stream_aware_mpi && (PetscMemTypeDevice(rootmtype_mpi) || PetscMemTypeDevice(leafmtype_mpi))) { 234 link->StartCommunication = PetscSFLinkStartEnqueue_MPIX_Stream; 235 link->FinishCommunication = PetscSFLinkWaitEnqueue_MPIX_Stream; 236 } 237 #endif 238 239 found: 240 241 #if defined(PETSC_HAVE_DEVICE) 242 if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) { 243 #if defined(PETSC_HAVE_CUDA) 244 if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */ 245 #endif 246 #if defined(PETSC_HAVE_HIP) 247 if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */ 248 #endif 249 #if defined(PETSC_HAVE_KOKKOS) 250 if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit)); 251 #endif 252 } 253 #endif 254 255 /* Allocate buffers along root/leafdata */ 256 for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 257 /* For local communication, buffers are only needed when roots and leaves have different mtypes */ 258 if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue; 259 if (bas->rootbuflen[i]) { 260 if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */ 261 link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes; 262 } else { /* Have to have a separate rootbuf */ 263 if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype])); 264 link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype]; 265 } 266 } 267 268 if (sf->leafbuflen[i]) { 269 if (leafdirect[i]) { 270 link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes; 271 } else { 272 if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype])); 273 link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype]; 274 } 275 } 276 } 277 278 #if defined(PETSC_HAVE_DEVICE) 279 /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */ 280 if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) { 281 if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST])); 282 link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 283 } 284 if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) { 285 if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST])); 286 link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 287 } 288 #endif 289 290 /* Set `current` state of the link. They may change between different SF invocations with the same link */ 291 if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */ 292 if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata; 293 if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata; 294 } 295 296 link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */ 297 link->leafdata = leafdata; 298 for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 299 link->rootdirect[i] = rootdirect[i]; 300 link->leafdirect[i] = leafdirect[i]; 301 } 302 link->rootdirect_mpi = rootdirect_mpi; 303 link->leafdirect_mpi = leafdirect_mpi; 304 link->rootmtype = rootmtype; 305 link->leafmtype = leafmtype; 306 link->rootmtype_mpi = rootmtype_mpi; 307 link->leafmtype_mpi = leafmtype_mpi; 308 309 link->next = bas->inuse; 310 bas->inuse = link; 311 *mylink = link; 312 PetscFunctionReturn(PETSC_SUCCESS); 313 } 314