/* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc. have persistent versions,
   we can also add abstractions like Prepare/StartCommunication.
*/

#include <../src/vec/is/sf/impls/basic/sfpack.h>

/* Start MPI requests. If not using GPU-aware MPI, we might need to copy data from device buf to host buf */
static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction) {
  PetscMPIInt    nreqs;
  MPI_Request   *reqs = NULL;
  PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
  PetscInt       buflen;

  PetscFunctionBegin;
  /* Post receives into the buffer on the receiving side of this direction */
  buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
    } else { /* leaf to root */
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
    }
    PetscCallMPI(MPI_Startall_irecv(buflen, link->unit, nreqs, reqs));
  }

  /* Start sends from the buffer on the sending side of this direction */
  buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
    } else { /* leaf to root */
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
    }
    PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction));
    PetscCallMPI(MPI_Startall_isend(buflen, link->unit, nreqs, reqs));
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction) {
  PetscSF_Basic     *bas            = (PetscSF_Basic *)sf->data;
  const PetscMemType rootmtype_mpi  = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
  PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
  } else {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
  }
  PetscFunctionReturn(0);
}
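/* A minimal sketch (illustrative, not actual PETSc source) of how the two callbacks above are
   driven for a root-to-leaf broadcast. The pack/unpack helper name below is a hypothetical
   placeholder; the real driver code lives in SFBasic's PetscSFBcastBegin/End:

     PetscSFLink link;
     PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, MPI_REPLACE, PETSCSF_BCAST, &link));
     PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_REMOTE, rootdata)); // pack remote roots into link->rootbuf (helper name hypothetical)
     PetscCall((*link->StartCommunication)(sf, link, PETSCSF_ROOT2LEAF));    // post recvs, then start sends
     // ... overlap local communication/computation here ...
     PetscCall((*link->FinishCommunication)(sf, link, PETSCSF_ROOT2LEAF));   // wait on all requests
     // unpack link->leafbuf into leafdata with `op`, then return the link to the cache
*/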
/*
   This routine creates a communication link for the given operation. It first looks in its link cache;
   if there is a free and suitable link, it reuses it, otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv, along with pack/unpack routines to move
   root/leafdata to/from these buffers. Buffers are allocated at our discretion: when root/leafdata
   can be passed to MPI directly, we do not allocate them, and even when we do allocate, we only
   allocate those needed by the given `sfop` and `op`. In other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when one does not use GPU-aware MPI but the data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and need to
   pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink) {
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscripts */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;  /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                       /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }
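  /* Example of the choices above (an illustrative note, not in the original source): for a BCAST with
     op = MPI_SUM, contiguous roots can be sent directly from rootdata, but received values must be
     summed into leafdata rather than overwrite it, so leaves go through a separate buffer and a
     SUM-unpack even when they are contiguous; only op = MPI_REPLACE permits receiving into leafdata
     directly. */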
  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be accessed directly by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in the cache */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be passed to MPI directly, check whether the data used to initialize the
           MPI requests matches the current data. If not, free the old requests; new requests will be
           lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from the available list */
        goto found;
      }
    }
  }

  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to MPI_REQUEST_NULL so that we know which requests need to be freed in Destroy */

  for (i = 0; i < 2; i++) {     /* Two communication directions */
    for (j = 0; j < 2; j++) {   /* Two memory types */
      for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
      }
    }
  }
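  /* Layout of link->reqs implied by the loop above (a worked example, not in the original source):
     the first nrootreqs*8 slots hold root requests and the next nleafreqs*8 hold leaf requests, each
     group indexed by (direction i, mtype j, direct k) flattened as 4*i + 2*j + k. E.g., with
     nrootreqs = 3, the root requests for i = 1 (leaf-to-root), j = 1 (device), k = 0 start at
     reqs + 3*(4 + 2 + 0) = reqs + 18. */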
  link->StartCommunication  = PetscSFLinkStartRequests_MPI;
  link->FinishCommunication = PetscSFLinkWaitRequests_MPI;

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
#if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Set up streams etc. */
#endif
#if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Set up streams etc. */
#endif
#if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
#endif
  }
#endif

  /* Allocate buffers for root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate host buffers for buffering device data in case one does not use_gpu_aware_mpi */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set the `current` state of the link; it may change between SF invocations that reuse the same link */
  if (sf->persistent) { /* If the data is passed to MPI directly and used to init MPI requests, record it for comparison in future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are the keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}
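/* For reference, a minimal sketch (illustrative, not PETSc source) of the persistent-request pattern
   this file builds on: a request is created once, restarted cheaply per iteration, and freed explicitly.

     MPI_Request req;
     PetscCallMPI(MPI_Send_init(buf, count, unit, dest, tag, comm, &req)); // create once (here, lazily on first use)
     for (int it = 0; it < nits; it++) {
       PetscCallMPI(MPI_Start(&req));                   // restart the same request each iteration
       PetscCallMPI(MPI_Wait(&req, MPI_STATUS_IGNORE)); // complete it
     }
     PetscCallMPI(MPI_Request_free(&req)); // must be freed when the buffer or peer changes, as in the cache lookup above
*/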