/* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc. have a persistent version,
   we can also do abstractions like Prepare/StartCommunication.
*/

#include <../src/vec/is/sf/impls/basic/sfpack.h>

/* Start MPI requests. If not using GPU-aware MPI, we might need to copy data from device buffers to host buffers */
static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscMPIInt    nreqs;
  MPI_Request   *reqs = NULL;
  PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
  PetscInt       buflen;

  PetscFunctionBegin;
  buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
    } else { /* leaf to root */
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
    }
    PetscCallMPI(MPI_Startall_irecv(buflen, link->unit, nreqs, reqs));
  }

  buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
    } else { /* leaf to root */
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
    }
    PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction));
    PetscCallMPI(MPI_Startall_isend(buflen, link->unit, nreqs, reqs));
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas            = (PetscSF_Basic *)sf->data;
  const PetscMemType rootmtype_mpi  = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
  PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after receiving */));
  } else {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
  }
  PetscFunctionReturn(0);
}
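
/*
   For reference: MPI_Startall_irecv()/MPI_Startall_isend() above start requests that were set up earlier, i.e.
   they rely on MPI's persistent-request machinery when the SF is persistent. A minimal, standalone sketch of
   that MPI pattern (independent of PETSc; the two-rank setup, buffer size and iteration count are purely
   illustrative) is:

     #include <mpi.h>

     int main(int argc, char **argv)
     {
       double      sendbuf[16], recvbuf[16];
       MPI_Request reqs[2];
       int         rank, peer, i, it;

       MPI_Init(&argc, &argv);
       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
       peer = 1 - rank; // assumes exactly two ranks
       for (i = 0; i < 16; i++) sendbuf[i] = (double)rank;

       // Create the persistent requests once
       MPI_Recv_init(recvbuf, 16, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[0]);
       MPI_Send_init(sendbuf, 16, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[1]);

       // Reuse them across iterations: start, then wait
       for (it = 0; it < 10; it++) {
         MPI_Startall(2, reqs);
         MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
       }

       MPI_Request_free(&reqs[0]);
       MPI_Request_free(&reqs[1]);
       MPI_Finalize();
       return 0;
     }
*/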

/*
   The routine creates a communication link for the given operation. It first looks in its link cache. If
   there is a free & suitable one, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we do allocate buffers, we only allocate
   those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when one does not use GPU-aware MPI but the data is on the GPU.

   In SFBasic, MPI requests are persistent. They are initialized lazily, the first time we get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and need to pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it as a subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;

  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                   /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                       /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;
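
  /*
     As a concrete case: for a bcast with op == MPI_REPLACE and contiguous remote leaves, received values may
     overwrite leafdata in place, so leafdirect[PETSCSF_REMOTE] is true and (with GPU-aware MPI, or host data)
     leafdirect_mpi is 1, i.e., MPI receives straight into leafdata. With op == MPI_SUM instead, incoming values
     must be accumulated into leafdata, so a separate leaf buffer is needed and leafdirect_mpi is 0 even though
     the leaves are contiguous.
  */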

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in the cache */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
           If not, free the old requests. New requests will be lazily initialized when one calls PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from the available list */
        goto found;
      }
    }
  }

  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which ones need to be freed in Destroy */

  for (i = 0; i < 2; i++) {     /* Two communication directions */
    for (j = 0; j < 2; j++) {   /* Two memory types */
      for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
      }
    }
  }
  link->StartCommunication  = PetscSFLinkStartRequests_MPI;
  link->FinishCommunication = PetscSFLinkWaitRequests_MPI;

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
  #endif
  }
#endif

  /* Allocate buffers for root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate host buffers for staging device data in case GPU-aware MPI is not used */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set the `current` state of the link. It may change between different SF invocations that reuse the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and used to init MPI requests, record it for comparison in future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}
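
/*
   For orientation, a rough sketch (steps named loosely; these are not actual routine names) of how a link
   returned by PetscSFLinkCreate_MPI is typically driven by an SF operation such as a broadcast. The real
   control flow lives in the SFBasic/SFNeighbor implementations:

     Begin (e.g. a bcast):
       - create or fetch a link:  PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, PETSCSF_BCAST, &link)
       - pack remote roots into link->rootbuf (skipped when rootdirect is true, since rootbuf then aliases rootdata)
       - start communication:     (*link->StartCommunication)(sf, link, PETSCSF_ROOT2LEAF)

     End:
       - look up the in-use link by its (rootdata, leafdata) keys
       - finish communication:    (*link->FinishCommunication)(sf, link, PETSCSF_ROOT2LEAF)
       - unpack/reduce link->leafbuf into leafdata with op (skipped when leafdirect is true)
       - return the link to bas->avail so later operations can reuse its buffers and persistent requests
*/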