#include <../src/vec/is/sf/impls/basic/sfpack.h>

// Though there is no default mechanism to start a communication, we have a
// default mechanism to finish a communication, which is just waiting on the requests.
// It should work for both non-blocking or persistent send/recvs or collectives.
static PetscErrorCode PetscSFLinkFinishCommunication_Default(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas            = (PetscSF_Basic *)sf->data;
  const PetscMemType rootmtype_mpi  = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Wait on the requests selected by (direction, mtype seen by MPI, direct-use flag) for this link */
  if (bas->nrootreqs) PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
  if (sf->nleafreqs) PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  /* Without GPU-aware MPI, data was received into a host buffer; copy it to the device-side buffer on the receiving side */
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
  } else {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  The routine creates a communication link for the given operation. It first looks up its link cache. If
  there is a free & suitable one, it uses it. Otherwise it creates a new one.

  A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
  root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
  can be directly passed to MPI, we won't allocate them. Even when we allocate buffers, we only allocate
  those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.

  The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.

  In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

  The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
  need pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;  /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;

  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                       /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    /* Without GPU-aware MPI, MPI only ever touches host memory */
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check with MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests matches with the current.
           If not, free old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from available list */
        goto found;
      }
    }
  }

  /* No cached link matched; build a fresh one */
  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

  /* 8 = 2 directions x 2 memory types x 2 direct-use flags; requests for all combinations live in one array */
  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  if (nreqs)
    for (i = 0; i < 2; i++) {     /* Two communication directions */
      for (j = 0; j < 2; j++) {   /* Two memory types */
        for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
          link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
          link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
        }
      }
    }

  link->FinishCommunication = PetscSFLinkFinishCommunication_Default;
  // each SF type could customize their communication by setting function pointers in the link.
  // Currently only BASIC and NEIGHBOR use this abstraction.
  PetscTryTypeMethod(sf, SetCommunicationOps, link);

found:

#if defined(PETSC_HAVE_DEVICE)
  /* Lazily set up device support (streams etc) the first time the link sees device data */
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
#if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
#endif
#if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
#endif
#if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
#endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on host for buffering data on device in case one does not use_gpu_aware_mpi */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  /* Push the link onto the in-use list; PetscSFXxxEnd will find it there by root/leafdata */
  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(PETSC_SUCCESS);
}