/* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc. have a persistent version,
   we can also do abstractions like Prepare/StartCommunication.
*/

#include <../src/vec/is/sf/impls/basic/sfpack.h>

/* Start MPI requests. If not using GPU-aware MPI, we might need to copy data from the device buffer to the host buffer */
static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
{
  PetscMPIInt    nreqs;
  MPI_Request   *reqs = NULL;
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
  PetscInt       buflen;

  PetscFunctionBegin;
  /* Start the persistent receive requests of the receiving side */
  buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,NULL,&reqs));
    } else { /* leaf to root */
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,&reqs,NULL));
    }
    PetscCallMPI(MPI_Startall_irecv(buflen,link->unit,nreqs,reqs));
  }

  /* Start the persistent send requests of the sending side */
  buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/* device2host before sending */));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,&reqs,NULL));
    } else { /* leaf to root */
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,NULL,&reqs));
    }
    PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf,link,direction));
    PetscCallMPI(MPI_Startall_isend(buflen,link->unit,nreqs,reqs));
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
{
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  const PetscMemType  rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt      rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE));
  PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE));
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE/* host2device after receiving */));
  } else {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE));
  }
  PetscFunctionReturn(0);
}

/*
   This routine creates a communication link for the given operation. It first looks up the link cache. If
   there is a free & suitable link, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   This routine also allocates host buffers when GPU-aware MPI is not used but the data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not initialized until we try to get requests from a link.

   This routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and need to
   pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink      *p,link;
  PetscSFDirection  direction;
  MPI_Request      *reqs = NULL;
  PetscBool         match,rootdirect[2],leafdirect[2];
  PetscMemType      rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to HOST/DEVICE (0/1) since it is used as an array subscript */
  PetscMemType      leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType      rootmtype_mpi,leafmtype_mpi; /* mtypes as seen by MPI */
  PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect as seen by MPI */

  PetscFunctionBegin;
  /* Can we use root/leafdata directly with the given sf, sfop and op? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on the host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in the cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      PetscCall(MPIPetsc_Type_compare(unit,link->unit,&match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
           If not, free the old requests. New requests are lazily initialized when one calls PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));}
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));}
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from the available list */
        goto found;
      }
    }
  }

  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf,link,unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag)); /* One tag per link */

  nreqs = (nrootreqs+nleafreqs)*8; /* 8 = 2 directions x 2 memory types x 2 root/leafdirect flags */
  PetscCall(PetscMalloc1(nreqs,&link->reqs));
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to MPI_REQUEST_NULL so that we know which requests need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }
  link->StartCommunication  = PetscSFLinkStartRequests_MPI;
  link->FinishCommunication = PetscSFLinkWaitRequests_MPI;

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA)   PetscCall(PetscSFLinkSetUp_CUDA(sf,link,unit));   /* Set up streams etc. */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP)    PetscCall(PetscSFLinkSetUp_HIP(sf,link,unit));    /* Set up streams etc. */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf,link,unit));
  #endif
  }
#endif

  /* Allocate buffers for root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Use rootdata directly as the rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Need a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          PetscCall(PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]));
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          PetscCall(PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]));
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate host buffers for staging device data in case GPU-aware MPI is not used */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set the `current` state of the link; these fields may change between SF invocations that reuse the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and initializes MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are the keys used to look up links in PetscSFXxxEnd() */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}
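
/* Illustrative sketch (not compiled): how an SF implementation might drive a link returned by
   PetscSFLinkCreate_MPI() for a broadcast. The helper name PetscSFLinkSketchBcast is hypothetical and the
   pack/unpack steps are elided; only the PetscSFLinkCreate_MPI() signature and the StartCommunication/
   FinishCommunication members set above are taken from this file. */
#if 0
static PetscErrorCode PetscSFLinkSketchBcast(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  /* Find a cached link (or create one) matching the unit, memory types, op and SF operation */
  PetscCall(PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,MPI_REPLACE,PETSCSF_BCAST,&link));
  /* ... pack remote roots into link->rootbuf[PETSCSF_REMOTE][] here ... */
  PetscCall((*link->StartCommunication)(sf,link,PETSCSF_ROOT2LEAF));  /* start the persistent recvs/sends */
  /* ... perform local (self-to-self) communication here to overlap with MPI ... */
  PetscCall((*link->FinishCommunication)(sf,link,PETSCSF_ROOT2LEAF)); /* wait on requests; stage host->device if needed */
  /* ... unpack link->leafbuf[PETSCSF_REMOTE][] into leafdata here ... */
  PetscFunctionReturn(0);
}
#endif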