/* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc have a persistent version,
   we can also do abstractions like Prepare/StartCommunication.
*/

#include <../src/vec/is/sf/impls/basic/sfpack.h>

/* Start MPI requests. If we use non-GPU aware MPI, we might need to copy data from device buf to host buf */
static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
{
  PetscErrorCode ierr;
  PetscMPIInt    nreqs;
  MPI_Request    *reqs = NULL;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       buflen;

  PetscFunctionBegin;
  /* Step 1: start the receives. The receiving side is leaves for ROOT2LEAF and roots for LEAF2ROOT.
     Receives are started before the sends below (the usual MPI practice to help match incoming messages). */
  buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = sf->nleafreqs;
      ierr  = PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,NULL,&reqs);CHKERRQ(ierr); /* get leaf requests */
    } else { /* leaf to root */
      nreqs = bas->nrootreqs;
      ierr  = PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,&reqs,NULL);CHKERRQ(ierr); /* get root requests */
    }
    ierr = MPI_Startall_irecv(buflen,link->unit,nreqs,reqs);CHKERRMPI(ierr);
  }

  /* Step 2: start the sends. The sending side is roots for ROOT2LEAF and leaves for LEAF2ROOT. */
  buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = bas->nrootreqs;
      /* With non-GPU-aware MPI and device data, stage the send buffer on the host before MPI sees it */
      ierr  = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host before sending */);CHKERRQ(ierr);
      ierr  = PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,&reqs,NULL);CHKERRQ(ierr);
    } else { /* leaf to root */
      nreqs = sf->nleafreqs;
      ierr  = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
      ierr  = PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,NULL,&reqs);CHKERRQ(ierr);
    }
    /* Make sure packing (and any device-to-host copy) queued on the stream has finished before MPI reads the buffer */
    ierr = PetscSFLinkSyncStreamBeforeCallMPI(sf,link,direction);CHKERRQ(ierr);
    ierr = MPI_Startall_isend(buflen,link->unit,nreqs,reqs);CHKERRMPI(ierr);
  }
  PetscFunctionReturn(0);
}

/* Wait until all send/recv requests of the link in the given direction have completed, then, with
   non-GPU-aware MPI, copy received data from the host staging buffer back to the device if needed */
static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Unused request slots are MPI_REQUEST_NULL, so it is safe to wait on both arrays unconditionally */
  ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
  ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
  if (direction == PETSCSF_ROOT2LEAF) {
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE/* host2device after recving */);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   The routine creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable one, it uses it.
   Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we do allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`; in other words, we do lazy memory-allocation.

   The routine also allocates buffers on the CPU when one does not use gpu-aware MPI but data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not initialized until one tries to get requests from a
   link with PetscSFLinkGetMPIBuffersAndRequests().

   The routine is shared by SFBasic and SFNeighbor, based on the fact that they both deal with sparse graphs
   and need to pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscInt         i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink      *p,link;
  PetscSFDirection direction;
  MPI_Request      *reqs = NULL;
  PetscBool        match,rootdirect[2],leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;

  /* Can we directly use root/leafdata with the given sf, sfop and op (i.e., skip the pack/unpack buffers)? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  /* Without gpu-aware MPI, MPI only ever sees host memory (device data is staged on the host) */
  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    if (!link->use_nvshmem) { /* Only check with MPI links */
      ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests
           matches the current data. If not, free the old requests. New requests will be lazily initialized when
           one calls PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from available list */
        goto found;
      }
    }
  }

  /* No reusable link found; build a new one */
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* 8 = 2 directions x 2 memory types x 2 (root/leafdirect 0 or 1); see the partitioning loop below */
  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }
  link->StartCommunication  = PetscSFLinkStartRequests_MPI;
  link->FinishCommunication = PetscSFLinkWaitRequests_MPI;

found:

#if defined(PETSC_HAVE_DEVICE)
  /* Do the device-specific setup once per link, only if device memory is actually involved */
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_CUDA(sf,link,unit);CHKERRQ(ierr);} /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP)    {ierr = PetscSFLinkSetUp_HIP(sf,link,unit);CHKERRQ(ierr);} /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
  #endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf; allocated lazily and cached on the link */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) { /* Similarly, disguise leafdata as leafbuf */
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on host for buffering data on device in case one does not use_gpu_aware_mpi */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  /* Move the link to the in-use list; it is returned to the avail list in PetscSFXxxEnd */
  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}