1 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 2 #include <../src/vec/is/sf/impls/basic/sfpack.h> 3 #include <petsc/private/viewerimpl.h> 4 5 // Init persistent MPI send/recv requests 6 static PetscErrorCode PetscSFLinkInitMPIRequests_Persistent_Basic(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 7 { 8 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 9 PetscInt i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks; 10 const PetscInt *rootoffset, *leafoffset; 11 MPI_Aint disp; 12 MPI_Comm comm = PetscObjectComm((PetscObject)sf); 13 MPI_Datatype unit = link->unit; 14 const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 15 const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi; 16 17 PetscFunctionBegin; 18 if (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 19 PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL)); 20 if (direction == PETSCSF_LEAF2ROOT) { 21 for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) { 22 disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; 23 cnt = rootoffset[i + 1] - rootoffset[i]; 24 PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j)); 25 } 26 } else { /* PETSCSF_ROOT2LEAF */ 27 for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) { 28 disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; 29 cnt = rootoffset[i + 1] - rootoffset[i]; 30 PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j)); 31 } 32 } 33 link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 34 } 35 36 if (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 37 PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL)); 38 if (direction == PETSCSF_LEAF2ROOT) { 39 for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) { 40 disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes; 41 cnt = leafoffset[i + 1] - leafoffset[i]; 42 PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j)); 43 } 44 } else { /* PETSCSF_ROOT2LEAF */ 45 for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) { 46 disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes; 47 cnt = leafoffset[i + 1] - leafoffset[i]; 48 PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j)); 49 } 50 } 51 link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 52 } 53 PetscFunctionReturn(PETSC_SUCCESS); 54 } 55 56 // Start MPI requests. If use non-GPU aware MPI, we might need to copy data from device buf to host buf 57 static PetscErrorCode PetscSFLinkStartCommunication_Persistent_Basic(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 58 { 59 PetscMPIInt nsreqs = 0, nrreqs = 0; 60 MPI_Request *sreqs = NULL, *rreqs = NULL; 61 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 62 PetscInt sbuflen, rbuflen; 63 64 PetscFunctionBegin; 65 rbuflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE]; 66 if (rbuflen) { 67 if (direction == PETSCSF_ROOT2LEAF) { 68 nrreqs = sf->nleafreqs; 69 PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &rreqs)); 70 } else { /* leaf to root */ 71 nrreqs = bas->nrootreqs; 72 PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &rreqs, NULL)); 73 } 74 } 75 76 sbuflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE]; 77 if (sbuflen) { 78 if (direction == PETSCSF_ROOT2LEAF) { 79 nsreqs = bas->nrootreqs; 80 PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /*device2host before sending */)); 81 PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &sreqs, NULL)); 82 } else { /* leaf to root */ 83 nsreqs = sf->nleafreqs; 84 PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE)); 85 PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &sreqs)); 86 } 87 } 88 PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link)); // need to sync the stream to make BOTH sendbuf and recvbuf ready 89 if (rbuflen) PetscCallMPI(MPI_Startall_irecv(rbuflen, link->unit, nrreqs, rreqs)); 90 if (sbuflen) PetscCallMPI(MPI_Startall_isend(sbuflen, link->unit, nsreqs, sreqs)); 91 PetscFunctionReturn(PETSC_SUCCESS); 92 } 93 94 #if defined(PETSC_HAVE_MPIX_STREAM) 95 // issue MPIX_Isend/Irecv_enqueue() 96 static PetscErrorCode PetscSFLinkStartCommunication_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 97 { 98 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 99 PetscInt i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks; 100 const PetscInt *rootoffset, *leafoffset; 101 MPI_Aint disp; 102 MPI_Comm stream_comm = sf->stream_comm; 103 MPI_Datatype unit = link->unit; 104 const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 105 const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi; 106 107 PetscFunctionBegin; 108 if (bas->rootbuflen[PETSCSF_REMOTE]) { 109 PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL)); 110 if (direction == PETSCSF_LEAF2ROOT) { 111 for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) { 112 disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; 113 cnt = rootoffset[i + 1] - rootoffset[i]; 114 PetscCallMPI(MPIX_Irecv_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j)); 115 } 116 } else { // PETSCSF_ROOT2LEAF 117 for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) { 118 disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; 119 cnt = rootoffset[i + 1] - rootoffset[i]; 120 // no need to sync the gpu stream! 121 PetscCallMPI(MPIX_Isend_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j)); 122 } 123 } 124 } 125 126 if (sf->leafbuflen[PETSCSF_REMOTE]) { 127 PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL)); 128 if (direction == PETSCSF_LEAF2ROOT) { 129 for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) { 130 disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes; 131 cnt = leafoffset[i + 1] - leafoffset[i]; 132 // no need to sync the gpu stream! 133 PetscCallMPI(MPIX_Isend_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j)); 134 } 135 } else { // PETSCSF_ROOT2LEAF 136 for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) { 137 disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes; 138 cnt = leafoffset[i + 1] - leafoffset[i]; 139 PetscCallMPI(MPIX_Irecv_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j)); 140 } 141 } 142 } 143 PetscFunctionReturn(PETSC_SUCCESS); 144 } 145 146 static PetscErrorCode PetscSFLinkFinishCommunication_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 147 { 148 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 149 const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; 150 const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi; 151 152 PetscFunctionBegin; 153 PetscCallMPI(MPIX_Waitall_enqueue(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE)); 154 PetscCallMPI(MPIX_Waitall_enqueue(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE)); 155 PetscFunctionReturn(PETSC_SUCCESS); 156 } 157 #endif 158 159 static PetscErrorCode PetscSFSetCommunicationOps_Basic(PetscSF sf, PetscSFLink link) 160 { 161 PetscFunctionBegin; 162 link->InitMPIRequests = PetscSFLinkInitMPIRequests_Persistent_Basic; 163 link->StartCommunication = PetscSFLinkStartCommunication_Persistent_Basic; 164 #if defined(PETSC_HAVE_MPIX_STREAM) 165 const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; 166 if (sf->use_stream_aware_mpi && (PetscMemTypeDevice(rootmtype_mpi) || PetscMemTypeDevice(leafmtype_mpi))) { 167 link->StartCommunication = PetscSFLinkStartCommunication_MPIX_Stream; 168 link->FinishCommunication = PetscSFLinkFinishCommunication_MPIX_Stream; 169 } 170 #endif 171 PetscFunctionReturn(PETSC_SUCCESS); 172 } 173 174 /*===================================================================================*/ 175 /* SF public interface implementations */ 176 /*===================================================================================*/ 177 PETSC_INTERN PetscErrorCode PetscSFSetUp_Basic(PetscSF sf) 178 { 179 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 180 PetscInt *rlengths, *ilengths, i, nRemoteRootRanks, nRemoteLeafRanks; 181 PetscMPIInt rank, niranks, *iranks, tag; 182 MPI_Comm comm; 183 MPI_Group group; 184 MPI_Request *rootreqs, *leafreqs; 185 186 PetscFunctionBegin; 187 PetscCallMPI(MPI_Comm_group(PETSC_COMM_SELF, &group)); 188 PetscCall(PetscSFSetUpRanks(sf, group)); 189 PetscCallMPI(MPI_Group_free(&group)); 190 PetscCall(PetscObjectGetComm((PetscObject)sf, &comm)); 191 PetscCall(PetscObjectGetNewTag((PetscObject)sf, &tag)); 192 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 193 /* 194 * Inform roots about how many leaves and from which ranks 195 */ 196 PetscCall(PetscMalloc1(sf->nranks, &rlengths)); 197 /* Determine number, sending ranks and length of incoming */ 198 for (i = 0; i < sf->nranks; i++) { rlengths[i] = sf->roffset[i + 1] - sf->roffset[i]; /* Number of roots referenced by my leaves; for rank sf->ranks[i] */ } 199 nRemoteRootRanks = sf->nranks - sf->ndranks; 200 PetscCall(PetscCommBuildTwoSided(comm, 1, MPIU_INT, nRemoteRootRanks, PetscSafePointerPlusOffset(sf->ranks, sf->ndranks), PetscSafePointerPlusOffset(rlengths, sf->ndranks), &niranks, &iranks, (void **)&ilengths)); 201 202 /* Sort iranks. See use of VecScatterGetRemoteOrdered_Private() in MatGetBrowsOfAoCols_MPIAIJ() on why. 203 We could sort ranks there at the price of allocating extra working arrays. Presumably, niranks is 204 small and the sorting is cheap. 205 */ 206 PetscCall(PetscSortMPIIntWithIntArray(niranks, iranks, ilengths)); 207 208 /* Partition into distinguished and non-distinguished incoming ranks */ 209 bas->ndiranks = sf->ndranks; 210 bas->niranks = bas->ndiranks + niranks; 211 PetscCall(PetscMalloc2(bas->niranks, &bas->iranks, bas->niranks + 1, &bas->ioffset)); 212 bas->ioffset[0] = 0; 213 for (i = 0; i < bas->ndiranks; i++) { 214 bas->iranks[i] = sf->ranks[i]; 215 bas->ioffset[i + 1] = bas->ioffset[i] + rlengths[i]; 216 } 217 PetscCheck(bas->ndiranks <= 1 && (bas->ndiranks != 1 || bas->iranks[0] == rank), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Broken setup for shared ranks"); 218 for (; i < bas->niranks; i++) { 219 bas->iranks[i] = iranks[i - bas->ndiranks]; 220 bas->ioffset[i + 1] = bas->ioffset[i] + ilengths[i - bas->ndiranks]; 221 } 222 bas->itotal = bas->ioffset[i]; 223 PetscCall(PetscFree(rlengths)); 224 PetscCall(PetscFree(iranks)); 225 PetscCall(PetscFree(ilengths)); 226 227 /* Send leaf identities to roots */ 228 nRemoteLeafRanks = bas->niranks - bas->ndiranks; 229 PetscCall(PetscMalloc1(bas->itotal, &bas->irootloc)); 230 PetscCall(PetscMalloc2(nRemoteLeafRanks, &rootreqs, nRemoteRootRanks, &leafreqs)); 231 for (i = bas->ndiranks; i < bas->niranks; i++) PetscCallMPI(MPIU_Irecv(bas->irootloc + bas->ioffset[i], bas->ioffset[i + 1] - bas->ioffset[i], MPIU_INT, bas->iranks[i], tag, comm, &rootreqs[i - bas->ndiranks])); 232 for (i = 0; i < sf->nranks; i++) { 233 PetscInt npoints = sf->roffset[i + 1] - sf->roffset[i]; 234 if (i < sf->ndranks) { 235 PetscCheck(sf->ranks[i] == rank, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot interpret distinguished leaf rank"); 236 PetscCheck(bas->iranks[0] == rank, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot interpret distinguished root rank"); 237 PetscCheck(npoints == bas->ioffset[1] - bas->ioffset[0], PETSC_COMM_SELF, PETSC_ERR_PLIB, "Distinguished rank exchange has mismatched lengths"); 238 PetscCall(PetscArraycpy(bas->irootloc + bas->ioffset[0], sf->rremote + sf->roffset[i], npoints)); 239 continue; 240 } 241 PetscCallMPI(MPIU_Isend(sf->rremote + sf->roffset[i], npoints, MPIU_INT, sf->ranks[i], tag, comm, &leafreqs[i - sf->ndranks])); 242 } 243 PetscCallMPI(MPI_Waitall(nRemoteLeafRanks, rootreqs, MPI_STATUSES_IGNORE)); 244 PetscCallMPI(MPI_Waitall(nRemoteRootRanks, leafreqs, MPI_STATUSES_IGNORE)); 245 246 sf->nleafreqs = nRemoteRootRanks; 247 bas->nrootreqs = nRemoteLeafRanks; 248 249 /* Setup fields related to packing, such as rootbuflen[] */ 250 PetscCall(PetscSFSetUpPackFields(sf)); 251 PetscCall(PetscFree2(rootreqs, leafreqs)); 252 PetscFunctionReturn(PETSC_SUCCESS); 253 } 254 255 PETSC_INTERN PetscErrorCode PetscSFReset_Basic(PetscSF sf) 256 { 257 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 258 PetscSFLink link = bas->avail, next; 259 260 PetscFunctionBegin; 261 PetscCheck(!bas->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Outstanding operation has not been completed"); 262 PetscCall(PetscFree2(bas->iranks, bas->ioffset)); 263 PetscCall(PetscFree(bas->irootloc)); 264 265 #if defined(PETSC_HAVE_DEVICE) 266 for (PetscInt i = 0; i < 2; i++) PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, bas->irootloc_d[i])); 267 #endif 268 269 #if defined(PETSC_HAVE_NVSHMEM) 270 PetscCall(PetscSFReset_Basic_NVSHMEM(sf)); 271 #endif 272 273 for (; link; link = next) { 274 next = link->next; 275 PetscCall(PetscSFLinkDestroy(sf, link)); 276 } 277 bas->avail = NULL; 278 PetscCall(PetscSFResetPackFields(sf)); 279 PetscFunctionReturn(PETSC_SUCCESS); 280 } 281 282 PETSC_INTERN PetscErrorCode PetscSFDestroy_Basic(PetscSF sf) 283 { 284 PetscFunctionBegin; 285 PetscCall(PetscSFReset_Basic(sf)); 286 PetscCall(PetscFree(sf->data)); 287 PetscFunctionReturn(PETSC_SUCCESS); 288 } 289 290 #if defined(PETSC_USE_SINGLE_LIBRARY) 291 #include <petscmat.h> 292 293 PETSC_INTERN PetscErrorCode PetscSFView_Basic_PatternAndSizes(PetscSF sf, PetscViewer viewer) 294 { 295 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 296 PetscInt i, nrootranks, ndrootranks; 297 const PetscInt *rootoffset; 298 PetscMPIInt rank, size; 299 const PetscMPIInt *rootranks; 300 MPI_Comm comm = PetscObjectComm((PetscObject)sf); 301 PetscScalar unitbytes; 302 Mat A; 303 304 PetscFunctionBegin; 305 PetscCallMPI(MPI_Comm_size(comm, &size)); 306 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 307 /* PetscSFView is most useful for the SF used in VecScatterBegin/End in MatMult etc, where we do 308 PetscSFBcast, i.e., roots send data to leaves. We dump the communication pattern into a matrix 309 in senders' view point: how many bytes I will send to my neighbors. 310 311 Looking at a column of the matrix, one can also know how many bytes the rank will receive from others. 312 313 If PetscSFLink bas->inuse is available, we can use that to get tree vertex size. But that would give 314 different interpretations for the same SF for different data types. Since we most care about VecScatter, 315 we uniformly treat each vertex as a PetscScalar. 316 */ 317 unitbytes = (PetscScalar)sizeof(PetscScalar); 318 319 PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, &rootranks, &rootoffset, NULL)); 320 PetscCall(MatCreateAIJ(comm, 1, 1, size, size, 1, NULL, nrootranks - ndrootranks, NULL, &A)); 321 PetscCall(MatSetOptionsPrefix(A, "__petsc_internal__")); /* To prevent the internal A from taking any command line options */ 322 for (i = 0; i < nrootranks; i++) PetscCall(MatSetValue(A, (PetscInt)rank, bas->iranks[i], (rootoffset[i + 1] - rootoffset[i]) * unitbytes, INSERT_VALUES)); 323 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 324 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 325 PetscCall(MatView(A, viewer)); 326 PetscCall(MatDestroy(&A)); 327 PetscFunctionReturn(PETSC_SUCCESS); 328 } 329 #endif 330 331 PETSC_INTERN PetscErrorCode PetscSFView_Basic(PetscSF sf, PetscViewer viewer) 332 { 333 PetscBool isascii; 334 335 PetscFunctionBegin; 336 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii)); 337 if (isascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) PetscCall(PetscViewerASCIIPrintf(viewer, " MultiSF sort=%s\n", sf->rankorder ? "rank-order" : "unordered")); 338 #if defined(PETSC_USE_SINGLE_LIBRARY) 339 else { 340 PetscBool isdraw, isbinary; 341 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 342 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 343 if ((isascii && viewer->format == PETSC_VIEWER_ASCII_MATLAB) || isdraw || isbinary) PetscCall(PetscSFView_Basic_PatternAndSizes(sf, viewer)); 344 } 345 #endif 346 PetscFunctionReturn(PETSC_SUCCESS); 347 } 348 349 PETSC_INTERN PetscErrorCode PetscSFBcastBegin_Basic(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op) 350 { 351 PetscSFLink link = NULL; 352 353 PetscFunctionBegin; 354 /* Create a communication link, which provides buffers, MPI requests etc (if MPI is used) */ 355 PetscCall(PetscSFLinkCreate(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, PETSCSF_BCAST, &link)); 356 /* Pack rootdata to rootbuf for remote communication */ 357 PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_REMOTE, rootdata)); 358 /* Start communication, e.g., post MPI_Isend */ 359 PetscCall(PetscSFLinkStartCommunication(sf, link, PETSCSF_ROOT2LEAF)); 360 /* Do local scatter (i.e., self to self communication), which overlaps with the remote communication above */ 361 PetscCall(PetscSFLinkScatterLocal(sf, link, PETSCSF_ROOT2LEAF, (void *)rootdata, leafdata, op)); 362 PetscFunctionReturn(PETSC_SUCCESS); 363 } 364 365 PETSC_INTERN PetscErrorCode PetscSFBcastEnd_Basic(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op) 366 { 367 PetscSFLink link = NULL; 368 369 PetscFunctionBegin; 370 /* Retrieve the link used in XxxBegin() with root/leafdata as key */ 371 PetscCall(PetscSFLinkGetInUse(sf, unit, rootdata, leafdata, PETSC_OWN_POINTER, &link)); 372 /* Finish remote communication, e.g., post MPI_Waitall */ 373 PetscCall(PetscSFLinkFinishCommunication(sf, link, PETSCSF_ROOT2LEAF)); 374 /* Unpack data in leafbuf to leafdata for remote communication */ 375 PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_REMOTE, leafdata, op)); 376 /* Recycle the link */ 377 PetscCall(PetscSFLinkReclaim(sf, &link)); 378 PetscFunctionReturn(PETSC_SUCCESS); 379 } 380 381 /* Shared by ReduceBegin and FetchAndOpBegin */ 382 static inline PetscErrorCode PetscSFLeafToRootBegin_Basic(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *out) 383 { 384 PetscSFLink link = NULL; 385 386 PetscFunctionBegin; 387 PetscCall(PetscSFLinkCreate(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, &link)); 388 PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_REMOTE, leafdata)); 389 PetscCall(PetscSFLinkStartCommunication(sf, link, PETSCSF_LEAF2ROOT)); 390 *out = link; 391 PetscFunctionReturn(PETSC_SUCCESS); 392 } 393 394 /* leaf -> root with reduction */ 395 PETSC_INTERN PetscErrorCode PetscSFReduceBegin_Basic(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op) 396 { 397 PetscSFLink link = NULL; 398 399 PetscFunctionBegin; 400 PetscCall(PetscSFLeafToRootBegin_Basic(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op, PETSCSF_REDUCE, &link)); 401 PetscCall(PetscSFLinkScatterLocal(sf, link, PETSCSF_LEAF2ROOT, rootdata, (void *)leafdata, op)); 402 PetscFunctionReturn(PETSC_SUCCESS); 403 } 404 405 PETSC_INTERN PetscErrorCode PetscSFReduceEnd_Basic(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op) 406 { 407 PetscSFLink link = NULL; 408 409 PetscFunctionBegin; 410 PetscCall(PetscSFLinkGetInUse(sf, unit, rootdata, leafdata, PETSC_OWN_POINTER, &link)); 411 PetscCall(PetscSFLinkFinishCommunication(sf, link, PETSCSF_LEAF2ROOT)); 412 PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_REMOTE, rootdata, op)); 413 PetscCall(PetscSFLinkReclaim(sf, &link)); 414 PetscFunctionReturn(PETSC_SUCCESS); 415 } 416 417 PETSC_INTERN PetscErrorCode PetscSFFetchAndOpBegin_Basic(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, void *leafupdate, MPI_Op op) 418 { 419 PetscSFLink link = NULL; 420 421 PetscFunctionBegin; 422 PetscCall(PetscSFLeafToRootBegin_Basic(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op, PETSCSF_FETCH, &link)); 423 PetscCall(PetscSFLinkFetchAndOpLocal(sf, link, rootdata, leafdata, leafupdate, op)); 424 PetscFunctionReturn(PETSC_SUCCESS); 425 } 426 427 PETSC_INTERN PetscErrorCode PetscSFFetchAndOpEnd_Basic(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op) 428 { 429 PetscSFLink link = NULL; 430 431 PetscFunctionBegin; 432 PetscCall(PetscSFLinkGetInUse(sf, unit, rootdata, leafdata, PETSC_OWN_POINTER, &link)); 433 /* This implementation could be changed to unpack as receives arrive, at the cost of non-determinism */ 434 PetscCall(PetscSFLinkFinishCommunication(sf, link, PETSCSF_LEAF2ROOT)); 435 /* Do fetch-and-op, the (remote) update results are in rootbuf */ 436 PetscCall(PetscSFLinkFetchAndOpRemote(sf, link, rootdata, op)); 437 /* Bcast rootbuf to leafupdate */ 438 PetscCall(PetscSFLinkStartCommunication(sf, link, PETSCSF_ROOT2LEAF)); 439 PetscCall(PetscSFLinkFinishCommunication(sf, link, PETSCSF_ROOT2LEAF)); 440 /* Unpack and insert fetched data into leaves */ 441 PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_REMOTE, leafupdate, MPI_REPLACE)); 442 PetscCall(PetscSFLinkReclaim(sf, &link)); 443 PetscFunctionReturn(PETSC_SUCCESS); 444 } 445 446 PETSC_INTERN PetscErrorCode PetscSFGetLeafRanks_Basic(PetscSF sf, PetscInt *niranks, const PetscMPIInt **iranks, const PetscInt **ioffset, const PetscInt **irootloc) 447 { 448 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 449 450 PetscFunctionBegin; 451 if (niranks) *niranks = bas->niranks; 452 if (iranks) *iranks = bas->iranks; 453 if (ioffset) *ioffset = bas->ioffset; 454 if (irootloc) *irootloc = bas->irootloc; 455 PetscFunctionReturn(PETSC_SUCCESS); 456 } 457 458 /* An optimized PetscSFCreateEmbeddedRootSF. We aggressively make use of the established communication on sf. 459 We need one bcast on sf, and no communication anymore to build the embedded sf. Note that selected[] 460 was sorted before calling the routine. 461 */ 462 PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedRootSF_Basic(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *newsf) 463 { 464 PetscSF esf; 465 PetscInt esf_nranks, esf_ndranks, *esf_roffset, *esf_rmine, *esf_rremote; 466 PetscInt i, j, p, q, nroots, esf_nleaves, *new_ilocal, nranks, ndranks, niranks, ndiranks, minleaf, maxleaf, maxlocal; 467 char *rootdata, *leafdata, *leafmem; /* Only stores 0 or 1, so we can save memory with char */ 468 PetscMPIInt *esf_ranks; 469 const PetscMPIInt *ranks, *iranks; 470 const PetscInt *roffset, *rmine, *rremote, *ioffset, *irootloc; 471 PetscBool connected; 472 PetscSFNode *new_iremote; 473 PetscSF_Basic *bas; 474 475 PetscFunctionBegin; 476 PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sf), &esf)); 477 PetscCall(PetscSFSetFromOptions(esf)); 478 PetscCall(PetscSFSetType(esf, PETSCSFBASIC)); /* This optimized routine can only create a basic sf */ 479 480 /* Find out which leaves are still connected to roots in the embedded sf by doing a Bcast */ 481 PetscCall(PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL)); 482 PetscCall(PetscSFGetLeafRange(sf, &minleaf, &maxleaf)); 483 maxlocal = maxleaf - minleaf + 1; 484 PetscCall(PetscCalloc2(nroots, &rootdata, maxlocal, &leafmem)); 485 leafdata = PetscSafePointerPlusOffset(leafmem, -minleaf); 486 /* Tag selected roots */ 487 for (i = 0; i < nselected; ++i) rootdata[selected[i]] = 1; 488 489 PetscCall(PetscSFBcastBegin(sf, MPI_CHAR, rootdata, leafdata, MPI_REPLACE)); 490 PetscCall(PetscSFBcastEnd(sf, MPI_CHAR, rootdata, leafdata, MPI_REPLACE)); 491 PetscCall(PetscSFGetLeafInfo_Basic(sf, &nranks, &ndranks, &ranks, &roffset, &rmine, &rremote)); /* Get send info */ 492 esf_nranks = esf_ndranks = esf_nleaves = 0; 493 for (i = 0; i < nranks; i++) { 494 connected = PETSC_FALSE; /* Is this process still connected to this remote root rank? */ 495 for (j = roffset[i]; j < roffset[i + 1]; j++) { 496 if (leafdata[rmine[j]]) { 497 esf_nleaves++; 498 connected = PETSC_TRUE; 499 } 500 } 501 if (connected) { 502 esf_nranks++; 503 if (i < ndranks) esf_ndranks++; 504 } 505 } 506 507 /* Set graph of esf and also set up its outgoing communication (i.e., send info), which is usually done by PetscSFSetUpRanks */ 508 PetscCall(PetscMalloc1(esf_nleaves, &new_ilocal)); 509 PetscCall(PetscMalloc1(esf_nleaves, &new_iremote)); 510 PetscCall(PetscMalloc4(esf_nranks, &esf_ranks, esf_nranks + 1, &esf_roffset, esf_nleaves, &esf_rmine, esf_nleaves, &esf_rremote)); 511 p = 0; /* Counter for connected root ranks */ 512 q = 0; /* Counter for connected leaves */ 513 esf_roffset[0] = 0; 514 for (i = 0; i < nranks; i++) { /* Scan leaf data again to fill esf arrays */ 515 connected = PETSC_FALSE; 516 for (j = roffset[i]; j < roffset[i + 1]; j++) { 517 if (leafdata[rmine[j]]) { 518 esf_rmine[q] = new_ilocal[q] = rmine[j]; 519 esf_rremote[q] = rremote[j]; 520 new_iremote[q].index = rremote[j]; 521 new_iremote[q].rank = ranks[i]; 522 connected = PETSC_TRUE; 523 q++; 524 } 525 } 526 if (connected) { 527 esf_ranks[p] = ranks[i]; 528 esf_roffset[p + 1] = q; 529 p++; 530 } 531 } 532 533 /* SetGraph internally resets the SF, so we only set its fields after the call */ 534 PetscCall(PetscSFSetGraph(esf, nroots, esf_nleaves, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER)); 535 esf->nranks = esf_nranks; 536 esf->ndranks = esf_ndranks; 537 esf->ranks = esf_ranks; 538 esf->roffset = esf_roffset; 539 esf->rmine = esf_rmine; 540 esf->rremote = esf_rremote; 541 esf->nleafreqs = esf_nranks - esf_ndranks; 542 543 /* Set up the incoming communication (i.e., recv info) stored in esf->data, which is usually done by PetscSFSetUp_Basic */ 544 bas = (PetscSF_Basic *)esf->data; 545 PetscCall(PetscSFGetRootInfo_Basic(sf, &niranks, &ndiranks, &iranks, &ioffset, &irootloc)); /* Get recv info */ 546 /* Embedded sf always has simpler communication than the original one. We might allocate longer arrays than needed here. But we 547 we do not care since these arrays are usually short. The benefit is we can fill these arrays by just parsing irootloc once. 548 */ 549 PetscCall(PetscMalloc2(niranks, &bas->iranks, niranks + 1, &bas->ioffset)); 550 PetscCall(PetscMalloc1(ioffset[niranks], &bas->irootloc)); 551 bas->niranks = bas->ndiranks = bas->ioffset[0] = 0; 552 p = 0; /* Counter for connected leaf ranks */ 553 q = 0; /* Counter for connected roots */ 554 for (i = 0; i < niranks; i++) { 555 connected = PETSC_FALSE; /* Is the current process still connected to this remote leaf rank? */ 556 for (j = ioffset[i]; j < ioffset[i + 1]; j++) { 557 if (rootdata[irootloc[j]]) { 558 bas->irootloc[q++] = irootloc[j]; 559 connected = PETSC_TRUE; 560 } 561 } 562 if (connected) { 563 bas->niranks++; 564 if (i < ndiranks) bas->ndiranks++; /* Note that order of ranks (including distinguished ranks) is kept */ 565 bas->iranks[p] = iranks[i]; 566 bas->ioffset[p + 1] = q; 567 p++; 568 } 569 } 570 bas->itotal = q; 571 bas->nrootreqs = bas->niranks - bas->ndiranks; 572 esf->persistent = PETSC_TRUE; 573 /* Setup packing related fields */ 574 PetscCall(PetscSFSetUpPackFields(esf)); 575 576 /* Copy from PetscSFSetUp(), since this method wants to skip PetscSFSetUp(). */ 577 #if defined(PETSC_HAVE_CUDA) 578 if (esf->backend == PETSCSF_BACKEND_CUDA) { 579 esf->ops->Malloc = PetscSFMalloc_CUDA; 580 esf->ops->Free = PetscSFFree_CUDA; 581 } 582 #endif 583 584 #if defined(PETSC_HAVE_HIP) 585 /* TODO: Needs debugging */ 586 if (esf->backend == PETSCSF_BACKEND_HIP) { 587 esf->ops->Malloc = PetscSFMalloc_HIP; 588 esf->ops->Free = PetscSFFree_HIP; 589 } 590 #endif 591 592 #if defined(PETSC_HAVE_KOKKOS) 593 if (esf->backend == PETSCSF_BACKEND_KOKKOS) { 594 esf->ops->Malloc = PetscSFMalloc_Kokkos; 595 esf->ops->Free = PetscSFFree_Kokkos; 596 } 597 #endif 598 esf->setupcalled = PETSC_TRUE; /* We have done setup ourselves! */ 599 PetscCall(PetscFree2(rootdata, leafmem)); 600 *newsf = esf; 601 PetscFunctionReturn(PETSC_SUCCESS); 602 } 603 604 PETSC_EXTERN PetscErrorCode PetscSFCreate_Basic(PetscSF sf) 605 { 606 PetscSF_Basic *dat; 607 608 PetscFunctionBegin; 609 sf->ops->SetUp = PetscSFSetUp_Basic; 610 sf->ops->Reset = PetscSFReset_Basic; 611 sf->ops->Destroy = PetscSFDestroy_Basic; 612 sf->ops->View = PetscSFView_Basic; 613 sf->ops->BcastBegin = PetscSFBcastBegin_Basic; 614 sf->ops->BcastEnd = PetscSFBcastEnd_Basic; 615 sf->ops->ReduceBegin = PetscSFReduceBegin_Basic; 616 sf->ops->ReduceEnd = PetscSFReduceEnd_Basic; 617 sf->ops->FetchAndOpBegin = PetscSFFetchAndOpBegin_Basic; 618 sf->ops->FetchAndOpEnd = PetscSFFetchAndOpEnd_Basic; 619 sf->ops->GetLeafRanks = PetscSFGetLeafRanks_Basic; 620 sf->ops->CreateEmbeddedRootSF = PetscSFCreateEmbeddedRootSF_Basic; 621 sf->ops->SetCommunicationOps = PetscSFSetCommunicationOps_Basic; 622 623 sf->persistent = PETSC_TRUE; // currently SFBASIC always uses persistent send/recv 624 sf->collective = PETSC_FALSE; 625 626 PetscCall(PetscNew(&dat)); 627 sf->data = (void *)dat; 628 PetscFunctionReturn(PETSC_SUCCESS); 629 } 630