/* /petsc/src/vec/is/sf/impls/basic/sfbasic.c (revision 88ec11df0f7a3325350fdab2975c1e95506c5973) */
#include <../src/vec/is/sf/impls/basic/sfbasic.h>
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <petsc/private/viewerimpl.h>

// Init persistent MPI send/recv requests
static PetscErrorCode PetscSFLinkInitMPIRequests_Persistent_Basic(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt    *rootoffset, *leafoffset;
  MPI_Aint           disp;
  MPI_Comm           comm          = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit          = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  if (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
    if (direction == PETSCSF_LEAF2ROOT) {
      for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
        cnt  = rootoffset[i + 1] - rootoffset[i];
        PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
      }
    } else { /* PETSCSF_ROOT2LEAF */
      for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
        cnt  = rootoffset[i + 1] - rootoffset[i];
        PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
      }
    }
    link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
  }

  if (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
    if (direction == PETSCSF_LEAF2ROOT) {
      for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
        cnt  = leafoffset[i + 1] - leafoffset[i];
        PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
      }
    } else { /* PETSCSF_ROOT2LEAF */
      for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
        cnt  = leafoffset[i + 1] - leafoffset[i];
        PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
      }
    }
    link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
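
/*
   A minimal standalone sketch (not part of PETSc; buf/cnt/peer/niter are placeholders) of the
   persistent-request pattern relied upon above: requests are created once with
   MPI_Send_init/MPI_Recv_init and then restarted cheaply with MPI_Startall for every round of
   communication, instead of re-posting MPI_Isend/MPI_Irecv each time.

     MPI_Request reqs[2];
     MPI_Recv_init(recvbuf, cnt, MPI_DOUBLE, peer, tag, comm, &reqs[0]);
     MPI_Send_init(sendbuf, cnt, MPI_DOUBLE, peer, tag, comm, &reqs[1]);
     for (int iter = 0; iter < niter; iter++) {
       MPI_Startall(2, reqs);                       // restart both persistent requests
       MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);   // complete this round
     }
     MPI_Request_free(&reqs[0]);
     MPI_Request_free(&reqs[1]);
*/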

// Start MPI requests. If using non-GPU-aware MPI, we might need to copy data from the device buffer to the host buffer
static PetscErrorCode PetscSFLinkStartCommunication_Persistent_Basic(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscMPIInt    nreqs;
  MPI_Request   *reqs = NULL;
  PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
  PetscInt       buflen;

  PetscFunctionBegin;
  buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
    } else { /* leaf to root */
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
    }
    PetscCallMPI(MPI_Startall_irecv(buflen, link->unit, nreqs, reqs));
  }

  buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /*device2host before sending */));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
    } else { /* leaf to root */
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
    }
    PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction));
    PetscCallMPI(MPI_Startall_isend(buflen, link->unit, nreqs, reqs));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
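
/*
   Conceptual sketch (assuming CUDA; names are placeholders, not PETSc API) of the device-to-host
   staging that PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI() performs when the MPI library
   cannot access device memory: the packed buffer is copied to the host and the stream is
   synchronized before the persistent sends are started.

     cudaMemcpyAsync(hostbuf, devicebuf, nbytes, cudaMemcpyDeviceToHost, stream);
     cudaStreamSynchronize(stream);   // make sure the data is on the host before MPI touches it
     MPI_Startall(nreqs, reqs);       // the persistent sends now read the host buffer
*/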

#if defined(PETSC_HAVE_MPIX_STREAM)
// issue MPIX_Isend/Irecv_enqueue()
static PetscErrorCode PetscSFLinkStartCommunication_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt    *rootoffset, *leafoffset;
  MPI_Aint           disp;
  MPI_Comm           stream_comm   = sf->stream_comm;
  MPI_Datatype       unit          = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  if (bas->rootbuflen[PETSCSF_REMOTE]) {
    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
    if (direction == PETSCSF_LEAF2ROOT) {
      for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
        cnt  = rootoffset[i + 1] - rootoffset[i];
        PetscCallMPI(MPIX_Irecv_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
      }
    } else { // PETSCSF_ROOT2LEAF
      for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
        cnt  = rootoffset[i + 1] - rootoffset[i];
        // no need to sync the gpu stream!
        PetscCallMPI(MPIX_Isend_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
      }
    }
  }

  if (sf->leafbuflen[PETSCSF_REMOTE]) {
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
    if (direction == PETSCSF_LEAF2ROOT) {
      for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
        cnt  = leafoffset[i + 1] - leafoffset[i];
        // no need to sync the gpu stream!
        PetscCallMPI(MPIX_Isend_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
      }
    } else { // PETSCSF_ROOT2LEAF
      for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
        cnt  = leafoffset[i + 1] - leafoffset[i];
        PetscCallMPI(MPIX_Irecv_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFLinkFinishCommunication_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  PetscCallMPI(MPIX_Waitall_enqueue(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
  PetscCallMPI(MPIX_Waitall_enqueue(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
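
/*
   A hedged sketch of the MPICH MPIX stream extension assumed above (availability and exact info
   keys vary with the MPICH version; all names below other than the MPI/MPIX calls are
   placeholders): a GPU stream is attached to a communicator once, after which sends/receives and
   waits are enqueued on that stream instead of blocking the host.

     MPIX_Stream  mpi_stream;
     MPI_Comm     stream_comm;
     MPI_Info     info;
     cudaStream_t cstream;  // an existing CUDA stream (assumption)

     MPI_Info_create(&info);
     MPI_Info_set(info, "type", "cudaStream_t");                   // declare the stream type
     MPIX_Info_set_hex(info, "value", &cstream, sizeof(cstream));  // pass the stream handle
     MPIX_Stream_create(info, &mpi_stream);
     MPIX_Stream_comm_create(comm, mpi_stream, &stream_comm);

     MPIX_Isend_enqueue(d_sendbuf, n, MPI_DOUBLE, peer, tag, stream_comm, &req);  // device buffer
     MPIX_Waitall_enqueue(1, &req, MPI_STATUSES_IGNORE);  // completes in stream order
*/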
#endif

static PetscErrorCode PetscSFSetCommunicationOps_Basic(PetscSF sf, PetscSFLink link)
{
  PetscFunctionBegin;
  link->InitMPIRequests    = PetscSFLinkInitMPIRequests_Persistent_Basic;
  link->StartCommunication = PetscSFLinkStartCommunication_Persistent_Basic;
#if defined(PETSC_HAVE_MPIX_STREAM)
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  if (sf->use_stream_aware_mpi && (PetscMemTypeDevice(rootmtype_mpi) || PetscMemTypeDevice(leafmtype_mpi))) {
    link->StartCommunication  = PetscSFLinkStartCommunication_MPIX_Stream;
    link->FinishCommunication = PetscSFLinkFinishCommunication_MPIX_Stream;
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*===================================================================================*/
/*              SF public interface implementations                                  */
/*===================================================================================*/
PETSC_INTERN PetscErrorCode PetscSFSetUp_Basic(PetscSF sf)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscInt      *rlengths, *ilengths, i, nRemoteRootRanks, nRemoteLeafRanks;
  PetscMPIInt    rank, niranks, *iranks, tag;
  MPI_Comm       comm;
  MPI_Group      group;
  MPI_Request   *rootreqs, *leafreqs;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_group(PETSC_COMM_SELF, &group));
  PetscCall(PetscSFSetUpRanks(sf, group));
  PetscCallMPI(MPI_Group_free(&group));
  PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
  PetscCall(PetscObjectGetNewTag((PetscObject)sf, &tag));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  /*
   * Inform roots how many leaves reference them and from which ranks
   */
  PetscCall(PetscMalloc1(sf->nranks, &rlengths));
  /* Determine the number of incoming messages, the sending ranks, and the length of each */
  for (i = 0; i < sf->nranks; i++) { rlengths[i] = sf->roffset[i + 1] - sf->roffset[i]; /* Number of roots referenced by my leaves, for rank sf->ranks[i] */ }
  nRemoteRootRanks = sf->nranks - sf->ndranks;
  PetscCall(PetscCommBuildTwoSided(comm, 1, MPIU_INT, nRemoteRootRanks, PetscSafePointerPlusOffset(sf->ranks, sf->ndranks), PetscSafePointerPlusOffset(rlengths, sf->ndranks), &niranks, &iranks, (void **)&ilengths));

  /* Sort iranks. See the use of VecScatterGetRemoteOrdered_Private() in MatGetBrowsOfAoCols_MPIAIJ() for why this is needed.
     We could instead sort the ranks there, at the price of allocating extra work arrays. Presumably, niranks is
     small and the sorting is cheap.
   */
  PetscCall(PetscSortMPIIntWithIntArray(niranks, iranks, ilengths));

  /* Partition into distinguished and non-distinguished incoming ranks */
  bas->ndiranks = sf->ndranks;
  bas->niranks  = bas->ndiranks + niranks;
  PetscCall(PetscMalloc2(bas->niranks, &bas->iranks, bas->niranks + 1, &bas->ioffset));
  bas->ioffset[0] = 0;
  for (i = 0; i < bas->ndiranks; i++) {
    bas->iranks[i]      = sf->ranks[i];
    bas->ioffset[i + 1] = bas->ioffset[i] + rlengths[i];
  }
  PetscCheck(bas->ndiranks <= 1 && (bas->ndiranks != 1 || bas->iranks[0] == rank), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Broken setup for shared ranks");
  for (; i < bas->niranks; i++) {
    bas->iranks[i]      = iranks[i - bas->ndiranks];
    bas->ioffset[i + 1] = bas->ioffset[i] + ilengths[i - bas->ndiranks];
  }
  bas->itotal = bas->ioffset[i];
  PetscCall(PetscFree(rlengths));
  PetscCall(PetscFree(iranks));
  PetscCall(PetscFree(ilengths));

  /* Send leaf identities to roots */
  nRemoteLeafRanks = bas->niranks - bas->ndiranks;
  PetscCall(PetscMalloc1(bas->itotal, &bas->irootloc));
  PetscCall(PetscMalloc2(nRemoteLeafRanks, &rootreqs, nRemoteRootRanks, &leafreqs));
  for (i = bas->ndiranks; i < bas->niranks; i++) PetscCallMPI(MPIU_Irecv(bas->irootloc + bas->ioffset[i], bas->ioffset[i + 1] - bas->ioffset[i], MPIU_INT, bas->iranks[i], tag, comm, &rootreqs[i - bas->ndiranks]));
  for (i = 0; i < sf->nranks; i++) {
    PetscInt npoints = sf->roffset[i + 1] - sf->roffset[i];
    if (i < sf->ndranks) {
      PetscCheck(sf->ranks[i] == rank, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot interpret distinguished leaf rank");
      PetscCheck(bas->iranks[0] == rank, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot interpret distinguished root rank");
      PetscCheck(npoints == bas->ioffset[1] - bas->ioffset[0], PETSC_COMM_SELF, PETSC_ERR_PLIB, "Distinguished rank exchange has mismatched lengths");
      PetscCall(PetscArraycpy(bas->irootloc + bas->ioffset[0], sf->rremote + sf->roffset[i], npoints));
      continue;
    }
    PetscCallMPI(MPIU_Isend(sf->rremote + sf->roffset[i], npoints, MPIU_INT, sf->ranks[i], tag, comm, &leafreqs[i - sf->ndranks]));
  }
  PetscCallMPI(MPI_Waitall(nRemoteLeafRanks, rootreqs, MPI_STATUSES_IGNORE));
  PetscCallMPI(MPI_Waitall(nRemoteRootRanks, leafreqs, MPI_STATUSES_IGNORE));

  sf->nleafreqs  = nRemoteRootRanks;
  bas->nrootreqs = nRemoteLeafRanks;

  /* Setup fields related to packing, such as rootbuflen[] */
  PetscCall(PetscSFSetUpPackFields(sf));
  PetscCall(PetscFree2(rootreqs, leafreqs));
  PetscFunctionReturn(PETSC_SUCCESS);
}
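
/*
   A minimal sketch (hypothetical data, not part of this file) of the two-sided discovery used
   above: each rank only knows which ranks it references (toranks) and how much it needs from
   each; PetscCommBuildTwoSided() then tells it which ranks reference it (fromranks) and the
   matching lengths, without a full Alltoall.

     PetscMPIInt nto = 2, toranks[2] = {3, 7};  // ranks owning roots my leaves reference (assumption)
     PetscInt    tolen[2] = {5, 1};             // how many of their roots I reference
     PetscMPIInt nfrom, *fromranks;
     PetscInt   *fromlen;

     PetscCall(PetscCommBuildTwoSided(comm, 1, MPIU_INT, nto, toranks, tolen, &nfrom, &fromranks, (void **)&fromlen));
     // fromranks[i] has leaves referencing fromlen[i] of my roots
     PetscCall(PetscFree(fromranks));
     PetscCall(PetscFree(fromlen));
*/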

PETSC_INTERN PetscErrorCode PetscSFReset_Basic(PetscSF sf)
{
  PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
  PetscSFLink    link = bas->avail, next;

  PetscFunctionBegin;
  PetscCheck(!bas->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Outstanding operation has not been completed");
  PetscCall(PetscFree2(bas->iranks, bas->ioffset));
  PetscCall(PetscFree(bas->irootloc));

#if defined(PETSC_HAVE_DEVICE)
  for (PetscInt i = 0; i < 2; i++) PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, bas->irootloc_d[i]));
#endif

#if defined(PETSC_HAVE_NVSHMEM)
  PetscCall(PetscSFReset_Basic_NVSHMEM(sf));
#endif

  for (; link; link = next) {
    next = link->next;
    PetscCall(PetscSFLinkDestroy(sf, link));
  }
  bas->avail = NULL;
  PetscCall(PetscSFResetPackFields(sf));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFDestroy_Basic(PetscSF sf)
{
  PetscFunctionBegin;
  PetscCall(PetscSFReset_Basic(sf));
  PetscCall(PetscFree(sf->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_USE_SINGLE_LIBRARY)
  #include <petscmat.h>

PETSC_INTERN PetscErrorCode PetscSFView_Basic_PatternAndSizes(PetscSF sf, PetscViewer viewer)
{
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscInt           i, nrootranks, ndrootranks;
  const PetscInt    *rootoffset;
  PetscMPIInt        rank, size;
  const PetscMPIInt *rootranks;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  PetscScalar        unitbytes;
  Mat                A;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  /* PetscSFView is most useful for the SF used in VecScatterBegin/End in MatMult etc., where we do
    PetscSFBcast, i.e., roots send data to leaves. We dump the communication pattern into a matrix
    from the senders' point of view: how many bytes I will send to each of my neighbors.

    Looking at a column of the matrix, one can also see how many bytes a rank will receive from the others.

    If a PetscSFLink bas->inuse were available, we could use it to get the size of each tree vertex. But that would give
    different interpretations of the same SF for different data types. Since we care most about VecScatter,
    we uniformly treat each vertex as a PetscScalar.
  */
  unitbytes = (PetscScalar)sizeof(PetscScalar);

  PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, &rootranks, &rootoffset, NULL));
  PetscCall(MatCreateAIJ(comm, 1, 1, size, size, 1, NULL, nrootranks - ndrootranks, NULL, &A));
  PetscCall(MatSetOptionsPrefix(A, "__petsc_internal__")); /* To prevent the internal A from taking any command line options */
  for (i = 0; i < nrootranks; i++) PetscCall(MatSetValue(A, (PetscInt)rank, bas->iranks[i], (rootoffset[i + 1] - rootoffset[i]) * unitbytes, INSERT_VALUES));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatView(A, viewer));
  PetscCall(MatDestroy(&A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

PETSC_INTERN PetscErrorCode PetscSFView_Basic(PetscSF sf, PetscViewer viewer)
{
  PetscBool isascii;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
  if (isascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) PetscCall(PetscViewerASCIIPrintf(viewer, "  MultiSF sort=%s\n", sf->rankorder ? "rank-order" : "unordered"));
#if defined(PETSC_USE_SINGLE_LIBRARY)
  else {
    PetscBool isdraw, isbinary;
    PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
    PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
    if ((isascii && viewer->format == PETSC_VIEWER_ASCII_MATLAB) || isdraw || isbinary) PetscCall(PetscSFView_Basic_PatternAndSizes(sf, viewer));
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
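
/*
   A small usage sketch: viewing an SF with the default ASCII viewer prints the one-line summary
   above, while pushing PETSC_VIEWER_ASCII_MATLAB (or using a draw/binary viewer, when built with
   PETSC_USE_SINGLE_LIBRARY) dumps the per-rank communication-volume matrix built by
   PetscSFView_Basic_PatternAndSizes().

     PetscCall(PetscSFView(sf, PETSC_VIEWER_STDOUT_WORLD));  // summary
     PetscCall(PetscViewerPushFormat(PETSC_VIEWER_STDOUT_WORLD, PETSC_VIEWER_ASCII_MATLAB));
     PetscCall(PetscSFView(sf, PETSC_VIEWER_STDOUT_WORLD));  // pattern-and-sizes matrix
     PetscCall(PetscViewerPopFormat(PETSC_VIEWER_STDOUT_WORLD));
*/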

PETSC_INTERN PetscErrorCode PetscSFBcastBegin_Basic(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  /* Create a communication link, which provides buffers, MPI requests etc (if MPI is used) */
  PetscCall(PetscSFLinkCreate(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, PETSCSF_BCAST, &link));
  /* Pack rootdata to rootbuf for remote communication */
  PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_REMOTE, rootdata));
  /* Start communication, e.g., post MPI_Isend */
  PetscCall(PetscSFLinkStartCommunication(sf, link, PETSCSF_ROOT2LEAF));
  /* Do local scatter (i.e., self to self communication), which overlaps with the remote communication above */
  PetscCall(PetscSFLinkScatterLocal(sf, link, PETSCSF_ROOT2LEAF, (void *)rootdata, leafdata, op));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFBcastEnd_Basic(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  /* Retrieve the link used in XxxBegin() with root/leafdata as key */
  PetscCall(PetscSFLinkGetInUse(sf, unit, rootdata, leafdata, PETSC_OWN_POINTER, &link));
  /* Finish remote communication, e.g., post MPI_Waitall */
  PetscCall(PetscSFLinkFinishCommunication(sf, link, PETSCSF_ROOT2LEAF));
  /* Unpack data in leafbuf to leafdata for remote communication */
  PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_REMOTE, leafdata, op));
  /* Recycle the link */
  PetscCall(PetscSFLinkReclaim(sf, &link));
  PetscFunctionReturn(PETSC_SUCCESS);
}
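
/*
   Typical user-level call sequence for the broadcast implemented above: Begin() packs and starts
   the sends, unrelated local work may overlap with the communication, and End() waits and
   unpacks. rootdata/leafdata are placeholders of the unit type (here PetscScalar).

     PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));
     // ... computation not involving leafdata can overlap with the communication ...
     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));
*/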

/* Shared by ReduceBegin and FetchAndOpBegin */
static inline PetscErrorCode PetscSFLeafToRootBegin_Basic(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *out)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  PetscCall(PetscSFLinkCreate(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, &link));
  PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_REMOTE, leafdata));
  PetscCall(PetscSFLinkStartCommunication(sf, link, PETSCSF_LEAF2ROOT));
  *out = link;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* leaf -> root with reduction */
PETSC_INTERN PetscErrorCode PetscSFReduceBegin_Basic(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  PetscCall(PetscSFLeafToRootBegin_Basic(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op, PETSCSF_REDUCE, &link));
  PetscCall(PetscSFLinkScatterLocal(sf, link, PETSCSF_LEAF2ROOT, rootdata, (void *)leafdata, op));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFReduceEnd_Basic(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  PetscCall(PetscSFLinkGetInUse(sf, unit, rootdata, leafdata, PETSC_OWN_POINTER, &link));
  PetscCall(PetscSFLinkFinishCommunication(sf, link, PETSCSF_LEAF2ROOT));
  PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_REMOTE, rootdata, op));
  PetscCall(PetscSFLinkReclaim(sf, &link));
  PetscFunctionReturn(PETSC_SUCCESS);
}
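
/*
   The matching user-level reduce: leaf values are combined into their roots with the given MPI_Op,
   here a sum. leafdata/rootdata are placeholders of the unit type.

     PetscCall(PetscSFReduceBegin(sf, MPIU_SCALAR, leafdata, rootdata, MPI_SUM));
     PetscCall(PetscSFReduceEnd(sf, MPIU_SCALAR, leafdata, rootdata, MPI_SUM));
*/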

PETSC_INTERN PetscErrorCode PetscSFFetchAndOpBegin_Basic(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  PetscCall(PetscSFLeafToRootBegin_Basic(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op, PETSCSF_FETCH, &link));
  PetscCall(PetscSFLinkFetchAndOpLocal(sf, link, rootdata, leafdata, leafupdate, op));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFFetchAndOpEnd_Basic(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  PetscCall(PetscSFLinkGetInUse(sf, unit, rootdata, leafdata, PETSC_OWN_POINTER, &link));
  /* This implementation could be changed to unpack as receives arrive, at the cost of non-determinism */
  PetscCall(PetscSFLinkFinishCommunication(sf, link, PETSCSF_LEAF2ROOT));
  /* Do fetch-and-op, the (remote) update results are in rootbuf */
  PetscCall(PetscSFLinkFetchAndOpRemote(sf, link, rootdata, op));
  /* Bcast rootbuf to leafupdate */
  PetscCall(PetscSFLinkStartCommunication(sf, link, PETSCSF_ROOT2LEAF));
  PetscCall(PetscSFLinkFinishCommunication(sf, link, PETSCSF_ROOT2LEAF));
  /* Unpack and insert fetched data into leaves */
  PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_REMOTE, leafupdate, MPI_REPLACE));
  PetscCall(PetscSFLinkReclaim(sf, &link));
  PetscFunctionReturn(PETSC_SUCCESS);
}
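
/*
   User-level fetch-and-op sketch: each leaf contributes to its root with op while fetching the
   root's previous value into leafupdate, e.g., to hand out consecutive numbers among leaves
   sharing a root. rootdata/leafdata/leafupdate are placeholders.

     PetscCall(PetscSFFetchAndOpBegin(sf, MPIU_INT, rootdata, leafdata, leafupdate, MPI_SUM));
     PetscCall(PetscSFFetchAndOpEnd(sf, MPIU_INT, rootdata, leafdata, leafupdate, MPI_SUM));
     // leafupdate[i] now holds the root value as it was before this leaf's contribution was added
*/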

PETSC_INTERN PetscErrorCode PetscSFGetLeafRanks_Basic(PetscSF sf, PetscInt *niranks, const PetscMPIInt **iranks, const PetscInt **ioffset, const PetscInt **irootloc)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  PetscFunctionBegin;
  if (niranks) *niranks = bas->niranks;
  if (iranks) *iranks = bas->iranks;
  if (ioffset) *ioffset = bas->ioffset;
  if (irootloc) *irootloc = bas->irootloc;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* An optimized PetscSFCreateEmbeddedRootSF. We aggressively reuse the communication pattern already established on sf:
   a single bcast on sf suffices, and no further communication is needed to build the embedded sf. Note that selected[]
   was sorted before this routine is called.
 */
PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedRootSF_Basic(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *newsf)
{
  PetscSF            esf;
  PetscInt           esf_nranks, esf_ndranks, *esf_roffset, *esf_rmine, *esf_rremote;
  PetscInt           i, j, p, q, nroots, esf_nleaves, *new_ilocal, nranks, ndranks, niranks, ndiranks, minleaf, maxleaf, maxlocal;
  char              *rootdata, *leafdata, *leafmem; /* Only stores 0 or 1, so we can save memory with char */
  PetscMPIInt       *esf_ranks;
  const PetscMPIInt *ranks, *iranks;
  const PetscInt    *roffset, *rmine, *rremote, *ioffset, *irootloc;
  PetscBool          connected;
  PetscSFNode       *new_iremote;
  PetscSF_Basic     *bas;

  PetscFunctionBegin;
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sf), &esf));
  PetscCall(PetscSFSetFromOptions(esf));
  PetscCall(PetscSFSetType(esf, PETSCSFBASIC)); /* This optimized routine can only create a basic sf */

  /* Find out which leaves are still connected to roots in the embedded sf by doing a Bcast */
  PetscCall(PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL));
  PetscCall(PetscSFGetLeafRange(sf, &minleaf, &maxleaf));
  maxlocal = maxleaf - minleaf + 1;
  PetscCall(PetscCalloc2(nroots, &rootdata, maxlocal, &leafmem));
  leafdata = PetscSafePointerPlusOffset(leafmem, -minleaf);
  /* Tag selected roots */
  for (i = 0; i < nselected; ++i) rootdata[selected[i]] = 1;

  PetscCall(PetscSFBcastBegin(sf, MPI_CHAR, rootdata, leafdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPI_CHAR, rootdata, leafdata, MPI_REPLACE));
  PetscCall(PetscSFGetLeafInfo_Basic(sf, &nranks, &ndranks, &ranks, &roffset, &rmine, &rremote)); /* Get send info */
  esf_nranks = esf_ndranks = esf_nleaves = 0;
  for (i = 0; i < nranks; i++) {
    connected = PETSC_FALSE; /* Is this process still connected to this remote root rank? */
    for (j = roffset[i]; j < roffset[i + 1]; j++) {
      if (leafdata[rmine[j]]) {
        esf_nleaves++;
        connected = PETSC_TRUE;
      }
    }
    if (connected) {
      esf_nranks++;
      if (i < ndranks) esf_ndranks++;
    }
  }

  /* Set graph of esf and also set up its outgoing communication (i.e., send info), which is usually done by PetscSFSetUpRanks */
  PetscCall(PetscMalloc1(esf_nleaves, &new_ilocal));
  PetscCall(PetscMalloc1(esf_nleaves, &new_iremote));
  PetscCall(PetscMalloc4(esf_nranks, &esf_ranks, esf_nranks + 1, &esf_roffset, esf_nleaves, &esf_rmine, esf_nleaves, &esf_rremote));
  p              = 0; /* Counter for connected root ranks */
  q              = 0; /* Counter for connected leaves */
  esf_roffset[0] = 0;
  for (i = 0; i < nranks; i++) { /* Scan leaf data again to fill esf arrays */
    connected = PETSC_FALSE;
    for (j = roffset[i]; j < roffset[i + 1]; j++) {
      if (leafdata[rmine[j]]) {
        esf_rmine[q] = new_ilocal[q] = rmine[j];
        esf_rremote[q]               = rremote[j];
        new_iremote[q].index         = rremote[j];
        new_iremote[q].rank          = ranks[i];
        connected                    = PETSC_TRUE;
        q++;
      }
    }
    if (connected) {
      esf_ranks[p]       = ranks[i];
      esf_roffset[p + 1] = q;
      p++;
    }
  }

  /* SetGraph internally resets the SF, so we only set its fields after the call */
  PetscCall(PetscSFSetGraph(esf, nroots, esf_nleaves, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
  esf->nranks    = esf_nranks;
  esf->ndranks   = esf_ndranks;
  esf->ranks     = esf_ranks;
  esf->roffset   = esf_roffset;
  esf->rmine     = esf_rmine;
  esf->rremote   = esf_rremote;
  esf->nleafreqs = esf_nranks - esf_ndranks;

  /* Set up the incoming communication (i.e., recv info) stored in esf->data, which is usually done by PetscSFSetUp_Basic */
  bas = (PetscSF_Basic *)esf->data;
  PetscCall(PetscSFGetRootInfo_Basic(sf, &niranks, &ndiranks, &iranks, &ioffset, &irootloc)); /* Get recv info */
  /* The embedded sf always has simpler communication than the original one. We might allocate longer arrays than needed here,
     but we do not care since these arrays are usually short. The benefit is that we can fill these arrays by parsing irootloc just once.
   */
  PetscCall(PetscMalloc2(niranks, &bas->iranks, niranks + 1, &bas->ioffset));
  PetscCall(PetscMalloc1(ioffset[niranks], &bas->irootloc));
  bas->niranks = bas->ndiranks = bas->ioffset[0] = 0;
  p                                              = 0; /* Counter for connected leaf ranks */
  q                                              = 0; /* Counter for connected roots */
  for (i = 0; i < niranks; i++) {
    connected = PETSC_FALSE; /* Is the current process still connected to this remote leaf rank? */
    for (j = ioffset[i]; j < ioffset[i + 1]; j++) {
      if (rootdata[irootloc[j]]) {
        bas->irootloc[q++] = irootloc[j];
        connected          = PETSC_TRUE;
      }
    }
    if (connected) {
      bas->niranks++;
      if (i < ndiranks) bas->ndiranks++; /* Note that order of ranks (including distinguished ranks) is kept */
      bas->iranks[p]      = iranks[i];
      bas->ioffset[p + 1] = q;
      p++;
    }
  }
  bas->itotal     = q;
  bas->nrootreqs  = bas->niranks - bas->ndiranks;
  esf->persistent = PETSC_TRUE;
  /* Setup packing related fields */
  PetscCall(PetscSFSetUpPackFields(esf));

  /* Copied from PetscSFSetUp(), since this routine skips PetscSFSetUp(). */
#if defined(PETSC_HAVE_CUDA)
  if (esf->backend == PETSCSF_BACKEND_CUDA) {
    esf->ops->Malloc = PetscSFMalloc_CUDA;
    esf->ops->Free   = PetscSFFree_CUDA;
  }
#endif

#if defined(PETSC_HAVE_HIP)
  /* TODO: Needs debugging */
  if (esf->backend == PETSCSF_BACKEND_HIP) {
    esf->ops->Malloc = PetscSFMalloc_HIP;
    esf->ops->Free   = PetscSFFree_HIP;
  }
#endif

#if defined(PETSC_HAVE_KOKKOS)
  if (esf->backend == PETSCSF_BACKEND_KOKKOS) {
    esf->ops->Malloc = PetscSFMalloc_Kokkos;
    esf->ops->Free   = PetscSFFree_Kokkos;
  }
#endif
  esf->setupcalled = PETSC_TRUE; /* We have done setup ourselves! */
  PetscCall(PetscFree2(rootdata, leafmem));
  *newsf = esf;
  PetscFunctionReturn(PETSC_SUCCESS);
}
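
/*
   Usage sketch for the embedded-SF optimization above: selected[] lists the (sorted) root indices
   to keep, and the resulting esf communicates only with ranks still connected to those roots.
   The indices below are hypothetical.

     PetscSF  esf;
     PetscInt selected[3] = {0, 5, 9};  // sorted root indices to retain (assumption)
     PetscCall(PetscSFCreateEmbeddedRootSF(sf, 3, selected, &esf));
     // esf can now be used for Bcast/Reduce restricted to the selected roots
     PetscCall(PetscSFDestroy(&esf));
*/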

PETSC_EXTERN PetscErrorCode PetscSFCreate_Basic(PetscSF sf)
{
  PetscSF_Basic *dat;

  PetscFunctionBegin;
  sf->ops->SetUp                = PetscSFSetUp_Basic;
  sf->ops->Reset                = PetscSFReset_Basic;
  sf->ops->Destroy              = PetscSFDestroy_Basic;
  sf->ops->View                 = PetscSFView_Basic;
  sf->ops->BcastBegin           = PetscSFBcastBegin_Basic;
  sf->ops->BcastEnd             = PetscSFBcastEnd_Basic;
  sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
  sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
  sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
  sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
  sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
  sf->ops->CreateEmbeddedRootSF = PetscSFCreateEmbeddedRootSF_Basic;
  sf->ops->SetCommunicationOps  = PetscSFSetCommunicationOps_Basic;

  sf->persistent = PETSC_TRUE; // currently SFBASIC always uses persistent send/recv
  sf->collective = PETSC_FALSE;

  PetscCall(PetscNew(&dat));
  sf->data = (void *)dat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
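
/*
   Minimal sketch of creating and using an SF of this type (graph values are placeholders): a
   single leaf on each rank referencing root 0 on rank 0.

     PetscSF     sf;
     PetscSFNode remote;
     remote.rank  = 0;  // owner rank of the referenced root
     remote.index = 0;  // root index on that rank
     PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf));
     PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
     PetscCall(PetscSFSetGraph(sf, 1, 1, NULL, PETSC_COPY_VALUES, &remote, PETSC_COPY_VALUES));
     PetscCall(PetscSFSetUp(sf));  // ends up in PetscSFSetUp_Basic() above
     PetscCall(PetscSFDestroy(&sf));
*/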