#include <../src/vec/is/sf/impls/basic/sfbasic.h>
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <petsc/private/viewerimpl.h>

// Init persistent MPI send/recv requests
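// One persistent request is created per non-distinguished (i.e., truly remote) root/leaf rank; distinguished
// (self) ranks are handled by local scatters and are skipped, which is why displacements are measured from
// rootoffset[ndrootranks] (resp. leafoffset[ndleafranks]), the start of the remote region of the buffer.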
static PetscErrorCode PetscSFLinkInitMPIRequests_Persistent_Basic(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscInt           cnt;
  PetscMPIInt        nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt    *rootoffset, *leafoffset;
  MPI_Aint           disp;
  MPI_Comm           comm          = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit          = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  if (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
    if (direction == PETSCSF_LEAF2ROOT) {
      for (PetscMPIInt i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
        cnt  = rootoffset[i + 1] - rootoffset[i];
        PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
      }
    } else { /* PETSCSF_ROOT2LEAF */
      for (PetscMPIInt i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
        cnt  = rootoffset[i + 1] - rootoffset[i];
        PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
      }
    }
    link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
  }

  if (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
    if (direction == PETSCSF_LEAF2ROOT) {
      for (PetscMPIInt i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
        cnt  = leafoffset[i + 1] - leafoffset[i];
        PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
      }
    } else { /* PETSCSF_ROOT2LEAF */
      for (PetscMPIInt i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
        cnt  = leafoffset[i + 1] - leafoffset[i];
        PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
      }
    }
    link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Start MPI requests. If using non-GPU-aware MPI, we may need to copy data from device buffers to host buffers first
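// Receives are started before sends. The persistent requests themselves were created (on first use) in
// PetscSFLinkInitMPIRequests_Persistent_Basic(); here they are only (re)started.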
static PetscErrorCode PetscSFLinkStartCommunication_Persistent_Basic(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscMPIInt    nsreqs = 0, nrreqs = 0;
  MPI_Request   *sreqs = NULL, *rreqs = NULL;
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscInt       sbuflen, rbuflen;

  PetscFunctionBegin;
  rbuflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
  if (rbuflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nrreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &rreqs));
    } else { /* leaf to root */
      nrreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &rreqs, NULL));
    }
  }

  sbuflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
  if (sbuflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nsreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &sreqs, NULL));
    } else { /* leaf to root */
      nsreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &sreqs));
    }
  }
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link)); // need to sync the stream to make BOTH sendbuf and recvbuf ready
  if (rbuflen) PetscCallMPI(MPI_Startall_irecv(rbuflen, link->unit, nrreqs, rreqs));
  if (sbuflen) PetscCallMPI(MPI_Startall_isend(sbuflen, link->unit, nsreqs, sreqs));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_MPIX_STREAM)
// Issue MPIX_Isend_enqueue()/MPIX_Irecv_enqueue() on the SF's stream communicator
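// The *_enqueue operations are ordered on the GPU stream backing sf->stream_comm, after the packing kernels
// that fill the buffers on that stream, so no host-side stream synchronization is needed before calling MPI here.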
static PetscErrorCode PetscSFLinkStartCommunication_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscInt           i, j;
  PetscMPIInt        nrootranks, ndrootranks, nleafranks, ndleafranks, cnt;
  const PetscInt    *rootoffset, *leafoffset;
  MPI_Aint           disp;
  MPI_Comm           stream_comm   = sf->stream_comm;
  MPI_Datatype       unit          = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  if (bas->rootbuflen[PETSCSF_REMOTE]) {
    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
    if (direction == PETSCSF_LEAF2ROOT) {
      for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
        cnt  = (PetscMPIInt)(rootoffset[i + 1] - rootoffset[i]);
        PetscCallMPI(MPIX_Irecv_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
      }
    } else { // PETSCSF_ROOT2LEAF
      for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
        cnt  = (PetscMPIInt)(rootoffset[i + 1] - rootoffset[i]);
        // no need to sync the gpu stream!
        PetscCallMPI(MPIX_Isend_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
      }
    }
  }

  if (sf->leafbuflen[PETSCSF_REMOTE]) {
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
    if (direction == PETSCSF_LEAF2ROOT) {
      for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
        cnt  = (PetscMPIInt)(leafoffset[i + 1] - leafoffset[i]);
        // no need to sync the gpu stream!
        PetscCallMPI(MPIX_Isend_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
      }
    } else { // PETSCSF_ROOT2LEAF
      for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
        cnt  = (PetscMPIInt)(leafoffset[i + 1] - leafoffset[i]);
        PetscCallMPI(MPIX_Irecv_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

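// Wait for the enqueued sends/recvs; MPIX_Waitall_enqueue() enqueues the wait on the stream as well, so the
// host does not block and later kernels on that stream see completed buffers.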
static PetscErrorCode PetscSFLinkFinishCommunication_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  PetscCallMPI(MPIX_Waitall_enqueue(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
  PetscCallMPI(MPIX_Waitall_enqueue(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode PetscSFSetCommunicationOps_Basic(PetscSF sf, PetscSFLink link)
{
  PetscFunctionBegin;
  link->InitMPIRequests    = PetscSFLinkInitMPIRequests_Persistent_Basic;
  link->StartCommunication = PetscSFLinkStartCommunication_Persistent_Basic;
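  /* With stream-aware MPI and device buffers, override the defaults so that sends/recvs/waits are enqueued
     on the GPU stream instead of using the persistent host-side requests. */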
#if defined(PETSC_HAVE_MPIX_STREAM)
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  if (sf->use_stream_aware_mpi && (PetscMemTypeDevice(rootmtype_mpi) || PetscMemTypeDevice(leafmtype_mpi))) {
    link->StartCommunication  = PetscSFLinkStartCommunication_MPIX_Stream;
    link->FinishCommunication = PetscSFLinkFinishCommunication_MPIX_Stream;
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*===================================================================================*/
/*              SF public interface implementations                                  */
/*===================================================================================*/
PETSC_INTERN PetscErrorCode PetscSFSetUp_Basic(PetscSF sf)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
  PetscInt      *rlengths, *ilengths;
  PetscMPIInt    nRemoteRootRanks, nRemoteLeafRanks;
  PetscMPIInt    rank, niranks, *iranks, tag;
  MPI_Comm       comm;
  MPI_Group      group;
  MPI_Request   *rootreqs, *leafreqs;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_group(PETSC_COMM_SELF, &group));
  PetscCall(PetscSFSetUpRanks(sf, group));
  PetscCallMPI(MPI_Group_free(&group));
  PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
  PetscCall(PetscObjectGetNewTag((PetscObject)sf, &tag));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  /*
   * Inform roots how many leaves reference them and from which ranks
   */
  PetscCall(PetscMalloc1(sf->nranks, &rlengths));
  /* Determine the number of incoming messages, their sending ranks, and their lengths */
  for (PetscMPIInt i = 0; i < sf->nranks; i++) rlengths[i] = sf->roffset[i + 1] - sf->roffset[i]; /* Number of roots referenced by my leaves; for rank sf->ranks[i] */
  nRemoteRootRanks = sf->nranks - sf->ndranks;
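  /* A sparse two-sided exchange: each rank tells the owner of its remote roots how many root points it references
     (rlengths[]); on return, iranks[]/ilengths[] list the leaf ranks that reference my roots and how many entries
     each of them will send. */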
  PetscCall(PetscCommBuildTwoSided(comm, 1, MPIU_INT, nRemoteRootRanks, PetscSafePointerPlusOffset(sf->ranks, sf->ndranks), PetscSafePointerPlusOffset(rlengths, sf->ndranks), &niranks, &iranks, (void **)&ilengths));

  /* Sort iranks. See the use of VecScatterGetRemoteOrdered_Private() in MatGetBrowsOfAoCols_MPIAIJ() for why.
     We could instead sort the ranks there, at the price of allocating extra work arrays. Presumably niranks is
     small and the sorting is cheap.
   */
  PetscCall(PetscSortMPIIntWithIntArray(niranks, iranks, ilengths));

  /* Partition into distinguished and non-distinguished incoming ranks */
  bas->ndiranks = sf->ndranks;
  bas->niranks  = bas->ndiranks + niranks;
  PetscCall(PetscMalloc2(bas->niranks, &bas->iranks, bas->niranks + 1, &bas->ioffset));
  bas->ioffset[0] = 0;
  for (PetscMPIInt i = 0; i < bas->ndiranks; i++) {
    bas->iranks[i]      = sf->ranks[i];
    bas->ioffset[i + 1] = bas->ioffset[i] + rlengths[i];
  }
  PetscCheck(bas->ndiranks <= 1 && (bas->ndiranks != 1 || bas->iranks[0] == rank), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Broken setup for shared ranks");
  for (PetscMPIInt i = bas->ndiranks; i < bas->niranks; i++) {
    bas->iranks[i]      = iranks[i - bas->ndiranks];
    bas->ioffset[i + 1] = bas->ioffset[i] + ilengths[i - bas->ndiranks];
  }
  bas->itotal = bas->ioffset[bas->niranks];
  PetscCall(PetscFree(rlengths));
  PetscCall(PetscFree(iranks));
  PetscCall(PetscFree(ilengths));

  /* Send leaf identities to roots */
  nRemoteLeafRanks = bas->niranks - bas->ndiranks;
  PetscCall(PetscMalloc1(bas->itotal, &bas->irootloc));
  PetscCall(PetscMalloc2(nRemoteLeafRanks, &rootreqs, nRemoteRootRanks, &leafreqs));
  for (PetscMPIInt i = bas->ndiranks; i < bas->niranks; i++) PetscCallMPI(MPIU_Irecv(bas->irootloc + bas->ioffset[i], bas->ioffset[i + 1] - bas->ioffset[i], MPIU_INT, bas->iranks[i], tag, comm, &rootreqs[i - bas->ndiranks]));
  for (PetscMPIInt i = 0; i < sf->nranks; i++) {
    PetscInt npoints = sf->roffset[i + 1] - sf->roffset[i];
    if (i < sf->ndranks) {
      PetscCheck(sf->ranks[i] == rank, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot interpret distinguished leaf rank");
      PetscCheck(bas->iranks[0] == rank, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot interpret distinguished root rank");
      PetscCheck(npoints == bas->ioffset[1] - bas->ioffset[0], PETSC_COMM_SELF, PETSC_ERR_PLIB, "Distinguished rank exchange has mismatched lengths");
      PetscCall(PetscArraycpy(bas->irootloc + bas->ioffset[0], sf->rremote + sf->roffset[i], npoints));
      continue;
    }
    PetscCallMPI(MPIU_Isend(sf->rremote + sf->roffset[i], npoints, MPIU_INT, sf->ranks[i], tag, comm, &leafreqs[i - sf->ndranks]));
  }
  PetscCallMPI(MPI_Waitall(nRemoteLeafRanks, rootreqs, MPI_STATUSES_IGNORE));
  PetscCallMPI(MPI_Waitall(nRemoteRootRanks, leafreqs, MPI_STATUSES_IGNORE));

  sf->nleafreqs  = nRemoteRootRanks;
  bas->nrootreqs = nRemoteLeafRanks;
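  /* For a Bcast (root->leaf) the root side posts nrootreqs sends and the leaf side posts nleafreqs receives;
     the roles are swapped for a Reduce (leaf->root). */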

  /* Setup fields related to packing, such as rootbuflen[] */
  PetscCall(PetscSFSetUpPackFields(sf));
  PetscCall(PetscFree2(rootreqs, leafreqs));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFReset_Basic(PetscSF sf)
{
  PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
  PetscSFLink    link = bas->avail, next;

  PetscFunctionBegin;
  PetscCheck(!bas->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Outstanding operation has not been completed");
  PetscCall(PetscFree2(bas->iranks, bas->ioffset));
  PetscCall(PetscFree(bas->irootloc));

#if defined(PETSC_HAVE_DEVICE)
  for (int i = 0; i < 2; i++) PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, bas->irootloc_d[i]));
#endif

#if defined(PETSC_HAVE_NVSHMEM)
  PetscCall(PetscSFReset_Basic_NVSHMEM(sf));
#endif

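  /* Destroy all cached communication links (their buffers and MPI requests) */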
  for (; link; link = next) {
    next = link->next;
    PetscCall(PetscSFLinkDestroy(sf, link));
  }
  bas->avail = NULL;
  PetscCall(PetscSFResetPackFields(sf));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFDestroy_Basic(PetscSF sf)
{
  PetscFunctionBegin;
  PetscCall(PetscSFReset_Basic(sf));
  PetscCall(PetscFree(sf->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_USE_SINGLE_LIBRARY)
  #include <petscmat.h>

PETSC_INTERN PetscErrorCode PetscSFView_Basic_PatternAndSizes(PetscSF sf, PetscViewer viewer)
{
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscMPIInt        nrootranks, ndrootranks;
  const PetscInt    *rootoffset;
  PetscMPIInt        rank, size;
  const PetscMPIInt *rootranks;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  PetscScalar        unitbytes;
  Mat                A;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  /* PetscSFView is most useful for the SF used in VecScatterBegin/End in MatMult etc, where we do
     PetscSFBcast, i.e., roots send data to leaves. We dump the communication pattern into a matrix
     from the senders' point of view: how many bytes I will send to each of my neighbors.

     Looking at a column of the matrix, one can also see how many bytes a rank will receive from the others.

     If a PetscSFLink in bas->inuse were available, we could use it to get the tree vertex size. But that would give
     different interpretations of the same SF for different data types. Since we care most about VecScatter,
     we uniformly treat each vertex as a PetscScalar.
  */
  unitbytes = (PetscScalar)sizeof(PetscScalar);

  PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, &rootranks, &rootoffset, NULL));
  PetscCall(MatCreateAIJ(comm, 1, 1, size, size, 1, NULL, nrootranks - ndrootranks, NULL, &A));
  PetscCall(MatSetOptionsPrefix(A, "__petsc_internal__")); /* To prevent the internal A from taking any command line options */
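  /* Row `rank` of A gets one entry per connected leaf rank: the number of bytes this rank's roots would send to
     that rank during a Bcast */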
  for (PetscMPIInt i = 0; i < nrootranks; i++) PetscCall(MatSetValue(A, rank, bas->iranks[i], (rootoffset[i + 1] - rootoffset[i]) * unitbytes, INSERT_VALUES));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatView(A, viewer));
  PetscCall(MatDestroy(&A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

PETSC_INTERN PetscErrorCode PetscSFView_Basic(PetscSF sf, PetscViewer viewer)
{
  PetscBool isascii;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
  if (isascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) PetscCall(PetscViewerASCIIPrintf(viewer, "  MultiSF sort=%s\n", sf->rankorder ? "rank-order" : "unordered"));
#if defined(PETSC_USE_SINGLE_LIBRARY)
  else {
    PetscBool isdraw, isbinary;
    PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
    PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
    if ((isascii && viewer->format == PETSC_VIEWER_ASCII_MATLAB) || isdraw || isbinary) PetscCall(PetscSFView_Basic_PatternAndSizes(sf, viewer));
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

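/* Reached through the public API (a minimal usage sketch): PetscSFBcastBegin(sf, unit, rootdata, leafdata, op)
   dispatches here for SFBASIC; a matching PetscSFBcastEnd() with the same rootdata/leafdata must follow, since
   that pair is the key used by PetscSFLinkGetInUse() to retrieve the link. */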
PETSC_INTERN PetscErrorCode PetscSFBcastBegin_Basic(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  /* Create a communication link, which provides buffers, MPI requests etc (if MPI is used) */
  PetscCall(PetscSFLinkCreate(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, PETSCSF_BCAST, &link));
  /* Pack rootdata to rootbuf for remote communication */
  PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_REMOTE, rootdata));
  /* Start communication, e.g., post MPIU_Isend */
  PetscCall(PetscSFLinkStartCommunication(sf, link, PETSCSF_ROOT2LEAF));
  /* Do local scatter (i.e., self to self communication), which overlaps with the remote communication above */
  PetscCall(PetscSFLinkScatterLocal(sf, link, PETSCSF_ROOT2LEAF, (void *)rootdata, leafdata, op));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFBcastEnd_Basic(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  /* Retrieve the link used in XxxBegin() with root/leafdata as key */
  PetscCall(PetscSFLinkGetInUse(sf, unit, rootdata, leafdata, PETSC_OWN_POINTER, &link));
  /* Finish remote communication, e.g., post MPI_Waitall */
  PetscCall(PetscSFLinkFinishCommunication(sf, link, PETSCSF_ROOT2LEAF));
  /* Unpack the leafbuf, filled by the remote communication, into leafdata */
  PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_REMOTE, leafdata, op));
  /* Recycle the link */
  PetscCall(PetscSFLinkReclaim(sf, &link));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Shared by ReduceBegin and FetchAndOpBegin */
static inline PetscErrorCode PetscSFLeafToRootBegin_Basic(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *out)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  PetscCall(PetscSFLinkCreate(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, &link));
  PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_REMOTE, leafdata));
  PetscCall(PetscSFLinkStartCommunication(sf, link, PETSCSF_LEAF2ROOT));
  *out = link;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* leaf -> root with reduction */
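/* The leaf->root analogue of Bcast: pack leaf data, start leaf->root communication, and overlap the local
   scatter; PetscSFReduceEnd() then waits and unpacks into rootdata with op. */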
PETSC_INTERN PetscErrorCode PetscSFReduceBegin_Basic(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  PetscCall(PetscSFLeafToRootBegin_Basic(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op, PETSCSF_REDUCE, &link));
  PetscCall(PetscSFLinkScatterLocal(sf, link, PETSCSF_LEAF2ROOT, rootdata, (void *)leafdata, op));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFReduceEnd_Basic(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  PetscCall(PetscSFLinkGetInUse(sf, unit, rootdata, leafdata, PETSC_OWN_POINTER, &link));
  PetscCall(PetscSFLinkFinishCommunication(sf, link, PETSCSF_LEAF2ROOT));
  PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_REMOTE, rootdata, op));
  PetscCall(PetscSFLinkReclaim(sf, &link));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFFetchAndOpBegin_Basic(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  PetscCall(PetscSFLeafToRootBegin_Basic(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op, PETSCSF_FETCH, &link));
  PetscCall(PetscSFLinkFetchAndOpLocal(sf, link, rootdata, leafdata, leafupdate, op));
  PetscFunctionReturn(PETSC_SUCCESS);
}

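/* Finish the leaf->root phase started in FetchAndOpBegin, apply op at the roots while recording the fetched
   values in rootbuf, then run a root->leaf phase to return those values in leafupdate. */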
PETSC_INTERN PetscErrorCode PetscSFFetchAndOpEnd_Basic(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscSFLink link = NULL;

  PetscFunctionBegin;
  PetscCall(PetscSFLinkGetInUse(sf, unit, rootdata, leafdata, PETSC_OWN_POINTER, &link));
  /* This implementation could be changed to unpack as receives arrive, at the cost of non-determinism */
  PetscCall(PetscSFLinkFinishCommunication(sf, link, PETSCSF_LEAF2ROOT));
  /* Do fetch-and-op, the (remote) update results are in rootbuf */
  PetscCall(PetscSFLinkFetchAndOpRemote(sf, link, rootdata, op));
  /* Bcast rootbuf to leafupdate */
  PetscCall(PetscSFLinkStartCommunication(sf, link, PETSCSF_ROOT2LEAF));
  PetscCall(PetscSFLinkFinishCommunication(sf, link, PETSCSF_ROOT2LEAF));
  /* Unpack and insert fetched data into leaves */
  PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_REMOTE, leafupdate, MPI_REPLACE));
  PetscCall(PetscSFLinkReclaim(sf, &link));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFGetLeafRanks_Basic(PetscSF sf, PetscMPIInt *niranks, const PetscMPIInt **iranks, const PetscInt **ioffset, const PetscInt **irootloc)
{
  PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

  PetscFunctionBegin;
  if (niranks) *niranks = bas->niranks;
  if (iranks) *iranks = bas->iranks;
  if (ioffset) *ioffset = bas->ioffset;
  if (irootloc) *irootloc = bas->irootloc;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* An optimized PetscSFCreateEmbeddedRootSF. We aggressively reuse the communication already established on sf:
   one Bcast on sf suffices, and no further communication is needed to build the embedded sf. Note that selected[]
   was sorted before this routine is called.
 */
PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedRootSF_Basic(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *newsf)
{
  PetscSF            esf;
  PetscInt          *esf_roffset, *esf_rmine, *esf_rremote;
  PetscInt           j, p, q, nroots, esf_nleaves, *new_ilocal, minleaf, maxleaf, maxlocal;
  char              *rootdata, *leafdata, *leafmem; /* Only stores 0 or 1, so we can save memory with char */
  PetscMPIInt       *esf_ranks, nranks, ndranks, niranks, esf_nranks, esf_ndranks, ndiranks;
  const PetscMPIInt *ranks, *iranks;
  const PetscInt    *roffset, *rmine, *rremote, *ioffset, *irootloc;
  PetscBool          connected;
  PetscSFNode       *new_iremote;
  PetscSF_Basic     *bas;

  PetscFunctionBegin;
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sf), &esf));
  PetscCall(PetscSFSetFromOptions(esf));
  PetscCall(PetscSFSetType(esf, PETSCSFBASIC)); /* This optimized routine can only create a basic sf */

  /* Find out which leaves are still connected to roots in the embedded sf by doing a Bcast */
  PetscCall(PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL));
  PetscCall(PetscSFGetLeafRange(sf, &minleaf, &maxleaf));
  maxlocal = maxleaf - minleaf + 1;
  PetscCall(PetscCalloc2(nroots, &rootdata, maxlocal, &leafmem));
  leafdata = PetscSafePointerPlusOffset(leafmem, -minleaf);
  /* Tag selected roots */
  for (PetscInt i = 0; i < nselected; ++i) rootdata[selected[i]] = 1;

  PetscCall(PetscSFBcastBegin(sf, MPI_CHAR, rootdata, leafdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPI_CHAR, rootdata, leafdata, MPI_REPLACE));
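  /* leafdata[] is now 1 exactly for the leaves whose root was selected; use it to count the surviving leaves and ranks */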
  PetscCall(PetscSFGetLeafInfo_Basic(sf, &nranks, &ndranks, &ranks, &roffset, &rmine, &rremote)); /* Get send info */
  esf_nranks = esf_ndranks = esf_nleaves = 0;
  for (PetscMPIInt i = 0; i < nranks; i++) {
    connected = PETSC_FALSE; /* Is this process still connected to this remote root rank? */
    for (j = roffset[i]; j < roffset[i + 1]; j++) {
      if (leafdata[rmine[j]]) {
        esf_nleaves++;
        connected = PETSC_TRUE;
      }
    }
    if (connected) {
      esf_nranks++;
      if (i < ndranks) esf_ndranks++;
    }
  }

  /* Set graph of esf and also set up its outgoing communication (i.e., send info), which is usually done by PetscSFSetUpRanks */
  PetscCall(PetscMalloc1(esf_nleaves, &new_ilocal));
  PetscCall(PetscMalloc1(esf_nleaves, &new_iremote));
  PetscCall(PetscMalloc4(esf_nranks, &esf_ranks, esf_nranks + 1, &esf_roffset, esf_nleaves, &esf_rmine, esf_nleaves, &esf_rremote));
  p              = 0; /* Counter for connected root ranks */
  q              = 0; /* Counter for connected leaves */
  esf_roffset[0] = 0;
  for (PetscMPIInt i = 0; i < nranks; i++) { /* Scan leaf data again to fill esf arrays */
    connected = PETSC_FALSE;
    for (j = roffset[i]; j < roffset[i + 1]; j++) {
      if (leafdata[rmine[j]]) {
        esf_rmine[q] = new_ilocal[q] = rmine[j];
        esf_rremote[q]               = rremote[j];
        new_iremote[q].index         = rremote[j];
        new_iremote[q].rank          = ranks[i];
        connected                    = PETSC_TRUE;
        q++;
      }
    }
    if (connected) {
      esf_ranks[p]       = ranks[i];
      esf_roffset[p + 1] = q;
      p++;
    }
  }

  /* SetGraph internally resets the SF, so we only set its fields after the call */
  PetscCall(PetscSFSetGraph(esf, nroots, esf_nleaves, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
  esf->nranks    = esf_nranks;
  esf->ndranks   = esf_ndranks;
  esf->ranks     = esf_ranks;
  esf->roffset   = esf_roffset;
  esf->rmine     = esf_rmine;
  esf->rremote   = esf_rremote;
  esf->nleafreqs = esf_nranks - esf_ndranks;

  /* Set up the incoming communication (i.e., recv info) stored in esf->data, which is usually done by PetscSFSetUp_Basic */
  bas = (PetscSF_Basic *)esf->data;
  PetscCall(PetscSFGetRootInfo_Basic(sf, &niranks, &ndiranks, &iranks, &ioffset, &irootloc)); /* Get recv info */
  /* The embedded sf always has simpler communication than the original one. We might allocate longer arrays than
     needed here, but we do not care since these arrays are usually short. The benefit is that we can fill them by
     parsing irootloc just once.
   */
  PetscCall(PetscMalloc2(niranks, &bas->iranks, niranks + 1, &bas->ioffset));
  PetscCall(PetscMalloc1(ioffset[niranks], &bas->irootloc));
  bas->niranks = bas->ndiranks = bas->ioffset[0] = 0;
  p                                              = 0; /* Counter for connected leaf ranks */
  q                                              = 0; /* Counter for connected roots */
  for (PetscMPIInt i = 0; i < niranks; i++) {
    connected = PETSC_FALSE; /* Is the current process still connected to this remote leaf rank? */
    for (j = ioffset[i]; j < ioffset[i + 1]; j++) {
      if (rootdata[irootloc[j]]) {
        bas->irootloc[q++] = irootloc[j];
        connected          = PETSC_TRUE;
      }
    }
    if (connected) {
      bas->niranks++;
      if (i < ndiranks) bas->ndiranks++; /* Note that order of ranks (including distinguished ranks) is kept */
      bas->iranks[p]      = iranks[i];
      bas->ioffset[p + 1] = q;
      p++;
    }
  }
  bas->itotal     = q;
  bas->nrootreqs  = bas->niranks - bas->ndiranks;
  esf->persistent = PETSC_TRUE;
  /* Setup packing related fields */
  PetscCall(PetscSFSetUpPackFields(esf));

  /* Copied from PetscSFSetUp(), since this routine skips PetscSFSetUp() */
#if defined(PETSC_HAVE_CUDA)
  if (esf->backend == PETSCSF_BACKEND_CUDA) {
    esf->ops->Malloc = PetscSFMalloc_CUDA;
    esf->ops->Free   = PetscSFFree_CUDA;
  }
#endif

#if defined(PETSC_HAVE_HIP)
  /* TODO: Needs debugging */
  if (esf->backend == PETSCSF_BACKEND_HIP) {
    esf->ops->Malloc = PetscSFMalloc_HIP;
    esf->ops->Free   = PetscSFFree_HIP;
  }
#endif

#if defined(PETSC_HAVE_KOKKOS)
  if (esf->backend == PETSCSF_BACKEND_KOKKOS) {
    esf->ops->Malloc = PetscSFMalloc_Kokkos;
    esf->ops->Free   = PetscSFFree_Kokkos;
  }
#endif
  esf->setupcalled = PETSC_TRUE; /* We have done setup ourselves! */
  PetscCall(PetscFree2(rootdata, leafmem));
  *newsf = esf;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode PetscSFCreate_Basic(PetscSF sf)
{
  PetscSF_Basic *dat;

  PetscFunctionBegin;
  sf->ops->SetUp                = PetscSFSetUp_Basic;
  sf->ops->Reset                = PetscSFReset_Basic;
  sf->ops->Destroy              = PetscSFDestroy_Basic;
  sf->ops->View                 = PetscSFView_Basic;
  sf->ops->BcastBegin           = PetscSFBcastBegin_Basic;
  sf->ops->BcastEnd             = PetscSFBcastEnd_Basic;
  sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
  sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
  sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
  sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
  sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
  sf->ops->CreateEmbeddedRootSF = PetscSFCreateEmbeddedRootSF_Basic;
  sf->ops->SetCommunicationOps  = PetscSFSetCommunicationOps_Basic;

  sf->persistent = PETSC_TRUE; // currently SFBASIC always uses persistent send/recv
  sf->collective = PETSC_FALSE;

  PetscCall(PetscNew(&dat));
  sf->data = (void *)dat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
633