xref: /petsc/src/vec/is/sf/interface/sf.c (revision 4237731afe36b4db0a3147aff0aed51d4657afa2)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petsc/private/viewerimpl.h>
4 #include <petsc/private/hashmapi.h>
5 
6 #if defined(PETSC_HAVE_CUDA)
7   #include <cuda_runtime.h>
8   #include <petscdevice_cuda.h>
9 #endif
10 
11 #if defined(PETSC_HAVE_HIP)
12   #include <hip/hip_runtime.h>
13 #endif
14 
/* PetscSFCheckGraphSet(sf, arg): verify the graph of sf has been set before use.
   - Under the clang static analyzer, expose it as a real function so the tool can model it.
   - In debug builds, it errors out with a helpful message naming the offending argument.
   - In optimized builds, it compiles to nothing. */
#if defined(PETSC_CLANG_STATIC_ANALYZER)
extern void PetscSFCheckGraphSet(PetscSF, int);
#else
  #if defined(PETSC_USE_DEBUG)
    #define PetscSFCheckGraphSet(sf, arg) PetscCheck((sf)->graphset, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %d \"%s\" before %s()", (arg), #sf, PETSC_FUNCTION_NAME)
  #else
    /* no-op in optimized builds; do/while(0) keeps it statement-safe */
    #define PetscSFCheckGraphSet(sf, arg) \
      do { \
      } while (0)
  #endif
#endif

/* String tables for enum option parsing/viewing: value names, then the enum type
   name, the value-name prefix, and a NULL sentinel (PETSc enum-table convention) */
const char *const PetscSFDuplicateOptions[]     = {"CONFONLY", "RANKS", "GRAPH", "PetscSFDuplicateOption", "PETSCSF_DUPLICATE_", NULL};
const char *const PetscSFConcatenateRootModes[] = {"local", "shared", "global", "PetscSFConcatenateRootMode", "PETSCSF_CONCATENATE_ROOTMODE_", NULL};
29 
30 /*@
31   PetscSFCreate - create a star forest communication context
32 
33   Collective
34 
35   Input Parameter:
36 . comm - communicator on which the star forest will operate
37 
38   Output Parameter:
39 . sf - new star forest context
40 
41   Options Database Key:
42 + -sf_type basic                 - Use MPI persistent Isend/Irecv for communication (Default)
43 . -sf_type window                - Use MPI-3 one-sided window for communication
44 . -sf_type neighbor              - Use MPI-3 neighborhood collectives for communication
45 - -sf_neighbor_persistent <bool> - If true, use MPI-4 persistent neighborhood collectives for communication (used along with -sf_type neighbor)
46 
47   Level: intermediate
48 
49   Note:
  When one knows the communication graph is one of the predefined graphs, such as `MPI_Alltoall()`, `MPI_Allgatherv()`,
51   `MPI_Gatherv()`, one can create a `PetscSF` and then set its graph with `PetscSFSetGraphWithPattern()`. These special
52   `SF`s are optimized and they have better performance than the general `SF`s.
53 
54 .seealso: `PetscSF`, `PetscSFSetType`, `PetscSFSetGraph()`, `PetscSFSetGraphWithPattern()`, `PetscSFDestroy()`
55 @*/
56 PetscErrorCode PetscSFCreate(MPI_Comm comm, PetscSF *sf)
57 {
58   PetscSF b;
59 
60   PetscFunctionBegin;
61   PetscAssertPointer(sf, 2);
62   PetscCall(PetscSFInitializePackage());
63 
64   PetscCall(PetscHeaderCreate(b, PETSCSF_CLASSID, "PetscSF", "Star Forest", "PetscSF", comm, PetscSFDestroy, PetscSFView));
65   b->nroots    = -1;
66   b->nleaves   = -1;
67   b->minleaf   = PETSC_INT_MAX;
68   b->maxleaf   = PETSC_INT_MIN;
69   b->nranks    = -1;
70   b->rankorder = PETSC_TRUE;
71   b->ingroup   = MPI_GROUP_NULL;
72   b->outgroup  = MPI_GROUP_NULL;
73   b->graphset  = PETSC_FALSE;
74 #if defined(PETSC_HAVE_DEVICE)
75   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
76   b->use_stream_aware_mpi = PETSC_FALSE;
77   b->unknown_input_stream = PETSC_FALSE;
78   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
79   b->backend = PETSCSF_BACKEND_KOKKOS;
80   #elif defined(PETSC_HAVE_CUDA)
81   b->backend = PETSCSF_BACKEND_CUDA;
82   #elif defined(PETSC_HAVE_HIP)
83   b->backend = PETSCSF_BACKEND_HIP;
84   #endif
85 
86   #if defined(PETSC_HAVE_NVSHMEM)
87   b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
88   b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
89   PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem", &b->use_nvshmem, NULL));
90   PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem_get", &b->use_nvshmem_get, NULL));
91   #endif
92 #endif
93   b->vscat.from_n = -1;
94   b->vscat.to_n   = -1;
95   b->vscat.unit   = MPIU_SCALAR;
96   *sf             = b;
97   PetscFunctionReturn(PETSC_SUCCESS);
98 }
99 
100 /*@
101   PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
102 
103   Collective
104 
105   Input Parameter:
106 . sf - star forest
107 
108   Level: advanced
109 
110 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetGraph()`, `PetscSFDestroy()`
111 @*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* Let the type implementation release its own data first */
  PetscTryTypeMethod(sf, Reset);
  PetscCall(PetscSFDestroy(&sf->rankssf));

  /* Return the graph description to the "unset" state established in PetscSFCreate() */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_INT_MAX;
  sf->maxleaf  = PETSC_INT_MIN;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  /* mine/remote may alias these owned buffers (see PetscSFSetGraph()); free the owned storage */
  PetscCall(PetscFree(sf->mine_alloc));
  PetscCall(PetscFree(sf->remote_alloc));
  sf->nranks = -1;
  PetscCall(PetscFree4(sf->ranks, sf->roffset, sf->rmine, sf->rremote));
  sf->degreeknown = PETSC_FALSE;
  PetscCall(PetscFree(sf->degree));
  /* MPI groups are only created on demand; free them if present */
  if (sf->ingroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->ingroup));
  if (sf->outgroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->outgroup));

  /* Break the back-reference before destroying the multi-SF to avoid recursing into ourselves */
  if (sf->multi) sf->multi->multi = NULL;
  PetscCall(PetscSFDestroy(&sf->multi));

  PetscCall(PetscLayoutDestroy(&sf->map));

#if defined(PETSC_HAVE_DEVICE)
  /* Free both device-side rmine buffers (two slots; exact roles defined in sfimpl.h — not visible here) */
  for (PetscInt i = 0; i < 2; i++) PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, sf->rmine_d[i]));
#endif

  /* A subsequent use must go through PetscSFSetUp() again */
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
147 
148 /*@
149   PetscSFSetType - Set the `PetscSF` communication implementation
150 
151   Collective
152 
153   Input Parameters:
154 + sf   - the `PetscSF` context
155 - type - a known method
156 .vb
157     PETSCSFWINDOW - MPI-2/3 one-sided
158     PETSCSFBASIC - basic implementation using MPI-1 two-sided
159 .ve
160 
161   Options Database Key:
162 . -sf_type <type> - Sets the method; for example `basic` or `window` use -help for a list of available methods
163 
164   Level: intermediate
165 
166   Notes:
167   See `PetscSFType` for possible values
168 
169 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`
170 @*/
171 PetscErrorCode PetscSFSetType(PetscSF sf, PetscSFType type)
172 {
173   PetscBool match;
174   PetscErrorCode (*r)(PetscSF);
175 
176   PetscFunctionBegin;
177   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
178   PetscAssertPointer(type, 2);
179 
180   PetscCall(PetscObjectTypeCompare((PetscObject)sf, type, &match));
181   if (match) PetscFunctionReturn(PETSC_SUCCESS);
182 
183   PetscCall(PetscFunctionListFind(PetscSFList, type, &r));
184   PetscCheck(r, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_UNKNOWN_TYPE, "Unable to find requested PetscSF type %s", type);
185   /* Destroy the previous PetscSF implementation context */
186   PetscTryTypeMethod(sf, Destroy);
187   PetscCall(PetscMemzero(sf->ops, sizeof(*sf->ops)));
188   PetscCall(PetscObjectChangeTypeName((PetscObject)sf, type));
189   PetscCall((*r)(sf));
190   PetscFunctionReturn(PETSC_SUCCESS);
191 }
192 
193 /*@
194   PetscSFGetType - Get the `PetscSF` communication implementation
195 
196   Not Collective
197 
198   Input Parameter:
199 . sf - the `PetscSF` context
200 
201   Output Parameter:
202 . type - the `PetscSF` type name
203 
204   Level: intermediate
205 
206 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetType()`, `PetscSFCreate()`
207 @*/
PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(type, 2);
  /* Returns the stored type name; NULL if PetscSFSetType() has not been called yet */
  *type = ((PetscObject)sf)->type_name;
  PetscFunctionReturn(PETSC_SUCCESS);
}
216 
217 /*@
218   PetscSFDestroy - destroy a star forest
219 
220   Collective
221 
222   Input Parameter:
223 . sf - address of star forest
224 
225   Level: intermediate
226 
227 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFReset()`
228 @*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(PETSC_SUCCESS);
  PetscValidHeaderSpecific(*sf, PETSCSF_CLASSID, 1);
  /* Only the last reference actually tears down the object */
  if (--((PetscObject)*sf)->refct > 0) {
    *sf = NULL;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCall(PetscSFReset(*sf));
  PetscTryTypeMethod(*sf, Destroy);
  PetscCall(PetscSFDestroy(&(*sf)->vscat.lsf));
  /* vscat.unit was MPI_Type_dup'ed when bs > 1 (see PetscSFDuplicate()), so it must be freed here */
  if ((*sf)->vscat.bs > 1) PetscCallMPI(MPI_Type_free(&(*sf)->vscat.unit));
#if defined(PETSC_HAVE_CUDA) && defined(PETSC_HAVE_MPIX_STREAM)
  /* Stream and stream-attached communicator were created in PetscSFSetFromOptions() */
  if ((*sf)->use_stream_aware_mpi) {
    PetscCallMPI(MPIX_Stream_free(&(*sf)->mpi_stream));
    PetscCallMPI(MPI_Comm_free(&(*sf)->stream_comm));
  }
#endif
  PetscCall(PetscHeaderDestroy(sf));
  PetscFunctionReturn(PETSC_SUCCESS);
}
251 
252 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
253 {
254   PetscInt           i, nleaves;
255   PetscMPIInt        size;
256   const PetscInt    *ilocal;
257   const PetscSFNode *iremote;
258 
259   PetscFunctionBegin;
260   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(PETSC_SUCCESS);
261   PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, &iremote));
262   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
263   for (i = 0; i < nleaves; i++) {
264     const PetscInt rank   = iremote[i].rank;
265     const PetscInt remote = iremote[i].index;
266     const PetscInt leaf   = ilocal ? ilocal[i] : i;
267     PetscCheck(rank >= 0 && rank < size, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided rank (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be in [0, %d)", rank, i, size);
268     PetscCheck(remote >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided index (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be >= 0", remote, i);
269     PetscCheck(leaf >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided location (%" PetscInt_FMT ") for leaf %" PetscInt_FMT " is invalid, should be >= 0", leaf, i);
270   }
271   PetscFunctionReturn(PETSC_SUCCESS);
272 }
273 
274 /*@
275   PetscSFSetUp - set up communication structures for a `PetscSF`, after this is done it may be used to perform communication
276 
277   Collective
278 
279   Input Parameter:
280 . sf - star forest communication object
281 
282   Level: beginner
283 
284 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetFromOptions()`, `PetscSFSetType()`
285 @*/
PetscErrorCode PetscSFSetUp(PetscSF sf)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  /* Setup is idempotent */
  if (sf->setupcalled) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(PETSCSF_SetUp, sf, 0, 0, 0));
  /* In debug mode, validate ranks/indices of the graph before building anything on it */
  PetscCall(PetscSFCheckGraphValid_Private(sf));
  if (!((PetscObject)sf)->type_name) PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); /* Zero all sf->ops */
  PetscTryTypeMethod(sf, SetUp);
  /* Install the device malloc/free callbacks matching the backend chosen in
     PetscSFCreate()/PetscSFSetFromOptions(); the branches are mutually exclusive
     because each tests sf->backend for a different value */
#if defined(PETSC_HAVE_CUDA)
  if (sf->backend == PETSCSF_BACKEND_CUDA) {
    sf->ops->Malloc = PetscSFMalloc_CUDA;
    sf->ops->Free   = PetscSFFree_CUDA;
  }
#endif
#if defined(PETSC_HAVE_HIP)
  if (sf->backend == PETSCSF_BACKEND_HIP) {
    sf->ops->Malloc = PetscSFMalloc_HIP;
    sf->ops->Free   = PetscSFFree_HIP;
  }
#endif

#if defined(PETSC_HAVE_KOKKOS)
  if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
    sf->ops->Malloc = PetscSFMalloc_Kokkos;
    sf->ops->Free   = PetscSFFree_Kokkos;
  }
#endif
  PetscCall(PetscLogEventEnd(PETSCSF_SetUp, sf, 0, 0, 0));
  sf->setupcalled = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
319 
320 /*@
321   PetscSFSetFromOptions - set `PetscSF` options using the options database
322 
323   Logically Collective
324 
325   Input Parameter:
326 . sf - star forest
327 
328   Options Database Keys:
+ -sf_type                      - implementation type, see `PetscSFSetType()`
. -sf_rank_order                - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
. -sf_monitor                   - monitor the MPI communication in the `PetscSF`
. -sf_unknown_input_stream      - Assume callers of `PetscSF` computed the input root/leafdata on arbitrary GPU streams unknown to `PetscSF`,
                                  so `PetscSF` must synchronize streams before processing the data (default: false).
. -sf_use_stream_aware_mpi      - Assume the underlying MPI is CUDA-stream aware and `PetscSF` won't sync streams for send/recv buffers passed to MPI (default: false).
                                  If true, this option only works with `-use_gpu_aware_mpi 1`.

- -sf_backend <cuda,hip,kokkos> - Select the device backend `PetscSF` uses. Currently `PetscSF` has these backends: cuda, hip and kokkos.
                                  On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
                                  the only backend available is kokkos.
340 
341   Level: intermediate
342 
343 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetType()`
344 @*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType deft;
  char        type[256];
  PetscBool   flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscObjectOptionsBegin((PetscObject)sf);
  /* Default to the current type if one is already set, otherwise PETSCSFBASIC */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  PetscCall(PetscOptionsFList("-sf_type", "PetscSF implementation type", "PetscSFSetType", PetscSFList, deft, type, sizeof(type), &flg));
  PetscCall(PetscSFSetType(sf, flg ? type : deft));
  PetscCall(PetscOptionsBool("-sf_rank_order", "sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise", "PetscSFSetRankOrder", sf->rankorder, &sf->rankorder, NULL));
  PetscCall(PetscOptionsBool("-sf_monitor", "monitor the MPI communication in sf", NULL, sf->monitor, &sf->monitor, NULL));
#if defined(PETSC_HAVE_DEVICE)
  {
    char      backendstr[32] = {0};
    PetscBool isCuda = PETSC_FALSE, isHip = PETSC_FALSE, isKokkos = PETSC_FALSE, set;
    /* Change the defaults set in PetscSFCreate() with command line options */
    PetscCall(PetscOptionsBool("-sf_unknown_input_stream", "SF root/leafdata is computed on arbitrary streams unknown to SF", "PetscSFSetFromOptions", sf->unknown_input_stream, &sf->unknown_input_stream, NULL));
    PetscCall(PetscOptionsBool("-sf_use_stream_aware_mpi", "Assume the underlying MPI is cuda-stream aware", "PetscSFSetFromOptions", sf->use_stream_aware_mpi, &sf->use_stream_aware_mpi, NULL));
    PetscCall(PetscOptionsString("-sf_backend", "Select the device backend SF uses", "PetscSFSetFromOptions", NULL, backendstr, sizeof(backendstr), &set));
    /* Case-insensitive match of the requested backend name */
    PetscCall(PetscStrcasecmp("cuda", backendstr, &isCuda));
    PetscCall(PetscStrcasecmp("kokkos", backendstr, &isKokkos));
    PetscCall(PetscStrcasecmp("hip", backendstr, &isHip));
  #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
    if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
    else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
    else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
    else PetscCheck(!set, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
  #elif defined(PETSC_HAVE_KOKKOS)
    /* Without CUDA/HIP support, kokkos is the only acceptable backend choice */
    PetscCheck(!set || isKokkos, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You can only choose kokkos", backendstr);
  #endif

  #if defined(PETSC_HAVE_CUDA) && defined(PETSC_HAVE_MPIX_STREAM)
    if (sf->use_stream_aware_mpi) {
      MPI_Info info;

      /* Register PETSc's default CUDA stream with MPI and build a stream-attached
         communicator; both are freed in PetscSFDestroy() */
      PetscCallMPI(MPI_Info_create(&info));
      PetscCallMPI(MPI_Info_set(info, "type", "cudaStream_t"));
      PetscCallMPI(MPIX_Info_set_hex(info, "value", &PetscDefaultCudaStream, sizeof(PetscDefaultCudaStream)));
      PetscCallMPI(MPIX_Stream_create(info, &sf->mpi_stream));
      PetscCallMPI(MPI_Info_free(&info));
      PetscCallMPI(MPIX_Stream_comm_create(PetscObjectComm((PetscObject)sf), sf->mpi_stream, &sf->stream_comm));
    }
  #endif
  }
#endif
  /* Give the implementation a chance to process its own options */
  PetscTryTypeMethod(sf, SetFromOptions, PetscOptionsObject);
  PetscOptionsEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
397 
398 /*@
399   PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
400 
401   Logically Collective
402 
403   Input Parameters:
404 + sf  - star forest
405 - flg - `PETSC_TRUE` to sort, `PETSC_FALSE` to skip sorting (lower setup cost, but non-deterministic)
406 
407   Level: advanced
408 
409 .seealso: `PetscSF`, `PetscSFType`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`
410 @*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf, PetscBool flg)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveBool(sf, flg, 2);
  /* Once the multi-SF exists (created by the first gather/scatter, per the message below) the ordering is fixed */
  PetscCheck(!sf->multi, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(PETSC_SUCCESS);
}
420 
421 /*@
422   PetscSFSetGraph - Set a parallel star forest
423 
424   Collective
425 
426   Input Parameters:
427 + sf         - star forest
428 . nroots     - number of root vertices on the current process (these are possible targets for other process to attach leaves)
429 . nleaves    - number of leaf vertices on the current process, each of these references a root on any process
430 . ilocal     - locations of leaves in leafdata buffers, pass `NULL` for contiguous storage (locations must be >= 0, enforced
431 during setup in debug mode)
432 . localmode  - copy mode for `ilocal`
433 . iremote    - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
434 during setup in debug mode)
435 - remotemode - copy mode for `iremote`
436 
437   Level: intermediate
438 
439   Notes:
440   Leaf indices in `ilocal` must be unique, otherwise an error occurs.
441 
442   Input arrays `ilocal` and `iremote` follow the `PetscCopyMode` semantics.
443   In particular, if `localmode` or `remotemode` is `PETSC_OWN_POINTER` or `PETSC_USE_POINTER`,
444   PETSc might modify the respective array;
445   if `PETSC_USE_POINTER`, the user must delete the array after `PetscSFDestroy()`.
446   Only if `PETSC_COPY_VALUES` is used, the respective array is guaranteed to stay intact and a const array can be passed (but a cast to non-const is needed).
447 
448   Fortran Notes:
449   In Fortran you must use `PETSC_COPY_VALUES` for `localmode` and `remotemode`.
450 
451   Developer Notes:
452   We sort leaves to check for duplicates and contiguousness and to find minleaf/maxleaf.
453   This also allows to compare leaf sets of two `PetscSF`s easily.
454 
455 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
456 @*/
457 PetscErrorCode PetscSFSetGraph(PetscSF sf, PetscInt nroots, PetscInt nleaves, PetscInt *ilocal, PetscCopyMode localmode, PetscSFNode *iremote, PetscCopyMode remotemode)
458 {
459   PetscBool unique, contiguous;
460 
461   PetscFunctionBegin;
462   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
463   if (nleaves > 0 && ilocal) PetscAssertPointer(ilocal, 4);
464   if (nleaves > 0) PetscAssertPointer(iremote, 6);
465   PetscCheck(nroots >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nroots %" PetscInt_FMT ", cannot be negative", nroots);
466   PetscCheck(nleaves >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nleaves %" PetscInt_FMT ", cannot be negative", nleaves);
467   /* enums may be handled as unsigned by some compilers, NVHPC for example, the int cast
468    * below is to prevent NVHPC from warning about meaningless comparison of unsigned with zero */
469   PetscCheck((int)localmode >= PETSC_COPY_VALUES && localmode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong localmode %d", localmode);
470   PetscCheck((int)remotemode >= PETSC_COPY_VALUES && remotemode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong remotemode %d", remotemode);
471 
472   if (sf->nroots >= 0) { /* Reset only if graph already set */
473     PetscCall(PetscSFReset(sf));
474   }
475 
476   PetscCall(PetscLogEventBegin(PETSCSF_SetGraph, sf, 0, 0, 0));
477   if (PetscDefined(USE_DEBUG)) {
478     PetscMPIInt size;
479 
480     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
481     for (PetscInt i = 0; i < nleaves; i++) { PetscCheck(iremote[i].rank >= -1 && iremote[i].rank < size, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "iremote contains incorrect rank values"); }
482   }
483 
484   sf->nroots  = nroots;
485   sf->nleaves = nleaves;
486 
487   if (localmode == PETSC_COPY_VALUES && ilocal) {
488     PetscInt *tlocal = NULL;
489 
490     PetscCall(PetscMalloc1(nleaves, &tlocal));
491     PetscCall(PetscArraycpy(tlocal, ilocal, nleaves));
492     ilocal = tlocal;
493   }
494   if (remotemode == PETSC_COPY_VALUES) {
495     PetscSFNode *tremote = NULL;
496 
497     PetscCall(PetscMalloc1(nleaves, &tremote));
498     PetscCall(PetscArraycpy(tremote, iremote, nleaves));
499     iremote = tremote;
500   }
501 
502   if (nleaves && ilocal) {
503     PetscSFNode work;
504 
505     PetscCall(PetscSortIntWithDataArray(nleaves, ilocal, iremote, sizeof(PetscSFNode), &work));
506     PetscCall(PetscSortedCheckDupsInt(nleaves, ilocal, &unique));
507     unique = PetscNot(unique);
508     PetscCheck(sf->allow_multi_leaves || unique, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Input ilocal has duplicate entries which is not allowed for this PetscSF");
509     sf->minleaf = ilocal[0];
510     sf->maxleaf = ilocal[nleaves - 1];
511     contiguous  = (PetscBool)(unique && ilocal[0] == 0 && ilocal[nleaves - 1] == nleaves - 1);
512   } else {
513     sf->minleaf = 0;
514     sf->maxleaf = nleaves - 1;
515     unique      = PETSC_TRUE;
516     contiguous  = PETSC_TRUE;
517   }
518 
519   if (contiguous) {
520     if (localmode == PETSC_USE_POINTER) {
521       ilocal = NULL;
522     } else {
523       PetscCall(PetscFree(ilocal));
524     }
525   }
526   sf->mine = ilocal;
527   if (localmode == PETSC_USE_POINTER) {
528     sf->mine_alloc = NULL;
529   } else {
530     sf->mine_alloc = ilocal;
531   }
532   if (PetscDefined(USE_DEBUG)) {
533     PetscMPIInt size;
534 
535     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
536     for (PetscInt i = 0; i < nleaves; i++) { PetscCheck(iremote[i].rank >= -1 && iremote[i].rank < size, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "iremote contains incorrect rank values"); }
537   }
538   sf->remote = iremote;
539   if (remotemode == PETSC_USE_POINTER) {
540     sf->remote_alloc = NULL;
541   } else {
542     sf->remote_alloc = iremote;
543   }
544   PetscCall(PetscLogEventEnd(PETSCSF_SetGraph, sf, 0, 0, 0));
545   sf->graphset = PETSC_TRUE;
546   PetscFunctionReturn(PETSC_SUCCESS);
547 }
548 
549 /*@
550   PetscSFSetGraphWithPattern - Sets the graph of a `PetscSF` with a specific pattern
551 
552   Collective
553 
554   Input Parameters:
555 + sf      - The `PetscSF`
556 . map     - Layout of roots over all processes (insignificant when pattern is `PETSCSF_PATTERN_ALLTOALL`)
557 - pattern - One of `PETSCSF_PATTERN_ALLGATHER`, `PETSCSF_PATTERN_GATHER`, `PETSCSF_PATTERN_ALLTOALL`
558 
559   Level: intermediate
560 
561   Notes:
562   It is easier to explain `PetscSFPattern` using vectors. Suppose we have an MPI vector `x` and its `PetscLayout` is `map`.
563   `n` and `N` are the local and global sizes of `x` respectively.
564 
565   With `PETSCSF_PATTERN_ALLGATHER`, the routine creates a graph that if one does `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on it, it will copy `x` to
566   sequential vectors `y` on all MPI processes.
567 
568   With `PETSCSF_PATTERN_GATHER`, the routine creates a graph that if one does `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on it, it will copy `x` to a
569   sequential vector `y` on rank 0.
570 
571   In above cases, entries of `x` are roots and entries of `y` are leaves.
572 
573   With `PETSCSF_PATTERN_ALLTOALL`, map is insignificant. Suppose NP is size of `sf`'s communicator. The routine
574   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
  of rank j. Here 0 <= i,j < NP. It is a kind of `MPI_Alltoall()` with sendcount/recvcount being 1. Note that it does
  not mean one cannot send multiple items. One just needs to create a new MPI datatype for the multiple data
  items with `MPI_Type_contiguous()` and use that as the <unit> argument in SF routines.
578 
579   In this case, roots and leaves are symmetric.
580 
581 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
582  @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf, PetscLayout map, PetscSFPattern pattern)
{
  MPI_Comm    comm;
  PetscInt    n, N, res[2];
  PetscMPIInt rank, size;
  PetscSFType type;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* map is ignored for the ALLTOALL pattern, so only require it otherwise */
  if (pattern != PETSCSF_PATTERN_ALLTOALL) PetscAssertPointer(map, 2);
  PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
  PetscCheck(pattern >= PETSCSF_PATTERN_ALLGATHER && pattern <= PETSCSF_PATTERN_ALLTOALL, comm, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported PetscSFPattern %d", pattern);
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    type = PETSCSFALLTOALL;
    /* Build our own layout: 'size' roots per rank, size*size roots globally */
    PetscCall(PetscLayoutCreate(comm, &sf->map));
    PetscCall(PetscLayoutSetLocalSize(sf->map, size));
    PetscCall(PetscLayoutSetSize(sf->map, (PetscInt)size * size));
    PetscCall(PetscLayoutSetUp(sf->map));
  } else {
    PetscCall(PetscLayoutGetLocalSize(map, &n));
    PetscCall(PetscLayoutGetSize(map, &N));
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, res, 2, MPIU_INT, MPI_MAX, comm));
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER : PETSCSFGATHER;
    } else {
      /* Unequal local sizes require the 'v' (varying-count) variants */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    PetscCall(PetscLayoutReference(map, &sf->map));
  }
  PetscCall(PetscSFSetType(sf, type));

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    /* Every rank receives all N roots as leaves */
    sf->nleaves = N;
    sf->nroots  = n;
    sf->nranks  = size;
    sf->minleaf = 0;
    sf->maxleaf = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves (all N of them); other ranks have none */
    sf->nleaves = rank ? 0 : N;
    sf->nroots  = n;
    sf->nranks  = rank ? 0 : size;
    sf->minleaf = 0;
    sf->maxleaf = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* Each rank has 'size' roots and 'size' leaves, one per peer */
    sf->nleaves = size;
    sf->nroots  = size;
    sf->nranks  = size;
    sf->minleaf = 0;
    sf->maxleaf = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
649 
650 /*@
651   PetscSFCreateInverseSF - given a `PetscSF` in which all vertices have degree 1, creates the inverse map
652 
653   Collective
654 
655   Input Parameter:
656 . sf - star forest to invert
657 
658   Output Parameter:
659 . isf - inverse of `sf`
660 
661   Level: advanced
662 
663   Notes:
664   All roots must have degree 1.
665 
666   The local space may be a permutation, but cannot be sparse.
667 
668 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetGraph()`
669 @*/
670 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf, PetscSF *isf)
671 {
672   PetscMPIInt     rank;
673   PetscInt        i, nroots, nleaves, maxlocal, count, *newilocal;
674   const PetscInt *ilocal;
675   PetscSFNode    *roots, *leaves;
676 
677   PetscFunctionBegin;
678   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
679   PetscSFCheckGraphSet(sf, 1);
680   PetscAssertPointer(isf, 2);
681 
682   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, NULL));
683   maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
684 
685   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
686   PetscCall(PetscMalloc2(nroots, &roots, maxlocal, &leaves));
687   for (i = 0; i < maxlocal; i++) {
688     leaves[i].rank  = rank;
689     leaves[i].index = i;
690   }
691   for (i = 0; i < nroots; i++) {
692     roots[i].rank  = -1;
693     roots[i].index = -1;
694   }
695   PetscCall(PetscSFReduceBegin(sf, MPIU_SF_NODE, leaves, roots, MPI_REPLACE));
696   PetscCall(PetscSFReduceEnd(sf, MPIU_SF_NODE, leaves, roots, MPI_REPLACE));
697 
698   /* Check whether our leaves are sparse */
699   for (i = 0, count = 0; i < nroots; i++)
700     if (roots[i].rank >= 0) count++;
701   if (count == nroots) newilocal = NULL;
702   else { /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */ PetscCall(PetscMalloc1(count, &newilocal));
703     for (i = 0, count = 0; i < nroots; i++) {
704       if (roots[i].rank >= 0) {
705         newilocal[count]   = i;
706         roots[count].rank  = roots[i].rank;
707         roots[count].index = roots[i].index;
708         count++;
709       }
710     }
711   }
712 
713   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, isf));
714   PetscCall(PetscSFSetGraph(*isf, maxlocal, count, newilocal, PETSC_OWN_POINTER, roots, PETSC_COPY_VALUES));
715   PetscCall(PetscFree2(roots, leaves));
716   PetscFunctionReturn(PETSC_SUCCESS);
717 }
718 
719 /*@
720   PetscSFDuplicate - duplicate a `PetscSF`, optionally preserving rank connectivity and graph
721 
722   Collective
723 
724   Input Parameters:
725 + sf  - communication object to duplicate
726 - opt - `PETSCSF_DUPLICATE_CONFONLY`, `PETSCSF_DUPLICATE_RANKS`, or `PETSCSF_DUPLICATE_GRAPH` (see `PetscSFDuplicateOption`)
727 
728   Output Parameter:
729 . newsf - new communication object
730 
731   Level: beginner
732 
733 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFSetType()`, `PetscSFSetGraph()`
734 @*/
PetscErrorCode PetscSFDuplicate(PetscSF sf, PetscSFDuplicateOption opt, PetscSF *newsf)
{
  PetscSFType  type;
  MPI_Datatype dtype = MPIU_SCALAR; /* default unit; replaced by a dup of sf's unit below when bs > 1 */

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveEnum(sf, opt, 2);
  PetscAssertPointer(newsf, 3);
  /* Create the new SF on the same communicator and give it the same implementation type */
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sf), newsf));
  PetscCall(PetscSFGetType(sf, &type));
  if (type) PetscCall(PetscSFSetType(*newsf, type));
  (*newsf)->allow_multi_leaves = sf->allow_multi_leaves; /* Dup this flag earlier since PetscSFSetGraph() below checks on this flag */
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf, 1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graph: copy the ilocal/iremote arrays into the new SF */
      PetscInt           nroots, nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
      PetscCall(PetscSFSetGraph(*newsf, nroots, nleaves, (PetscInt *)ilocal, PETSC_COPY_VALUES, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
    } else {
      /* Patterned graph (e.g. allgather/alltoall): rebuild from the layout map and pattern id */
      PetscCall(PetscSFSetGraphWithPattern(*newsf, sf->map, sf->pattern));
    }
  }
  /* Since oldtype is committed, so is newtype, according to MPI */
  if (sf->vscat.bs > 1) PetscCallMPI(MPI_Type_dup(sf->vscat.unit, &dtype));
  (*newsf)->vscat.bs     = sf->vscat.bs;
  (*newsf)->vscat.unit   = dtype;
  (*newsf)->vscat.to_n   = sf->vscat.to_n;
  (*newsf)->vscat.from_n = sf->vscat.from_n;
  /* Do not copy lsf. Build it on demand since it is rarely used */

#if defined(PETSC_HAVE_DEVICE)
  (*newsf)->backend              = sf->backend;
  (*newsf)->unknown_input_stream = sf->unknown_input_stream;
  (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
  (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
#endif
  /* Let the implementation copy any type-specific state */
  PetscTryTypeMethod(sf, Duplicate, opt, *newsf);
  /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
  PetscFunctionReturn(PETSC_SUCCESS);
}
778 
779 /*@C
780   PetscSFGetGraph - Get the graph specifying a parallel star forest
781 
782   Not Collective
783 
784   Input Parameter:
785 . sf - star forest
786 
787   Output Parameters:
788 + nroots  - number of root vertices on the current process (these are possible targets for other process to attach leaves)
789 . nleaves - number of leaf vertices on the current process, each of these references a root on any process
790 . ilocal  - locations of leaves in leafdata buffers (if returned value is `NULL`, it means leaves are in contiguous storage)
791 - iremote - remote locations of root vertices for each leaf on the current process
792 
793   Level: intermediate
794 
795   Notes:
796   We are not currently requiring that the graph is set, thus returning `nroots` = -1 if it has not been set yet
797 
798   The returned `ilocal` and `iremote` might contain values in different order than the input ones in `PetscSFSetGraph()`
799 
800   Fortran Notes:
801   The returned `iremote` array is a copy and must be deallocated after use. Consequently, if you
802   want to update the graph, you must call `PetscSFSetGraph()` after modifying the `iremote` array.
803 
804   To check for a `NULL` `ilocal` use
805 $      if (loc(ilocal) == loc(PETSC_NULL_INTEGER)) then
806 
807 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`
808 @*/
809 PetscErrorCode PetscSFGetGraph(PetscSF sf, PetscInt *nroots, PetscInt *nleaves, const PetscInt **ilocal, const PetscSFNode **iremote)
810 {
811   PetscFunctionBegin;
812   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
813   if (sf->ops->GetGraph) {
814     PetscCall(sf->ops->GetGraph(sf, nroots, nleaves, ilocal, iremote));
815   } else {
816     if (nroots) *nroots = sf->nroots;
817     if (nleaves) *nleaves = sf->nleaves;
818     if (ilocal) *ilocal = sf->mine;
819     if (iremote) *iremote = sf->remote;
820   }
821   PetscFunctionReturn(PETSC_SUCCESS);
822 }
823 
824 /*@
825   PetscSFGetLeafRange - Get the active leaf ranges
826 
827   Not Collective
828 
829   Input Parameter:
830 . sf - star forest
831 
832   Output Parameters:
833 + minleaf - minimum active leaf on this process. Returns 0 if there are no leaves.
834 - maxleaf - maximum active leaf on this process. Returns -1 if there are no leaves.
835 
836   Level: developer
837 
838 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
839 @*/
840 PetscErrorCode PetscSFGetLeafRange(PetscSF sf, PetscInt *minleaf, PetscInt *maxleaf)
841 {
842   PetscFunctionBegin;
843   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
844   PetscSFCheckGraphSet(sf, 1);
845   if (minleaf) *minleaf = sf->minleaf;
846   if (maxleaf) *maxleaf = sf->maxleaf;
847   PetscFunctionReturn(PETSC_SUCCESS);
848 }
849 
850 /*@
851   PetscSFViewFromOptions - View a `PetscSF` based on arguments in the options database
852 
853   Collective
854 
855   Input Parameters:
856 + A    - the star forest
857 . obj  - Optional object that provides the prefix for the option names
858 - name - command line option
859 
860   Level: intermediate
861 
862   Note:
863   See `PetscObjectViewFromOptions()` for possible `PetscViewer` and `PetscViewerFormat`
864 
865 .seealso: `PetscSF`, `PetscSFView`, `PetscObjectViewFromOptions()`, `PetscSFCreate()`
866 @*/
PetscErrorCode PetscSFViewFromOptions(PetscSF A, PetscObject obj, const char name[])
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, PETSCSF_CLASSID, 1);
  /* Thin wrapper: defer to the generic options-driven PetscObject viewing machinery */
  PetscCall(PetscObjectViewFromOptions((PetscObject)A, obj, name));
  PetscFunctionReturn(PETSC_SUCCESS);
}
874 
875 /*@
876   PetscSFView - view a star forest
877 
878   Collective
879 
880   Input Parameters:
881 + sf     - star forest
882 - viewer - viewer to display graph, for example `PETSC_VIEWER_STDOUT_WORLD`
883 
884   Level: beginner
885 
886 .seealso: `PetscSF`, `PetscViewer`, `PetscSFCreate()`, `PetscSFSetGraph()`
887 @*/
PetscErrorCode PetscSFView(PetscSF sf, PetscViewer viewer)
{
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* Default to stdout on the SF's communicator if no viewer was supplied */
  if (!viewer) PetscCall(PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf), &viewer));
  PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
  PetscCheckSameComm(sf, 1, viewer, 2);
  /* Set up first so the rank/offset arrays printed below are populated */
  if (sf->graphset) PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
  if (iascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) {
    PetscMPIInt rank;
    PetscInt    j;

    PetscCall(PetscObjectPrintClassNamePrefixType((PetscObject)sf, viewer));
    PetscCall(PetscViewerASCIIPushTab(viewer));
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "PetscSFSetGraph() has not been called yet\n"));
        PetscCall(PetscViewerASCIIPopTab(viewer));
        PetscFunctionReturn(PETSC_SUCCESS);
      }
      PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
      /* Synchronized printing keeps per-rank output ordered by rank */
      PetscCall(PetscViewerASCIIPushSynchronized(viewer));
      PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Number of roots=%" PetscInt_FMT ", leaves=%" PetscInt_FMT ", remote ranks=%d\n", rank, sf->nroots, sf->nleaves, sf->nranks));
      /* sf->mine may be NULL, meaning leaves are stored contiguously at indices 0..nleaves-1 */
      for (PetscInt i = 0; i < sf->nleaves; i++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %" PetscInt_FMT " <- (%d,%" PetscInt_FMT ")\n", rank, sf->mine ? sf->mine[i] : i, (PetscMPIInt)sf->remote[i].rank, sf->remote[i].index));
      PetscCall(PetscViewerFlush(viewer));
      PetscCall(PetscViewerGetFormat(viewer, &format));
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks, *perm;

        /* Sort a copy of the rank list and print edges grouped by (sorted) remote rank */
        PetscCall(PetscMalloc2(sf->nranks, &tmpranks, sf->nranks, &perm));
        PetscCall(PetscArraycpy(tmpranks, sf->ranks, sf->nranks));
        for (PetscMPIInt i = 0; i < sf->nranks; i++) perm[i] = i;
        PetscCall(PetscSortMPIIntWithArray(sf->nranks, tmpranks, perm));
        PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Roots referenced by my leaves, by rank\n", rank));
        for (PetscMPIInt ii = 0; ii < sf->nranks; ii++) {
          PetscMPIInt i = perm[ii];

          PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %d: %" PetscInt_FMT " edges\n", rank, sf->ranks[i], sf->roffset[i + 1] - sf->roffset[i]));
          for (j = sf->roffset[i]; j < sf->roffset[i + 1]; j++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d]    %" PetscInt_FMT " <- %" PetscInt_FMT "\n", rank, sf->rmine[j], sf->rremote[j]));
        }
        PetscCall(PetscFree2(tmpranks, perm));
      }
      PetscCall(PetscViewerFlush(viewer));
      PetscCall(PetscViewerASCIIPopSynchronized(viewer));
    }
    PetscCall(PetscViewerASCIIPopTab(viewer));
  }
  /* Give the implementation a chance to add type-specific output */
  PetscTryTypeMethod(sf, View, viewer);
  PetscFunctionReturn(PETSC_SUCCESS);
}
942 
943 /*@C
944   PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
945 
946   Not Collective
947 
948   Input Parameter:
949 . sf - star forest
950 
951   Output Parameters:
952 + nranks  - number of ranks referenced by local part
953 . ranks   - [`nranks`] array of ranks
954 . roffset - [`nranks`+1] offset in `rmine`/`rremote` for each rank
955 . rmine   - [`roffset`[`nranks`]] concatenated array holding local indices referencing each remote rank, or `NULL`
956 - rremote - [`roffset`[`nranks`]] concatenated array holding remote indices referenced for each remote rank, or `NULL`
957 
958   Level: developer
959 
960 .seealso: `PetscSF`, `PetscSFGetLeafRanks()`
961 @*/
962 PetscErrorCode PetscSFGetRootRanks(PetscSF sf, PetscMPIInt *nranks, const PetscMPIInt **ranks, const PetscInt **roffset, const PetscInt **rmine, const PetscInt **rremote)
963 {
964   PetscFunctionBegin;
965   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
966   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
967   if (sf->ops->GetRootRanks) {
968     PetscUseTypeMethod(sf, GetRootRanks, nranks, ranks, roffset, rmine, rremote);
969   } else {
970     /* The generic implementation */
971     if (nranks) *nranks = sf->nranks;
972     if (ranks) *ranks = sf->ranks;
973     if (roffset) *roffset = sf->roffset;
974     if (rmine) *rmine = sf->rmine;
975     if (rremote) *rremote = sf->rremote;
976   }
977   PetscFunctionReturn(PETSC_SUCCESS);
978 }
979 
980 /*@C
981   PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
982 
983   Not Collective
984 
985   Input Parameter:
986 . sf - star forest
987 
988   Output Parameters:
989 + niranks  - number of leaf ranks referencing roots on this process
990 . iranks   - [`niranks`] array of ranks
991 . ioffset  - [`niranks`+1] offset in `irootloc` for each rank
992 - irootloc - [`ioffset`[`niranks`]] concatenated array holding local indices of roots referenced by each leaf rank
993 
994   Level: developer
995 
996 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
997 @*/
998 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf, PetscMPIInt *niranks, const PetscMPIInt **iranks, const PetscInt **ioffset, const PetscInt **irootloc)
999 {
1000   PetscFunctionBegin;
1001   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1002   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
1003   if (sf->ops->GetLeafRanks) {
1004     PetscUseTypeMethod(sf, GetLeafRanks, niranks, iranks, ioffset, irootloc);
1005   } else {
1006     PetscSFType type;
1007     PetscCall(PetscSFGetType(sf, &type));
1008     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
1009   }
1010   PetscFunctionReturn(PETSC_SUCCESS);
1011 }
1012 
1013 static PetscBool InList(PetscMPIInt needle, PetscMPIInt n, const PetscMPIInt *list)
1014 {
1015   PetscInt i;
1016   for (i = 0; i < n; i++) {
1017     if (needle == list[i]) return PETSC_TRUE;
1018   }
1019   return PETSC_FALSE;
1020 }
1021 
1022 /*@C
1023   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by `PetscSF` implementations.
1024 
1025   Collective
1026 
1027   Input Parameters:
1028 + sf     - `PetscSF` to set up; `PetscSFSetGraph()` must have been called
1029 - dgroup - `MPI_Group` of ranks to be distinguished (e.g., for self or shared memory exchange)
1030 
1031   Level: developer
1032 
1033 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
1034 @*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf, MPI_Group dgroup)
{
  PetscHMapI    table;
  PetscHashIter pos;
  PetscMPIInt   size, groupsize, *groupranks, *ranks;
  PetscInt     *rcount;
  PetscInt      irank, sfnrank, ranksi;
  PetscMPIInt   i, orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
  /* Count, per remote rank, how many of our leaves reference it, using a hash map keyed by rank */
  PetscCall(PetscHMapICreateWithSize(10, &table));
  for (i = 0; i < sf->nleaves; i++) {
    /* Log 1-based rank */
    PetscCall(PetscHMapISetWithMode(table, sf->remote[i].rank + 1, 1, ADD_VALUES));
  }
  PetscCall(PetscHMapIGetSize(table, &sfnrank));
  PetscCall(PetscMPIIntCast(sfnrank, &sf->nranks));
  PetscCall(PetscMalloc4(sf->nranks, &sf->ranks, sf->nranks + 1, &sf->roffset, sf->nleaves, &sf->rmine, sf->nleaves, &sf->rremote));
  PetscCall(PetscMalloc2(sf->nranks, &rcount, sf->nranks, &ranks));
  /* Flatten the hash map into parallel arrays: ranks[] and per-rank leaf counts rcount[] */
  PetscHashIterBegin(table, pos);
  for (i = 0; i < sf->nranks; i++) {
    PetscHashIterGetKey(table, pos, ranksi);
    PetscCall(PetscMPIIntCast(ranksi, &ranks[i]));
    PetscHashIterGetVal(table, pos, rcount[i]);
    PetscHashIterNext(table, pos);
    ranks[i]--; /* Convert back to 0-based */
  }
  PetscCall(PetscHMapIDestroy(&table));

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group    group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;

    /* Translate the distinguished group's members into ranks of the SF's communicator */
    PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
    PetscCallMPI(MPI_Group_size(dgroup, &groupsize));
    PetscCall(PetscMalloc1(groupsize, &dgroupranks));
    PetscCall(PetscMalloc1(groupsize, &groupranks));
    for (i = 0; i < groupsize; i++) dgroupranks[i] = i;
    if (groupsize) PetscCallMPI(MPI_Group_translate_ranks(dgroup, groupsize, dgroupranks, group, groupranks));
    PetscCallMPI(MPI_Group_free(&group));
    PetscCall(PetscFree(dgroupranks));
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  for (sf->ndranks = 0, i = sf->nranks; sf->ndranks < i;) {
    for (i--; sf->ndranks < i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i], groupsize, groupranks)) break;
    }
    for (; sf->ndranks <= i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks], groupsize, groupranks)) break;
    }
    if (sf->ndranks < i) { /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscMPIInt tmprank;
      PetscInt    tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  PetscCall(PetscFree(groupranks));
  /* Sort each partition by rank, carrying the leaf counts along */
  PetscCall(PetscSortMPIIntWithIntArray(sf->ndranks, ranks, rcount));
  if (rcount) PetscCall(PetscSortMPIIntWithIntArray(sf->nranks - sf->ndranks, ranks + sf->ndranks, rcount + sf->ndranks));
  /* Prefix-sum the counts into CSR-style offsets roffset[]; reset rcount[] for reuse as fill cursors */
  sf->roffset[0] = 0;
  for (i = 0; i < sf->nranks; i++) {
    PetscCall(PetscMPIIntCast(ranks[i], sf->ranks + i));
    sf->roffset[i + 1] = sf->roffset[i] + rcount[i];
    rcount[i]          = 0;
  }
  /* Bucket each leaf (local index into rmine[], remote root index into rremote[]) under its remote rank */
  for (i = 0, irank = -1, orank = -1; i < sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks */
      PetscCall(PetscFindMPIInt((PetscMPIInt)sf->remote[i].rank, sf->ndranks, sf->ranks, &irank));
      if (irank < 0) {
        PetscCall(PetscFindMPIInt((PetscMPIInt)sf->remote[i].rank, sf->nranks - sf->ndranks, sf->ranks + sf->ndranks, &irank));
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = (PetscMPIInt)sf->remote[i].rank;
    }
    PetscCheck(irank >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Could not find rank %d in array", (PetscMPIInt)sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  PetscCall(PetscFree2(rcount, ranks));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1131 
1132 /*@C
1133   PetscSFGetGroups - gets incoming and outgoing process groups
1134 
1135   Collective
1136 
1137   Input Parameter:
1138 . sf - star forest
1139 
1140   Output Parameters:
1141 + incoming - group of origin processes for incoming edges (leaves that reference my roots)
1142 - outgoing - group of destination processes for outgoing edges (roots that I reference)
1143 
1144   Level: developer
1145 
1146 .seealso: `PetscSF`, `PetscSFGetWindow()`, `PetscSFRestoreWindow()`
1147 @*/
PetscErrorCode PetscSFGetGroups(PetscSF sf, MPI_Group *incoming, MPI_Group *outgoing)
{
  MPI_Group group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  PetscCheck(sf->nranks >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUpRanks() before obtaining groups");
  /* Lazily build and cache the incoming group (processes whose leaves reference our roots) */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt        i;
    const PetscInt *indegree;
    PetscMPIInt     rank, *outranks, *inranks, indegree0;
    PetscSFNode    *remote;
    PetscSF         bgcount;

    /* Compute the number of incoming ranks */
    PetscCall(PetscMalloc1(sf->nranks, &remote));
    for (i = 0; i < sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    /* bgcount has one root per process and one leaf per remote rank we reference */
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, &bgcount));
    PetscCall(PetscSFSetGraph(bgcount, 1, sf->nranks, NULL, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
    PetscCall(PetscSFComputeDegreeBegin(bgcount, &indegree));
    PetscCall(PetscSFComputeDegreeEnd(bgcount, &indegree));
    /* Enumerate the incoming ranks */
    PetscCall(PetscMalloc2(indegree[0], &inranks, sf->nranks, &outranks));
    PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
    /* Gather our own rank to each referenced process, collecting the referencing ranks there */
    for (i = 0; i < sf->nranks; i++) outranks[i] = rank;
    PetscCall(PetscSFGatherBegin(bgcount, MPI_INT, outranks, inranks));
    PetscCall(PetscSFGatherEnd(bgcount, MPI_INT, outranks, inranks));
    PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
    PetscCall(PetscMPIIntCast(indegree[0], &indegree0));
    PetscCallMPI(MPI_Group_incl(group, indegree0, inranks, &sf->ingroup));
    PetscCallMPI(MPI_Group_free(&group));
    PetscCall(PetscFree2(inranks, outranks));
    PetscCall(PetscSFDestroy(&bgcount));
  }
  *incoming = sf->ingroup;

  /* The outgoing group is simply the cached list of root ranks we reference */
  if (sf->outgroup == MPI_GROUP_NULL) {
    PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
    PetscCallMPI(MPI_Group_incl(group, sf->nranks, sf->ranks, &sf->outgroup));
    PetscCallMPI(MPI_Group_free(&group));
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1194 
1195 /*@
1196   PetscSFGetRanksSF - gets the `PetscSF` to perform communications with root ranks
1197 
1198   Collective
1199 
1200   Input Parameter:
1201 . sf - star forest
1202 
1203   Output Parameter:
1204 . rsf - the star forest with a single root per process to perform communications
1205 
1206   Level: developer
1207 
1208 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGetRootRanks()`
1209 @*/
1210 PetscErrorCode PetscSFGetRanksSF(PetscSF sf, PetscSF *rsf)
1211 {
1212   PetscFunctionBegin;
1213   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1214   PetscAssertPointer(rsf, 2);
1215   if (!sf->rankssf) {
1216     PetscSFNode       *rremotes;
1217     const PetscMPIInt *ranks;
1218     PetscMPIInt        nranks;
1219 
1220     PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL));
1221     PetscCall(PetscMalloc1(nranks, &rremotes));
1222     for (PetscInt i = 0; i < nranks; i++) {
1223       rremotes[i].rank  = ranks[i];
1224       rremotes[i].index = 0;
1225     }
1226     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, &sf->rankssf));
1227     PetscCall(PetscSFSetGraph(sf->rankssf, 1, nranks, NULL, PETSC_OWN_POINTER, rremotes, PETSC_OWN_POINTER));
1228   }
1229   *rsf = sf->rankssf;
1230   PetscFunctionReturn(PETSC_SUCCESS);
1231 }
1232 
1233 /*@
1234   PetscSFGetMultiSF - gets the inner `PetscSF` implementing gathers and scatters
1235 
1236   Collective
1237 
1238   Input Parameter:
1239 . sf - star forest that may contain roots with 0 or with more than 1 vertex
1240 
1241   Output Parameter:
1242 . multi - star forest with split roots, such that each root has degree exactly 1
1243 
1244   Level: developer
1245 
1246   Note:
1247   In most cases, users should use `PetscSFGatherBegin()` and `PetscSFScatterBegin()` instead of manipulating multi
1248   directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1249   edge, it is a candidate for future optimization that might involve its removal.
1250 
1251 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`, `PetscSFComputeMultiRootOriginalNumbering()`
1252 @*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf, PetscSF *multi)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(multi, 2);
  if (sf->nroots < 0) { /* Graph has not been set yet; why do we need this? */
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
    *multi           = sf->multi;
    sf->multi->multi = sf->multi; /* the multi-SF is its own multi-SF */
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt        i, *inoffset, *outones, *outoffset, maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
    PetscCall(PetscSFComputeDegreeBegin(sf, &indegree));
    PetscCall(PetscSFComputeDegreeEnd(sf, &indegree));
    PetscCall(PetscMalloc3(sf->nroots + 1, &inoffset, maxlocal, &outones, maxlocal, &outoffset));
    /* inoffset[] = prefix sum of root degrees: each root i owns slots [inoffset[i], inoffset[i+1]) in the multi-root space */
    inoffset[0] = 0;
    for (i = 0; i < sf->nroots; i++) inoffset[i + 1] = inoffset[i] + indegree[i];
    /* Fetch-and-add of 1 assigns each leaf a unique slot under its root; outoffset[] receives the slot indices */
    for (i = 0; i < maxlocal; i++) outones[i] = 1;
    PetscCall(PetscSFFetchAndOpBegin(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
    PetscCall(PetscSFFetchAndOpEnd(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
    for (i = 0; i < sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
    if (PetscDefined(USE_DEBUG)) {                               /* Check that the expected number of increments occurred */
      for (i = 0; i < sf->nroots; i++) PetscCheck(inoffset[i] + indegree[i] == inoffset[i + 1], PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incorrect result after PetscSFFetchAndOp");
    }
    /* Retarget every leaf at its private slot in the expanded root space */
    PetscCall(PetscMalloc1(sf->nleaves, &remote));
    for (i = 0; i < sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
    sf->multi->multi = sf->multi;
    PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
    if (sf->rankorder) { /* Sort the ranks */
      PetscMPIInt  rank;
      PetscInt    *inranks, *newoffset, *outranks, *newoutoffset, *tmpoffset, maxdegree;
      PetscSFNode *newremote;
      PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
      for (i = 0, maxdegree = 0; i < sf->nroots; i++) maxdegree = PetscMax(maxdegree, indegree[i]);
      PetscCall(PetscMalloc5(sf->multi->nroots, &inranks, sf->multi->nroots, &newoffset, maxlocal, &outranks, maxlocal, &newoutoffset, maxdegree, &tmpoffset));
      /* Send our rank to each slot we occupy, so roots learn which rank fills each slot */
      for (i = 0; i < maxlocal; i++) outranks[i] = rank;
      PetscCall(PetscSFReduceBegin(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i = 0; i < sf->nroots; i++) {
        PetscInt j;
        for (j = 0; j < indegree[i]; j++) tmpoffset[j] = j;
        PetscCall(PetscSortIntWithArray(indegree[i], PetscSafePointerPlusOffset(inranks, inoffset[i]), tmpoffset));
        for (j = 0; j < indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Broadcast the rank-sorted slot numbers back to the leaves and rebuild the graph with them */
      PetscCall(PetscSFBcastBegin(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
      PetscCall(PetscMalloc1(sf->nleaves, &newremote));
      for (i = 0; i < sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, newremote, PETSC_OWN_POINTER));
      PetscCall(PetscFree5(inranks, newoffset, outranks, newoutoffset, tmpoffset));
    }
    PetscCall(PetscFree3(inoffset, outones, outoffset));
  }
  *multi = sf->multi;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1321 
1322 /*@C
1323   PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots of a `PetscSF`, does not remap indices
1324 
1325   Collective
1326 
1327   Input Parameters:
1328 + sf        - original star forest
1329 . nselected - number of selected roots on this process
1330 - selected  - indices of the selected roots on this process
1331 
1332   Output Parameter:
1333 . esf - new star forest
1334 
1335   Level: advanced
1336 
1337   Note:
1338   To use the new `PetscSF`, it may be necessary to know the indices of the leaves that are still participating. This can
  be done by calling `PetscSFGetGraph()`.
1340 
1341 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1342 @*/
PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *esf)
{
  PetscInt           i, j, n, nroots, nleaves, esf_nleaves, *new_ilocal, minleaf, maxleaf, maxlocal;
  const PetscInt    *ilocal;
  signed char       *rootdata, *leafdata, *leafmem;
  const PetscSFNode *iremote;
  PetscSFNode       *new_iremote;
  MPI_Comm           comm;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  if (nselected) PetscAssertPointer(selected, 3);
  PetscAssertPointer(esf, 4);

  PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscLogEventBegin(PETSCSF_EmbedSF, sf, 0, 0, 0));
  PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
  PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));

  if (PetscDefined(USE_DEBUG)) { /* Error out if selected[] has dups or out of range indices */
    PetscBool dups;
    PetscCall(PetscCheckDupsInt(nselected, selected, &dups));
    PetscCheck(!dups, comm, PETSC_ERR_ARG_WRONG, "selected[] has dups");
    for (i = 0; i < nselected; i++) PetscCheck(selected[i] >= 0 && selected[i] < nroots, comm, PETSC_ERR_ARG_OUTOFRANGE, "selected root index %" PetscInt_FMT " is out of [0,%" PetscInt_FMT ")", selected[i], nroots);
  }

  if (sf->ops->CreateEmbeddedRootSF) PetscUseTypeMethod(sf, CreateEmbeddedRootSF, nselected, selected, esf);
  else {
    /* A generic version of creating embedded sf */
    PetscCall(PetscSFGetLeafRange(sf, &minleaf, &maxleaf));
    maxlocal = maxleaf - minleaf + 1;
    PetscCall(PetscCalloc2(nroots, &rootdata, maxlocal, &leafmem));
    /* Offset so leafdata[] can be indexed directly with leaf indices in [minleaf, maxleaf] */
    leafdata = PetscSafePointerPlusOffset(leafmem, -minleaf);
    /* Tag selected roots and bcast to leaves */
    for (i = 0; i < nselected; i++) rootdata[selected[i]] = 1;
    PetscCall(PetscSFBcastBegin(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));

    /* Build esf with leaves that are still connected */
    esf_nleaves = 0;
    for (i = 0; i < nleaves; i++) {
      j = ilocal ? ilocal[i] : i;
      /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
         with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
      */
      esf_nleaves += (leafdata[j] ? 1 : 0);
    }
    /* Second pass: copy the surviving edges, keeping the original (unremapped) indices */
    PetscCall(PetscMalloc1(esf_nleaves, &new_ilocal));
    PetscCall(PetscMalloc1(esf_nleaves, &new_iremote));
    for (i = n = 0; i < nleaves; i++) {
      j = ilocal ? ilocal[i] : i;
      if (leafdata[j]) {
        new_ilocal[n]        = j;
        new_iremote[n].rank  = iremote[i].rank;
        new_iremote[n].index = iremote[i].index;
        ++n;
      }
    }
    PetscCall(PetscSFCreate(comm, esf));
    PetscCall(PetscSFSetFromOptions(*esf));
    PetscCall(PetscSFSetGraph(*esf, nroots, esf_nleaves, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
    PetscCall(PetscFree2(rootdata, leafmem));
  }
  PetscCall(PetscLogEventEnd(PETSCSF_EmbedSF, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1410 
1411 /*@C
1412   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves of a `PetscSF`, does not remap indices
1413 
1414   Collective
1415 
1416   Input Parameters:
1417 + sf        - original star forest
1418 . nselected - number of selected leaves on this process
1419 - selected  - indices of the selected leaves on this process
1420 
1421   Output Parameter:
1422 . newsf - new star forest
1423 
1424   Level: advanced
1425 
1426 .seealso: `PetscSF`, `PetscSFCreateEmbeddedRootSF()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1427 @*/
1428 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *newsf)
1429 {
1430   const PetscSFNode *iremote;
1431   PetscSFNode       *new_iremote;
1432   const PetscInt    *ilocal;
1433   PetscInt           i, nroots, *leaves, *new_ilocal;
1434   MPI_Comm           comm;
1435 
1436   PetscFunctionBegin;
1437   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1438   PetscSFCheckGraphSet(sf, 1);
1439   if (nselected) PetscAssertPointer(selected, 3);
1440   PetscAssertPointer(newsf, 4);
1441 
1442   /* Uniq selected[] and put results in leaves[] */
1443   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1444   PetscCall(PetscMalloc1(nselected, &leaves));
1445   PetscCall(PetscArraycpy(leaves, selected, nselected));
1446   PetscCall(PetscSortedRemoveDupsInt(&nselected, leaves));
1447   PetscCheck(!nselected || !(leaves[0] < 0 || leaves[nselected - 1] >= sf->nleaves), comm, PETSC_ERR_ARG_OUTOFRANGE, "Min/Max leaf indices %" PetscInt_FMT "/%" PetscInt_FMT " are not in [0,%" PetscInt_FMT ")", leaves[0], leaves[nselected - 1], sf->nleaves);
1448 
1449   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1450   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) PetscUseTypeMethod(sf, CreateEmbeddedLeafSF, nselected, leaves, newsf);
1451   else {
1452     PetscCall(PetscSFGetGraph(sf, &nroots, NULL, &ilocal, &iremote));
1453     PetscCall(PetscMalloc1(nselected, &new_ilocal));
1454     PetscCall(PetscMalloc1(nselected, &new_iremote));
1455     for (i = 0; i < nselected; ++i) {
1456       const PetscInt l     = leaves[i];
1457       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1458       new_iremote[i].rank  = iremote[l].rank;
1459       new_iremote[i].index = iremote[l].index;
1460     }
1461     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, newsf));
1462     PetscCall(PetscSFSetGraph(*newsf, nroots, nselected, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1463   }
1464   PetscCall(PetscFree(leaves));
1465   PetscFunctionReturn(PETSC_SUCCESS);
1466 }
1467 
1468 /*@C
1469   PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to `PetscSFBcastEnd()`
1470 
1471   Collective
1472 
1473   Input Parameters:
1474 + sf       - star forest on which to communicate
1475 . unit     - data type associated with each node
1476 . rootdata - buffer to broadcast
1477 - op       - operation to use for reduction
1478 
1479   Output Parameter:
1480 . leafdata - buffer to be reduced with values from each leaf's respective root
1481 
1482   Level: intermediate
1483 
1484   Note:
1485   When PETSc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1486   are host pointers or device pointers, which may incur a noticeable cost. If you already knew the info, you should
1487   use `PetscSFBcastWithMemTypeBegin()` instead.
1488 
1489 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastWithMemTypeBegin()`
1490 @*/
1491 PetscErrorCode PetscSFBcastBegin(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
1492 {
1493   PetscMemType rootmtype, leafmtype;
1494 
1495   PetscFunctionBegin;
1496   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1497   PetscCall(PetscSFSetUp(sf));
1498   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
1499   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1500   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1501   PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
1502   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
1503   PetscFunctionReturn(PETSC_SUCCESS);
1504 }
1505 
1506 /*@C
1507   PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call
1508   to `PetscSFBcastEnd()`
1509 
1510   Collective
1511 
1512   Input Parameters:
1513 + sf        - star forest on which to communicate
1514 . unit      - data type associated with each node
1515 . rootmtype - memory type of rootdata
1516 . rootdata  - buffer to broadcast
1517 . leafmtype - memory type of leafdata
1518 - op        - operation to use for reduction
1519 
1520   Output Parameter:
1521 . leafdata - buffer to be reduced with values from each leaf's respective root
1522 
1523   Level: intermediate
1524 
1525 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastBegin()`
1526 @*/
1527 PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
1528 {
1529   PetscFunctionBegin;
1530   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1531   PetscCall(PetscSFSetUp(sf));
1532   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
1533   PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
1534   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
1535   PetscFunctionReturn(PETSC_SUCCESS);
1536 }
1537 
1538 /*@C
1539   PetscSFBcastEnd - end a broadcast and reduce operation started with `PetscSFBcastBegin()` or `PetscSFBcastWithMemTypeBegin()`
1540 
1541   Collective
1542 
1543   Input Parameters:
1544 + sf       - star forest
1545 . unit     - data type
1546 . rootdata - buffer to broadcast
1547 - op       - operation to use for reduction
1548 
1549   Output Parameter:
1550 . leafdata - buffer to be reduced with values from each leaf's respective root
1551 
1552   Level: intermediate
1553 
1554 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFReduceEnd()`
1555 @*/
1556 PetscErrorCode PetscSFBcastEnd(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
1557 {
1558   PetscFunctionBegin;
1559   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1560   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastEnd, sf, 0, 0, 0));
1561   PetscUseTypeMethod(sf, BcastEnd, unit, rootdata, leafdata, op);
1562   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastEnd, sf, 0, 0, 0));
1563   PetscFunctionReturn(PETSC_SUCCESS);
1564 }
1565 
1566 /*@C
1567   PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to `PetscSFReduceEnd()`
1568 
1569   Collective
1570 
1571   Input Parameters:
1572 + sf       - star forest
1573 . unit     - data type
1574 . leafdata - values to reduce
1575 - op       - reduction operation
1576 
1577   Output Parameter:
1578 . rootdata - result of reduction of values from all leaves of each root
1579 
1580   Level: intermediate
1581 
1582   Note:
1583   When PETSc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1584   are host pointers or device pointers, which may incur a noticeable cost. If you already knew the info, you should
1585   use `PetscSFReduceWithMemTypeBegin()` instead.
1586 
1587 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceWithMemTypeBegin()`, `PetscSFReduceEnd()`
1588 @*/
1589 PetscErrorCode PetscSFReduceBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1590 {
1591   PetscMemType rootmtype, leafmtype;
1592 
1593   PetscFunctionBegin;
1594   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1595   PetscCall(PetscSFSetUp(sf));
1596   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1597   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1598   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1599   PetscCall(sf->ops->ReduceBegin(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1600   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1601   PetscFunctionReturn(PETSC_SUCCESS);
1602 }
1603 
1604 /*@C
1605   PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to `PetscSFReduceEnd()`
1606 
1607   Collective
1608 
1609   Input Parameters:
1610 + sf        - star forest
1611 . unit      - data type
1612 . leafmtype - memory type of leafdata
1613 . leafdata  - values to reduce
1614 . rootmtype - memory type of rootdata
1615 - op        - reduction operation
1616 
1617   Output Parameter:
1618 . rootdata - result of reduction of values from all leaves of each root
1619 
1620   Level: intermediate
1621 
1622 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceBegin()`, `PetscSFReduceEnd()`
1623 @*/
1624 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
1625 {
1626   PetscFunctionBegin;
1627   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1628   PetscCall(PetscSFSetUp(sf));
1629   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1630   PetscCall(sf->ops->ReduceBegin(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1631   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1632   PetscFunctionReturn(PETSC_SUCCESS);
1633 }
1634 
1635 /*@C
1636   PetscSFReduceEnd - end a reduction operation started with `PetscSFReduceBegin()` or `PetscSFReduceWithMemTypeBegin()`
1637 
1638   Collective
1639 
1640   Input Parameters:
1641 + sf       - star forest
1642 . unit     - data type
1643 . leafdata - values to reduce
1644 - op       - reduction operation
1645 
1646   Output Parameter:
1647 . rootdata - result of reduction of values from all leaves of each root
1648 
1649   Level: intermediate
1650 
1651 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFBcastEnd()`, `PetscSFReduceBegin()`, `PetscSFReduceWithMemTypeBegin()`
1652 @*/
1653 PetscErrorCode PetscSFReduceEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1654 {
1655   PetscFunctionBegin;
1656   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1657   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceEnd, sf, 0, 0, 0));
1658   PetscUseTypeMethod(sf, ReduceEnd, unit, leafdata, rootdata, op);
1659   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceEnd, sf, 0, 0, 0));
1660   PetscFunctionReturn(PETSC_SUCCESS);
1661 }
1662 
1663 /*@C
1664   PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value,
1665   to be completed with `PetscSFFetchAndOpEnd()`
1666 
1667   Collective
1668 
1669   Input Parameters:
1670 + sf       - star forest
1671 . unit     - data type
1672 . leafdata - leaf values to use in reduction
1673 - op       - operation to use for reduction
1674 
1675   Output Parameters:
1676 + rootdata   - root values to be updated, input state is seen by first process to perform an update
1677 - leafupdate - state at each leaf's respective root immediately prior to my atomic update
1678 
1679   Level: advanced
1680 
1681   Note:
1682   The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1683   might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1684   not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1685   integers.
1686 
1687 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`
1688 @*/
1689 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1690 {
1691   PetscMemType rootmtype, leafmtype, leafupdatemtype;
1692 
1693   PetscFunctionBegin;
1694   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1695   PetscCall(PetscSFSetUp(sf));
1696   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1697   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1698   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1699   PetscCall(PetscGetMemType(leafupdate, &leafupdatemtype));
1700   PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
1701   PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
1702   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1703   PetscFunctionReturn(PETSC_SUCCESS);
1704 }
1705 
/*@C
  PetscSFFetchAndOpWithMemTypeBegin - begin operation with explicit memory types that fetches values from root and updates atomically by
  applying operation using my leaf value, to be completed with `PetscSFFetchAndOpEnd()`

  Collective

  Input Parameters:
+ sf              - star forest
. unit            - data type
. rootmtype       - memory type of rootdata
. leafmtype       - memory type of leafdata
. leafdata        - leaf values to use in reduction
. leafupdatemtype - memory type of leafupdate
- op              - operation to use for reduction

  Output Parameters:
+ rootdata   - root values to be updated, input state is seen by first process to perform an update
- leafupdate - state at each leaf's respective root immediately prior to my atomic update

  Level: advanced

  Note:
  See `PetscSFFetchAndOpBegin()` for more details.

.seealso: `PetscSF`, `PetscSFFetchAndOpBegin()`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`, `PetscSFFetchAndOpEnd()`
@*/
PetscErrorCode PetscSFFetchAndOpWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, PetscMemType leafupdatemtype, void *leafupdate, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  /* NOTE: unlike Bcast/Reduce, FetchAndOp events are logged unconditionally (no sf->vscat.logging guard) */
  PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
  /* implementations require leafdata and leafupdate to live in the same memory space
     (same check as in PetscSFFetchAndOpBegin()) */
  PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
  PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
  PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1743 
/*@C
  PetscSFFetchAndOpEnd - end operation started in matching call to `PetscSFFetchAndOpBegin()` or `PetscSFFetchAndOpWithMemTypeBegin()`
  to fetch values from roots and update atomically by applying operation using my leaf value

  Collective

  Input Parameters:
+ sf       - star forest
. unit     - data type
. leafdata - leaf values to use in reduction
- op       - operation to use for reduction

  Output Parameters:
+ rootdata   - root values to be updated, input state is seen by first process to perform an update
- leafupdate - state at each leaf's respective root immediately prior to my atomic update

  Level: advanced

.seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFReduceEnd()`, `PetscSFSetGraph()`, `PetscSFFetchAndOpBegin()`, `PetscSFFetchAndOpWithMemTypeBegin()`
@*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* NOTE: unlike Bcast/Reduce, FetchAndOp events are logged unconditionally (no sf->vscat.logging guard) */
  PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
  PetscUseTypeMethod(sf, FetchAndOpEnd, unit, rootdata, leafdata, leafupdate, op);
  PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1773 
1774 /*@C
1775   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with `PetscSFComputeDegreeEnd()`
1776 
1777   Collective
1778 
1779   Input Parameter:
1780 . sf - star forest
1781 
1782   Output Parameter:
1783 . degree - degree of each root vertex
1784 
1785   Level: advanced
1786 
1787   Note:
1788   The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence there is no need to call `PetscFree()` on it.
1789 
1790 .seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeEnd()`
1791 @*/
1792 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf, const PetscInt *degree[])
1793 {
1794   PetscFunctionBegin;
1795   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1796   PetscSFCheckGraphSet(sf, 1);
1797   PetscAssertPointer(degree, 2);
1798   if (!sf->degreeknown) {
1799     PetscInt i, nroots = sf->nroots, maxlocal;
1800     PetscCheck(!sf->degree, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1801     maxlocal = sf->maxleaf - sf->minleaf + 1;
1802     PetscCall(PetscMalloc1(nroots, &sf->degree));
1803     PetscCall(PetscMalloc1(PetscMax(maxlocal, 1), &sf->degreetmp)); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1804     for (i = 0; i < nroots; i++) sf->degree[i] = 0;
1805     for (i = 0; i < maxlocal; i++) sf->degreetmp[i] = 1;
1806     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
1807   }
1808   *degree = NULL;
1809   PetscFunctionReturn(PETSC_SUCCESS);
1810 }
1811 
1812 /*@C
1813   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with `PetscSFComputeDegreeBegin()`
1814 
1815   Collective
1816 
1817   Input Parameter:
1818 . sf - star forest
1819 
1820   Output Parameter:
1821 . degree - degree of each root vertex
1822 
1823   Level: developer
1824 
1825   Note:
1826   The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence there is no need to call `PetscFree()` on it.
1827 
1828 .seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeBegin()`
1829 @*/
1830 PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf, const PetscInt **degree)
1831 {
1832   PetscFunctionBegin;
1833   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1834   PetscSFCheckGraphSet(sf, 1);
1835   PetscAssertPointer(degree, 2);
1836   if (!sf->degreeknown) {
1837     PetscCheck(sf->degreetmp, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
1838     PetscCall(PetscSFReduceEnd(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
1839     PetscCall(PetscFree(sf->degreetmp));
1840     sf->degreeknown = PETSC_TRUE;
1841   }
1842   *degree = sf->degree;
1843   PetscFunctionReturn(PETSC_SUCCESS);
1844 }
1845 
1846 /*@C
1847   PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-`PetscSF` returned by `PetscSFGetMultiSF()`).
1848   Each multi-root is assigned index of the corresponding original root.
1849 
1850   Collective
1851 
1852   Input Parameters:
1853 + sf     - star forest
1854 - degree - degree of each root vertex, computed with `PetscSFComputeDegreeBegin()`/`PetscSFComputeDegreeEnd()`
1855 
1856   Output Parameters:
1857 + nMultiRoots             - (optional) number of multi-roots (roots of multi-`PetscSF`)
1858 - multiRootsOrigNumbering - original indices of multi-roots; length of this array is `nMultiRoots`
1859 
1860   Level: developer
1861 
1862   Note:
1863   The returned array `multiRootsOrigNumbering` is newly allocated and should be destroyed with `PetscFree()` when no longer needed.
1864 
1865 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFComputeDegreeEnd()`, `PetscSFGetMultiSF()`
1866 @*/
1867 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1868 {
1869   PetscSF  msf;
1870   PetscInt k = 0, nroots, nmroots;
1871 
1872   PetscFunctionBegin;
1873   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1874   PetscCall(PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL));
1875   if (nroots) PetscAssertPointer(degree, 2);
1876   if (nMultiRoots) PetscAssertPointer(nMultiRoots, 3);
1877   PetscAssertPointer(multiRootsOrigNumbering, 4);
1878   PetscCall(PetscSFGetMultiSF(sf, &msf));
1879   PetscCall(PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL));
1880   PetscCall(PetscMalloc1(nmroots, multiRootsOrigNumbering));
1881   for (PetscInt i = 0; i < nroots; i++) {
1882     if (!degree[i]) continue;
1883     for (PetscInt j = 0; j < degree[i]; j++, k++) (*multiRootsOrigNumbering)[k] = i;
1884   }
1885   PetscCheck(k == nmroots, PETSC_COMM_SELF, PETSC_ERR_PLIB, "sanity check fail");
1886   if (nMultiRoots) *nMultiRoots = nmroots;
1887   PetscFunctionReturn(PETSC_SUCCESS);
1888 }
1889 
1890 /*@C
1891   PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with `PetscSFGatherEnd()`
1892 
1893   Collective
1894 
1895   Input Parameters:
1896 + sf       - star forest
1897 . unit     - data type
1898 - leafdata - leaf data to gather to roots
1899 
1900   Output Parameter:
1901 . multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1902 
1903   Level: intermediate
1904 
1905 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterBegin()`
1906 @*/
1907 PetscErrorCode PetscSFGatherBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1908 {
1909   PetscSF multi = NULL;
1910 
1911   PetscFunctionBegin;
1912   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1913   PetscCall(PetscSFSetUp(sf));
1914   PetscCall(PetscSFGetMultiSF(sf, &multi));
1915   PetscCall(PetscSFReduceBegin(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1916   PetscFunctionReturn(PETSC_SUCCESS);
1917 }
1918 
1919 /*@C
1920   PetscSFGatherEnd - ends pointwise gather operation that was started with `PetscSFGatherBegin()`
1921 
1922   Collective
1923 
1924   Input Parameters:
1925 + sf       - star forest
1926 . unit     - data type
1927 - leafdata - leaf data to gather to roots
1928 
1929   Output Parameter:
1930 . multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1931 
1932   Level: intermediate
1933 
1934 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterEnd()`
1935 @*/
1936 PetscErrorCode PetscSFGatherEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1937 {
1938   PetscSF multi = NULL;
1939 
1940   PetscFunctionBegin;
1941   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1942   PetscCall(PetscSFGetMultiSF(sf, &multi));
1943   PetscCall(PetscSFReduceEnd(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1944   PetscFunctionReturn(PETSC_SUCCESS);
1945 }
1946 
1947 /*@C
1948   PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with `PetscSFScatterEnd()`
1949 
1950   Collective
1951 
1952   Input Parameters:
1953 + sf            - star forest
1954 . unit          - data type
1955 - multirootdata - root buffer to send to each leaf, one unit of data per leaf
1956 
1957   Output Parameter:
1958 . leafdata - leaf data to be update with personal data from each respective root
1959 
1960   Level: intermediate
1961 
1962 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterEnd()`
1963 @*/
1964 PetscErrorCode PetscSFScatterBegin(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1965 {
1966   PetscSF multi = NULL;
1967 
1968   PetscFunctionBegin;
1969   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1970   PetscCall(PetscSFSetUp(sf));
1971   PetscCall(PetscSFGetMultiSF(sf, &multi));
1972   PetscCall(PetscSFBcastBegin(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1973   PetscFunctionReturn(PETSC_SUCCESS);
1974 }
1975 
1976 /*@C
1977   PetscSFScatterEnd - ends pointwise scatter operation that was started with `PetscSFScatterBegin()`
1978 
1979   Collective
1980 
1981   Input Parameters:
1982 + sf            - star forest
1983 . unit          - data type
1984 - multirootdata - root buffer to send to each leaf, one unit of data per leaf
1985 
1986   Output Parameter:
1987 . leafdata - leaf data to be update with personal data from each respective root
1988 
1989   Level: intermediate
1990 
1991 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterBegin()`
1992 @*/
1993 PetscErrorCode PetscSFScatterEnd(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1994 {
1995   PetscSF multi = NULL;
1996 
1997   PetscFunctionBegin;
1998   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1999   PetscCall(PetscSFGetMultiSF(sf, &multi));
2000   PetscCall(PetscSFBcastEnd(multi, unit, multirootdata, leafdata, MPI_REPLACE));
2001   PetscFunctionReturn(PETSC_SUCCESS);
2002 }
2003 
2004 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
2005 {
2006   PetscInt        i, n, nleaves;
2007   const PetscInt *ilocal = NULL;
2008   PetscHSetI      seen;
2009 
2010   PetscFunctionBegin;
2011   if (PetscDefined(USE_DEBUG)) {
2012     PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, NULL));
2013     PetscCall(PetscHSetICreate(&seen));
2014     for (i = 0; i < nleaves; i++) {
2015       const PetscInt leaf = ilocal ? ilocal[i] : i;
2016       PetscCall(PetscHSetIAdd(seen, leaf));
2017     }
2018     PetscCall(PetscHSetIGetSize(seen, &n));
2019     PetscCheck(n == nleaves, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided leaves have repeated values: all leaves must be unique");
2020     PetscCall(PetscHSetIDestroy(&seen));
2021   }
2022   PetscFunctionReturn(PETSC_SUCCESS);
2023 }
2024 
2025 /*@
2026   PetscSFCompose - Compose a new `PetscSF` by putting the second `PetscSF` under the first one in a top (roots) down (leaves) view
2027 
2028   Input Parameters:
2029 + sfA - The first `PetscSF`
2030 - sfB - The second `PetscSF`
2031 
2032   Output Parameter:
2033 . sfBA - The composite `PetscSF`
2034 
2035   Level: developer
2036 
2037   Notes:
2038   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
2039   forests, i.e. the same leaf is not connected with different roots.
2040 
2041   `sfA`'s leaf space and `sfB`'s root space might be partially overlapped. The composition builds
2042   a graph with `sfA`'s roots and `sfB`'s leaves only when there is a path between them. Unconnected
2043   nodes (roots or leaves) are not in `sfBA`. Doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on the new `PetscSF` is equivalent to doing a
2044   `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on `sfA`, then a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on `sfB`, on connected nodes.
2045 
2046 .seealso: `PetscSF`, `PetscSFComposeInverse()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`
2047 @*/
2048 PetscErrorCode PetscSFCompose(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
2049 {
2050   const PetscSFNode *remotePointsA, *remotePointsB;
2051   PetscSFNode       *remotePointsBA = NULL, *reorderedRemotePointsA = NULL, *leafdataB;
2052   const PetscInt    *localPointsA, *localPointsB;
2053   PetscInt          *localPointsBA;
2054   PetscInt           i, numRootsA, numLeavesA, numRootsB, numLeavesB, minleaf, maxleaf, numLeavesBA;
2055   PetscBool          denseB;
2056 
2057   PetscFunctionBegin;
2058   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
2059   PetscSFCheckGraphSet(sfA, 1);
2060   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
2061   PetscSFCheckGraphSet(sfB, 2);
2062   PetscCheckSameComm(sfA, 1, sfB, 2);
2063   PetscAssertPointer(sfBA, 3);
2064   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
2065   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
2066 
2067   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
2068   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
2069   /* Make sure that PetscSFBcast{Begin, End}(sfB, ...) works with root data of size
2070      numRootsB; otherwise, garbage will be broadcasted.
2071      Example (comm size = 1):
2072      sfA: 0 <- (0, 0)
2073      sfB: 100 <- (0, 0)
2074           101 <- (0, 1)
2075      Here, we have remotePointsA = [(0, 0)], but for remotePointsA to be a valid tartget
2076      of sfB, it has to be recasted as [(0, 0), (-1, -1)] so that points 100 and 101 would
2077      receive (0, 0) and (-1, -1), respectively, when PetscSFBcast(sfB, ...) is called on
2078      remotePointsA; if not recasted, point 101 would receive a garbage value.             */
2079   PetscCall(PetscMalloc1(numRootsB, &reorderedRemotePointsA));
2080   for (i = 0; i < numRootsB; i++) {
2081     reorderedRemotePointsA[i].rank  = -1;
2082     reorderedRemotePointsA[i].index = -1;
2083   }
2084   for (i = 0; i < numLeavesA; i++) {
2085     PetscInt localp = localPointsA ? localPointsA[i] : i;
2086 
2087     if (localp >= numRootsB) continue;
2088     reorderedRemotePointsA[localp] = remotePointsA[i];
2089   }
2090   remotePointsA = reorderedRemotePointsA;
2091   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2092   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &leafdataB));
2093   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2094     leafdataB[i].rank  = -1;
2095     leafdataB[i].index = -1;
2096   }
2097   PetscCall(PetscSFBcastBegin(sfB, MPIU_SF_NODE, remotePointsA, PetscSafePointerPlusOffset(leafdataB, -minleaf), MPI_REPLACE));
2098   PetscCall(PetscSFBcastEnd(sfB, MPIU_SF_NODE, remotePointsA, PetscSafePointerPlusOffset(leafdataB, -minleaf), MPI_REPLACE));
2099   PetscCall(PetscFree(reorderedRemotePointsA));
2100 
2101   denseB = (PetscBool)!localPointsB;
2102   for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2103     if (leafdataB[localPointsB ? localPointsB[i] - minleaf : i].rank == -1) denseB = PETSC_FALSE;
2104     else numLeavesBA++;
2105   }
2106   if (denseB) {
2107     localPointsBA  = NULL;
2108     remotePointsBA = leafdataB;
2109   } else {
2110     PetscCall(PetscMalloc1(numLeavesBA, &localPointsBA));
2111     PetscCall(PetscMalloc1(numLeavesBA, &remotePointsBA));
2112     for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2113       const PetscInt l = localPointsB ? localPointsB[i] : i;
2114 
2115       if (leafdataB[l - minleaf].rank == -1) continue;
2116       remotePointsBA[numLeavesBA] = leafdataB[l - minleaf];
2117       localPointsBA[numLeavesBA]  = l;
2118       numLeavesBA++;
2119     }
2120     PetscCall(PetscFree(leafdataB));
2121   }
2122   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2123   PetscCall(PetscSFSetFromOptions(*sfBA));
2124   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2125   PetscFunctionReturn(PETSC_SUCCESS);
2126 }
2127 
2128 /*@
2129   PetscSFComposeInverse - Compose a new `PetscSF` by putting the inverse of the second `PetscSF` under the first one
2130 
2131   Input Parameters:
2132 + sfA - The first `PetscSF`
2133 - sfB - The second `PetscSF`
2134 
2135   Output Parameter:
2136 . sfBA - The composite `PetscSF`.
2137 
2138   Level: developer
2139 
2140   Notes:
2141   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
2142   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
2143   second `PetscSF` must have a degree of 1, i.e., no roots have more than one leaf connected.
2144 
2145   `sfA`'s leaf space and `sfB`'s leaf space might be partially overlapped. The composition builds
2146   a graph with `sfA`'s roots and `sfB`'s roots only when there is a path between them. Unconnected
2147   roots are not in `sfBA`. Doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on the new `PetscSF` is equivalent to doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()`
2148   on `sfA`, then
2149   a `PetscSFReduceBegin()`/`PetscSFReduceEnd()` on `sfB`, on connected roots.
2150 
2151 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFCreateInverseSF()`
2152 @*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
{
  const PetscSFNode *remotePointsA, *remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA, *localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL; /* sfA's remote roots re-indexed by sfB's leaf numbering */
  PetscInt           i, numRootsA, numLeavesA, numLeavesBA, numRootsB, numLeavesB, minleaf, maxleaf, *localPointsBA;
  MPI_Op             op;
#if defined(PETSC_USE_64BIT_INDICES)
  PetscBool iswin;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sfA, 1);
  PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
  PetscSFCheckGraphSet(sfB, 2);
  PetscCheckSameComm(sfA, 1, sfB, 2);
  PetscAssertPointer(sfBA, 3);
  /* Both inputs must be true star forests: no leaf connected to more than one root */
  PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
  PetscCall(PetscSFCheckLeavesUnique_Private(sfB));

  PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
  PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));

  /* TODO: Check roots of sfB have degree of 1 */
  /* Once we implement it, we can replace the MPI_MAXLOC
     with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
     We use MPI_MAXLOC only to have a deterministic output from this routine if
     the root condition is not met.
   */
  op = MPI_MAXLOC;
#if defined(PETSC_USE_64BIT_INDICES)
  /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
  PetscCall(PetscObjectTypeCompare((PetscObject)sfB, PETSCSFWINDOW, &iswin));
  if (iswin) op = MPI_REPLACE;
#endif

  /* Densify sfA's remote-root array over sfB's leaf range [minleaf, maxleaf];
     slots with no corresponding sfA leaf keep the (-1,-1) sentinel */
  PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
  PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &reorderedRemotePointsA));
  for (i = 0; i < maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank  = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i = 0; i < numLeavesA; i++) {
      /* skip sfA leaves that fall outside sfB's leaf range; they cannot be composed */
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* localPointsA == NULL means sfA uses contiguous leaf numbering 0..numLeavesA-1 */
    for (i = 0; i < numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  /* One slot per sfB root; roots that receive nothing keep the (-1,-1) sentinel */
  PetscCall(PetscMalloc1(numRootsB, &localPointsBA));
  PetscCall(PetscMalloc1(numRootsB, &remotePointsBA));
  for (i = 0; i < numRootsB; i++) {
    remotePointsBA[i].rank  = -1;
    remotePointsBA[i].index = -1;
  }

  /* Push sfA's root info from sfB's leaves to sfB's roots; the -minleaf offset
     re-bases the dense array so sfB can index it by raw leaf number */
  PetscCall(PetscSFReduceBegin(sfB, MPIU_SF_NODE, PetscSafePointerPlusOffset(reorderedRemotePointsA, -minleaf), remotePointsBA, op));
  PetscCall(PetscSFReduceEnd(sfB, MPIU_SF_NODE, PetscSafePointerPlusOffset(reorderedRemotePointsA, -minleaf), remotePointsBA, op));
  PetscCall(PetscFree(reorderedRemotePointsA));
  /* Compact in place: keep only the connected roots (rank != -1) as leaves of sfBA */
  for (i = 0, numLeavesBA = 0; i < numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank  = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA]        = i;
    numLeavesBA++;
  }
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
  PetscCall(PetscSFSetFromOptions(*sfBA));
  PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2231 
2232 /*
2233   PetscSFCreateLocalSF_Private - Creates a local `PetscSF` that only has intra-process edges of the global `PetscSF`
2234 
2235   Input Parameter:
2236 . sf - The global `PetscSF`
2237 
2238   Output Parameter:
2239 . out - The local `PetscSF`
2240 
2241 .seealso: `PetscSF`, `PetscSFCreate()`
2242  */
2243 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf, PetscSF *out)
2244 {
2245   MPI_Comm           comm;
2246   PetscMPIInt        myrank;
2247   const PetscInt    *ilocal;
2248   const PetscSFNode *iremote;
2249   PetscInt           i, j, nroots, nleaves, lnleaves, *lilocal;
2250   PetscSFNode       *liremote;
2251   PetscSF            lsf;
2252 
2253   PetscFunctionBegin;
2254   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2255   if (sf->ops->CreateLocalSF) PetscUseTypeMethod(sf, CreateLocalSF, out);
2256   else {
2257     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2258     PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
2259     PetscCallMPI(MPI_Comm_rank(comm, &myrank));
2260 
2261     /* Find out local edges and build a local SF */
2262     PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
2263     for (i = lnleaves = 0; i < nleaves; i++) {
2264       if (iremote[i].rank == (PetscInt)myrank) lnleaves++;
2265     }
2266     PetscCall(PetscMalloc1(lnleaves, &lilocal));
2267     PetscCall(PetscMalloc1(lnleaves, &liremote));
2268 
2269     for (i = j = 0; i < nleaves; i++) {
2270       if (iremote[i].rank == (PetscInt)myrank) {
2271         lilocal[j]        = ilocal ? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2272         liremote[j].rank  = 0;                      /* rank in PETSC_COMM_SELF */
2273         liremote[j].index = iremote[i].index;
2274         j++;
2275       }
2276     }
2277     PetscCall(PetscSFCreate(PETSC_COMM_SELF, &lsf));
2278     PetscCall(PetscSFSetFromOptions(lsf));
2279     PetscCall(PetscSFSetGraph(lsf, nroots, lnleaves, lilocal, PETSC_OWN_POINTER, liremote, PETSC_OWN_POINTER));
2280     PetscCall(PetscSFSetUp(lsf));
2281     *out = lsf;
2282   }
2283   PetscFunctionReturn(PETSC_SUCCESS);
2284 }
2285 
2286 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2287 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata)
2288 {
2289   PetscMemType rootmtype, leafmtype;
2290 
2291   PetscFunctionBegin;
2292   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2293   PetscCall(PetscSFSetUp(sf));
2294   PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
2295   PetscCall(PetscGetMemType(rootdata, &rootmtype));
2296   PetscCall(PetscGetMemType(leafdata, &leafmtype));
2297   PetscUseTypeMethod(sf, BcastToZero, unit, rootmtype, rootdata, leafmtype, leafdata);
2298   PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
2299   PetscFunctionReturn(PETSC_SUCCESS);
2300 }
2301 
2302 /*@
2303   PetscSFConcatenate - concatenate multiple `PetscSF` into one
2304 
2305   Input Parameters:
2306 + comm        - the communicator
2307 . nsfs        - the number of input `PetscSF`
2308 . sfs         - the array of input `PetscSF`
2309 . rootMode    - the root mode specifying how roots are handled
2310 - leafOffsets - the array of local leaf offsets, one for each input `PetscSF`, or `NULL` for contiguous storage
2311 
2312   Output Parameter:
2313 . newsf - The resulting `PetscSF`
2314 
2315   Level: advanced
2316 
2317   Notes:
2318   The communicator of all `PetscSF`s in `sfs` must be `comm`.
2319 
2320   Leaves are always concatenated locally, keeping them ordered by the input `PetscSF` index and original local order.
2321 
2322   The offsets in `leafOffsets` are added to the original leaf indices.
2323 
2324   If all input SFs use contiguous leaf storage (`ilocal` = `NULL`), `leafOffsets` can be passed as `NULL` as well.
2325   In this case, `NULL` is also passed as `ilocal` to the resulting `PetscSF`.
2326 
2327   If any input `PetscSF` has non-null `ilocal`, `leafOffsets` is needed to distinguish leaves from different input `PetscSF`s.
2328   In this case, the user is responsible for providing correct offsets so that the resulting leaves are unique (otherwise an error occurs).
2329 
2330   All root modes retain the essential connectivity condition.
2331   If two leaves of the same input `PetscSF` are connected (sharing the same root), they are also connected in the output `PetscSF`.
2332   Parameter `rootMode` controls how the input root spaces are combined.
2333   For `PETSCSF_CONCATENATE_ROOTMODE_SHARED`, the root space is considered the same for each input `PetscSF` (checked in debug mode)
2334   and is also the same in the output `PetscSF`.
2335   For `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, the input root spaces are taken as separate and joined.
2336   `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` joins the root spaces locally;
2337   roots of sfs[0], sfs[1], sfs[2], ... are joined on each rank separately, ordered by input `PetscSF` and original local index, and renumbered contiguously.
2338   `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL` joins the root spaces globally;
2339   roots of sfs[0], sfs[1], sfs[2], ... are joined globally, ordered by input `PetscSF` index and original global index, and renumbered contiguously;
2340   the original root ranks are ignored.
2341   For both `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`,
2342   the output `PetscSF`'s root layout is such that the local number of roots is a sum of the input `PetscSF`'s local numbers of roots on each rank
2343   to keep the load balancing.
2344   However, for `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, roots can move to different ranks.
2345 
2346   Example:
2347   We can use src/vec/is/sf/tests/ex18.c to compare the root modes. By running
2348 .vb
2349   make -C $PETSC_DIR/src/vec/is/sf/tests ex18
2350   for m in {local,global,shared}; do
2351     mpirun -n 2 $PETSC_DIR/src/vec/is/sf/tests/ex18 -nsfs 2 -n 2 -root_mode $m -sf_view
2352   done
2353 .ve
2354   we generate two identical `PetscSF`s sf_0 and sf_1,
2355 .vb
2356   PetscSF Object: sf_0 2 MPI processes
2357     type: basic
2358     rank #leaves #roots
2359     [ 0]       4      2
2360     [ 1]       4      2
2361     leaves      roots       roots in global numbering
2362     ( 0,  0) <- ( 0,  0)  =   0
2363     ( 0,  1) <- ( 0,  1)  =   1
2364     ( 0,  2) <- ( 1,  0)  =   2
2365     ( 0,  3) <- ( 1,  1)  =   3
2366     ( 1,  0) <- ( 0,  0)  =   0
2367     ( 1,  1) <- ( 0,  1)  =   1
2368     ( 1,  2) <- ( 1,  0)  =   2
2369     ( 1,  3) <- ( 1,  1)  =   3
2370 .ve
2371   and pass them to `PetscSFConcatenate()` along with different choices of `rootMode`, yielding different result_sf\:
2372 .vb
2373   rootMode = local:
2374   PetscSF Object: result_sf 2 MPI processes
2375     type: basic
2376     rank #leaves #roots
2377     [ 0]       8      4
2378     [ 1]       8      4
2379     leaves      roots       roots in global numbering
2380     ( 0,  0) <- ( 0,  0)  =   0
2381     ( 0,  1) <- ( 0,  1)  =   1
2382     ( 0,  2) <- ( 1,  0)  =   4
2383     ( 0,  3) <- ( 1,  1)  =   5
2384     ( 0,  4) <- ( 0,  2)  =   2
2385     ( 0,  5) <- ( 0,  3)  =   3
2386     ( 0,  6) <- ( 1,  2)  =   6
2387     ( 0,  7) <- ( 1,  3)  =   7
2388     ( 1,  0) <- ( 0,  0)  =   0
2389     ( 1,  1) <- ( 0,  1)  =   1
2390     ( 1,  2) <- ( 1,  0)  =   4
2391     ( 1,  3) <- ( 1,  1)  =   5
2392     ( 1,  4) <- ( 0,  2)  =   2
2393     ( 1,  5) <- ( 0,  3)  =   3
2394     ( 1,  6) <- ( 1,  2)  =   6
2395     ( 1,  7) <- ( 1,  3)  =   7
2396 
2397   rootMode = global:
2398   PetscSF Object: result_sf 2 MPI processes
2399     type: basic
2400     rank #leaves #roots
2401     [ 0]       8      4
2402     [ 1]       8      4
2403     leaves      roots       roots in global numbering
2404     ( 0,  0) <- ( 0,  0)  =   0
2405     ( 0,  1) <- ( 0,  1)  =   1
2406     ( 0,  2) <- ( 0,  2)  =   2
2407     ( 0,  3) <- ( 0,  3)  =   3
2408     ( 0,  4) <- ( 1,  0)  =   4
2409     ( 0,  5) <- ( 1,  1)  =   5
2410     ( 0,  6) <- ( 1,  2)  =   6
2411     ( 0,  7) <- ( 1,  3)  =   7
2412     ( 1,  0) <- ( 0,  0)  =   0
2413     ( 1,  1) <- ( 0,  1)  =   1
2414     ( 1,  2) <- ( 0,  2)  =   2
2415     ( 1,  3) <- ( 0,  3)  =   3
2416     ( 1,  4) <- ( 1,  0)  =   4
2417     ( 1,  5) <- ( 1,  1)  =   5
2418     ( 1,  6) <- ( 1,  2)  =   6
2419     ( 1,  7) <- ( 1,  3)  =   7
2420 
2421   rootMode = shared:
2422   PetscSF Object: result_sf 2 MPI processes
2423     type: basic
2424     rank #leaves #roots
2425     [ 0]       8      2
2426     [ 1]       8      2
2427     leaves      roots       roots in global numbering
2428     ( 0,  0) <- ( 0,  0)  =   0
2429     ( 0,  1) <- ( 0,  1)  =   1
2430     ( 0,  2) <- ( 1,  0)  =   2
2431     ( 0,  3) <- ( 1,  1)  =   3
2432     ( 0,  4) <- ( 0,  0)  =   0
2433     ( 0,  5) <- ( 0,  1)  =   1
2434     ( 0,  6) <- ( 1,  0)  =   2
2435     ( 0,  7) <- ( 1,  1)  =   3
2436     ( 1,  0) <- ( 0,  0)  =   0
2437     ( 1,  1) <- ( 0,  1)  =   1
2438     ( 1,  2) <- ( 1,  0)  =   2
2439     ( 1,  3) <- ( 1,  1)  =   3
2440     ( 1,  4) <- ( 0,  0)  =   0
2441     ( 1,  5) <- ( 0,  1)  =   1
2442     ( 1,  6) <- ( 1,  0)  =   2
2443     ( 1,  7) <- ( 1,  1)  =   3
2444 .ve
2445 
2446 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFConcatenateRootMode`
2447 @*/
PetscErrorCode PetscSFConcatenate(MPI_Comm comm, PetscInt nsfs, PetscSF sfs[], PetscSFConcatenateRootMode rootMode, PetscInt leafOffsets[], PetscSF *newsf)
{
  PetscInt     i, s, nLeaves, nRoots;
  PetscInt    *leafArrayOffsets; /* prefix sums of per-SF leaf counts; index into the concatenated leaf arrays */
  PetscInt    *ilocal_new;
  PetscSFNode *iremote_new;
  PetscBool    all_ilocal_null = PETSC_FALSE;
  PetscLayout  glayout         = NULL; /* combined root layout, GLOBAL root mode only */
  PetscInt    *gremote         = NULL; /* global root indices of concatenated leaves, GLOBAL root mode only */
  PetscMPIInt  rank, size;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscSF dummy; /* just to have a PetscObject on comm for input validation */

    PetscCall(PetscSFCreate(comm, &dummy));
    PetscValidLogicalCollectiveInt(dummy, nsfs, 2);
    PetscAssertPointer(sfs, 3);
    for (i = 0; i < nsfs; i++) {
      PetscValidHeaderSpecific(sfs[i], PETSCSF_CLASSID, 3);
      PetscCheckSameComm(dummy, 1, sfs[i], 3);
    }
    PetscValidLogicalCollectiveEnum(dummy, rootMode, 4);
    if (leafOffsets) PetscAssertPointer(leafOffsets, 5);
    PetscAssertPointer(newsf, 6);
    PetscCall(PetscSFDestroy(&dummy));
  }
  /* Concatenating zero SFs yields an empty SF */
  if (!nsfs) {
    PetscCall(PetscSFCreate(comm, newsf));
    PetscCall(PetscSFSetGraph(*newsf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size)); /* NOTE(review): size appears unused below -- candidate for removal */

  /* Calculate leaf array offsets */
  PetscCall(PetscMalloc1(nsfs + 1, &leafArrayOffsets));
  leafArrayOffsets[0] = 0;
  for (s = 0; s < nsfs; s++) {
    PetscInt nl;

    PetscCall(PetscSFGetGraph(sfs[s], NULL, &nl, NULL, NULL));
    leafArrayOffsets[s + 1] = leafArrayOffsets[s] + nl;
  }
  nLeaves = leafArrayOffsets[nsfs];

  /* Calculate number of roots */
  switch (rootMode) {
  case PETSCSF_CONCATENATE_ROOTMODE_SHARED: {
    /* All inputs share one root space; take it from sfs[0] and verify the rest agree in debug mode */
    PetscCall(PetscSFGetGraph(sfs[0], &nRoots, NULL, NULL, NULL));
    if (PetscDefined(USE_DEBUG)) {
      for (s = 1; s < nsfs; s++) {
        PetscInt nr;

        PetscCall(PetscSFGetGraph(sfs[s], &nr, NULL, NULL, NULL));
        PetscCheck(nr == nRoots, comm, PETSC_ERR_ARG_SIZ, "rootMode = %s but sfs[%" PetscInt_FMT "] has a different number of roots (%" PetscInt_FMT ") than sfs[0] (%" PetscInt_FMT ")", PetscSFConcatenateRootModes[rootMode], s, nr, nRoots);
      }
    }
  } break;
  case PETSCSF_CONCATENATE_ROOTMODE_GLOBAL: {
    /* Calculate also global layout in this case */
    PetscInt    *nls;
    PetscLayout *lts;
    PetscInt   **inds;
    PetscInt     j;
    PetscInt     rootOffset = 0;

    /* Sum the per-SF root layouts into one combined layout; each SF's global root
       indices are shifted by the accumulated global root count of its predecessors */
    PetscCall(PetscCalloc3(nsfs, &lts, nsfs, &nls, nsfs, &inds));
    PetscCall(PetscLayoutCreate(comm, &glayout));
    glayout->bs = 1;
    glayout->n  = 0;
    glayout->N  = 0;
    for (s = 0; s < nsfs; s++) {
      PetscCall(PetscSFGetGraphLayout(sfs[s], &lts[s], &nls[s], NULL, &inds[s]));
      glayout->n += lts[s]->n;
      glayout->N += lts[s]->N;
    }
    PetscCall(PetscLayoutSetUp(glayout));
    PetscCall(PetscMalloc1(nLeaves, &gremote));
    for (s = 0, j = 0; s < nsfs; s++) {
      for (i = 0; i < nls[s]; i++, j++) gremote[j] = inds[s][i] + rootOffset;
      rootOffset += lts[s]->N;
      PetscCall(PetscLayoutDestroy(&lts[s]));
      PetscCall(PetscFree(inds[s]));
    }
    PetscCall(PetscFree3(lts, nls, inds));
    nRoots = glayout->N;
  } break;
  case PETSCSF_CONCATENATE_ROOTMODE_LOCAL:
    /* nRoots calculated later in this case */
    break;
  default:
    SETERRQ(comm, PETSC_ERR_ARG_WRONG, "Invalid PetscSFConcatenateRootMode %d", rootMode);
  }

  /* leafOffsets may be omitted only when every input uses contiguous leaf storage (ilocal = NULL) */
  if (!leafOffsets) {
    all_ilocal_null = PETSC_TRUE;
    for (s = 0; s < nsfs; s++) {
      const PetscInt *ilocal;

      PetscCall(PetscSFGetGraph(sfs[s], NULL, NULL, &ilocal, NULL));
      if (ilocal) {
        all_ilocal_null = PETSC_FALSE;
        break;
      }
    }
    PetscCheck(all_ilocal_null, PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "leafOffsets can be passed as NULL only if all SFs have ilocal = NULL");
  }

  /* Renumber and concatenate local leaves */
  ilocal_new = NULL; /* stays NULL (contiguous leaves) when all inputs are contiguous */
  if (!all_ilocal_null) {
    /* Here leafOffsets is non-NULL: all_ilocal_null is only false when the user passed offsets */
    PetscCall(PetscMalloc1(nLeaves, &ilocal_new));
    for (i = 0; i < nLeaves; i++) ilocal_new[i] = -1;
    for (s = 0; s < nsfs; s++) {
      const PetscInt *ilocal;
      PetscInt       *ilocal_l = PetscSafePointerPlusOffset(ilocal_new, leafArrayOffsets[s]);
      PetscInt        i, nleaves_l;

      PetscCall(PetscSFGetGraph(sfs[s], NULL, &nleaves_l, &ilocal, NULL));
      for (i = 0; i < nleaves_l; i++) ilocal_l[i] = (ilocal ? ilocal[i] : i) + leafOffsets[s];
    }
  }

  /* Renumber and concatenate remote roots */
  if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL || rootMode == PETSCSF_CONCATENATE_ROOTMODE_SHARED) {
    PetscInt rootOffset = 0;

    PetscCall(PetscMalloc1(nLeaves, &iremote_new));
    for (i = 0; i < nLeaves; i++) {
      iremote_new[i].rank  = -1;
      iremote_new[i].index = -1;
    }
    for (s = 0; s < nsfs; s++) {
      PetscInt           i, nl, nr;
      PetscSF            tmp_sf;
      const PetscSFNode *iremote;
      PetscSFNode       *tmp_rootdata;
      PetscSFNode       *tmp_leafdata = PetscSafePointerPlusOffset(iremote_new, leafArrayOffsets[s]);

      PetscCall(PetscSFGetGraph(sfs[s], &nr, &nl, NULL, &iremote));
      PetscCall(PetscSFCreate(comm, &tmp_sf));
      /* create helper SF with contiguous leaves */
      PetscCall(PetscSFSetGraph(tmp_sf, nr, nl, NULL, PETSC_USE_POINTER, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
      PetscCall(PetscSFSetUp(tmp_sf));
      PetscCall(PetscMalloc1(nr, &tmp_rootdata));
      if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) {
        /* LOCAL: each SF's roots are renumbered after the previous SFs' roots on this rank */
        for (i = 0; i < nr; i++) {
          tmp_rootdata[i].index = i + rootOffset;
          tmp_rootdata[i].rank  = rank;
        }
        rootOffset += nr;
      } else {
        /* SHARED: roots keep their original numbering */
        for (i = 0; i < nr; i++) {
          tmp_rootdata[i].index = i;
          tmp_rootdata[i].rank  = rank;
        }
      }
      /* Broadcast each root's new (rank, index) to its leaves, writing straight into iremote_new */
      PetscCall(PetscSFBcastBegin(tmp_sf, MPIU_SF_NODE, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(tmp_sf, MPIU_SF_NODE, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
      PetscCall(PetscSFDestroy(&tmp_sf));
      PetscCall(PetscFree(tmp_rootdata));
    }
    if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) nRoots = rootOffset; // else nRoots already calculated above

    /* Build the new SF */
    PetscCall(PetscSFCreate(comm, newsf));
    PetscCall(PetscSFSetGraph(*newsf, nRoots, nLeaves, ilocal_new, PETSC_OWN_POINTER, iremote_new, PETSC_OWN_POINTER));
  } else {
    /* Build the new SF */
    PetscCall(PetscSFSetGraphLayout(*newsf, glayout, nLeaves, ilocal_new, PETSC_OWN_POINTER, gremote));
  }
  PetscCall(PetscSFSetUp(*newsf));
  PetscCall(PetscSFViewFromOptions(*newsf, NULL, "-sf_concat_view"));
  PetscCall(PetscLayoutDestroy(&glayout));
  PetscCall(PetscFree(gremote));
  PetscCall(PetscFree(leafArrayOffsets));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2628 
2629 /*@
2630   PetscSFRegisterPersistent - Register root and leaf data as memory regions that will be used for repeated PetscSF communications.
2631 
2632   Collective
2633 
2634   Input Parameters:
2635 + sf       - star forest
2636 . unit     - the data type contained within the root and leaf data
2637 . rootdata - root data that will be used for multiple PetscSF communications
2638 - leafdata - leaf data that will be used for multiple PetscSF communications
2639 
2640   Level: advanced
2641 
2642   Notes:
2643   Implementations of `PetscSF` can make optimizations
2644   for repeated communication using the same memory regions, but these optimizations
2645   can be unsound if `rootdata` or `leafdata` is deallocated and the `PetscSF` is not informed.
2646   The intended pattern is
2647 
2648 .vb
2649   PetscMalloc2(nroots, &rootdata, nleaves, &leafdata);
2650 
2651   PetscSFRegisterPersistent(sf, unit, rootdata, leafdata);
2652   // repeated use of rootdata and leafdata will now be optimized
2653 
2654   PetscSFBcastBegin(sf, unit, rootdata, leafdata, MPI_REPLACE);
2655   PetscSFBcastEnd(sf, unit, rootdata, leafdata, MPI_REPLACE);
2656   // ...
2657   PetscSFReduceBegin(sf, unit, leafdata, rootdata, MPI_SUM);
2658   PetscSFReduceEnd(sf, unit, leafdata, rootdata, MPI_SUM);
2659   // ... (other communications)
2660 
2661   // rootdata and leafdata must be deregistered before freeing
2662   // skipping this can lead to undefined behavior including
2663   // deadlocks
2664   PetscSFDeregisterPersistent(sf, unit, rootdata, leafdata);
2665 
2666   // it is now safe to free rootdata and leafdata
2667   PetscFree2(rootdata, leafdata);
2668 .ve
2669 
2670   If you do not register `rootdata` and `leafdata` it will not cause an error,
2671   but optimizations that reduce the setup time for each communication cannot be
2672   made.  Currently, the only implementation of `PetscSF` that benefits from
2673   `PetscSFRegisterPersistent()` is `PETSCSFWINDOW`.  For the default
2674   `PETSCSFBASIC` there is no benefit to using `PetscSFRegisterPersistent()`.
2675 
2676 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFDeregisterPersistent()`
2677 @*/
PetscErrorCode PetscSFRegisterPersistent(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* Dispatch to the implementation's registration hook if it provides one
     (per the notes above, currently only PETSCSFWINDOW does); otherwise this is a no-op */
  PetscTryMethod(sf, "PetscSFRegisterPersistent_C", (PetscSF, MPI_Datatype, const void *, const void *), (sf, unit, rootdata, leafdata));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2685 
2686 /*@
2687   PetscSFDeregisterPersistent - Signal that repeated usage of root and leaf data for PetscSF communication has concluded.
2688 
2689   Collective
2690 
2691   Input Parameters:
2692 + sf       - star forest
2693 . unit     - the data type contained within the root and leaf data
2694 . rootdata - root data that was previously registered with `PetscSFRegisterPersistent()`
2695 - leafdata - leaf data that was previously registered with `PetscSFRegisterPersistent()`
2696 
2697   Level: advanced
2698 
2699   Note:
2700   See `PetscSFRegisterPersistent()` for when/how to use this function.
2701 
2702 .seealso: `PetscSF`, `PETSCSFWINDOW`, `PetscSFRegisterPersistent()`
2703 @*/
PetscErrorCode PetscSFDeregisterPersistent(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* Dispatch to the implementation's deregistration hook if it provides one;
     otherwise this is a no-op (mirrors PetscSFRegisterPersistent()) */
  PetscTryMethod(sf, "PetscSFDeregisterPersistent_C", (PetscSF, MPI_Datatype, const void *, const void *), (sf, unit, rootdata, leafdata));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2711 
2712 PETSC_INTERN PetscErrorCode PetscSFGetDatatypeSize_Internal(MPI_Comm comm, MPI_Datatype unit, MPI_Aint *size)
2713 {
2714   MPI_Aint lb, lb_true, bytes, bytes_true;
2715 
2716   PetscFunctionBegin;
2717   PetscCallMPI(MPI_Type_get_extent(unit, &lb, &bytes));
2718   PetscCallMPI(MPI_Type_get_true_extent(unit, &lb_true, &bytes_true));
2719   PetscCheck(lb == 0 && lb_true == 0, comm, PETSC_ERR_SUP, "No support for unit type with nonzero lower bound, write petsc-maint@mcs.anl.gov if you want this feature");
2720   *size = bytes;
2721   PetscFunctionReturn(PETSC_SUCCESS);
2722 }
2723