xref: /petsc/src/vec/is/sf/interface/sf.c (revision 9fd2a872fac71d98bdeeea4f800da4f052f7a9fb)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petsc/private/viewerimpl.h>
4 #include <petsc/private/hashmapi.h>
5 
6 #if defined(PETSC_HAVE_CUDA)
7   #include <cuda_runtime.h>
8 #endif
9 
10 #if defined(PETSC_HAVE_HIP)
11   #include <hip/hip_runtime.h>
12 #endif
13 
14 #if defined(PETSC_CLANG_STATIC_ANALYZER)
15 void PetscSFCheckGraphSet(PetscSF, int);
16 #else
17   #if defined(PETSC_USE_DEBUG)
18     #define PetscSFCheckGraphSet(sf, arg) PetscCheck((sf)->graphset, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %d \"%s\" before %s()", (arg), #sf, PETSC_FUNCTION_NAME);
19   #else
20     #define PetscSFCheckGraphSet(sf, arg) \
21       do { \
22       } while (0)
23   #endif
24 #endif
25 
26 const char *const PetscSFDuplicateOptions[]     = {"CONFONLY", "RANKS", "GRAPH", "PetscSFDuplicateOption", "PETSCSF_DUPLICATE_", NULL};
27 const char *const PetscSFConcatenateRootModes[] = {"local", "shared", "global", "PetscSFConcatenateRootMode", "PETSCSF_CONCATENATE_ROOTMODE_", NULL};
28 
29 /*@
30    PetscSFCreate - create a star forest communication context
31 
32    Collective
33 
34    Input Parameter:
35 .  comm - communicator on which the star forest will operate
36 
37    Output Parameter:
38 .  sf - new star forest context
39 
40    Options Database Key:
41 .  -sf_type type - value of type may be
42 .vb
43     basic     -Use MPI persistent Isend/Irecv for communication (Default)
44     window    -Use MPI-3 one-sided window for communication
45     neighbor  -Use MPI-3 neighborhood collectives for communication
46 .ve
47 
48    Level: intermediate
49 
50    Note:
51    When one knows the communication graph is one of the predefined graph, such as `MPI_Alltoall()`, `MPI_Allgatherv()`,
52    `MPI_Gatherv()`, one can create a `PetscSF` and then set its graph with `PetscSFSetGraphWithPattern()`. These special
53    `SF`s are optimized and they have better performance than the general `SF`s.
54 
55 .seealso: `PetscSF`, `PetscSFSetType`, PetscSFSetGraph()`, `PetscSFSetGraphWithPattern()`, `PetscSFDestroy()`
56 @*/
57 PetscErrorCode PetscSFCreate(MPI_Comm comm, PetscSF *sf)
58 {
59   PetscSF b;
60 
61   PetscFunctionBegin;
62   PetscValidPointer(sf, 2);
63   PetscCall(PetscSFInitializePackage());
64 
65   PetscCall(PetscHeaderCreate(b, PETSCSF_CLASSID, "PetscSF", "Star Forest", "PetscSF", comm, PetscSFDestroy, PetscSFView));
66 
67   b->nroots    = -1;
68   b->nleaves   = -1;
69   b->minleaf   = PETSC_MAX_INT;
70   b->maxleaf   = PETSC_MIN_INT;
71   b->nranks    = -1;
72   b->rankorder = PETSC_TRUE;
73   b->ingroup   = MPI_GROUP_NULL;
74   b->outgroup  = MPI_GROUP_NULL;
75   b->graphset  = PETSC_FALSE;
76 #if defined(PETSC_HAVE_DEVICE)
77   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
78   b->use_stream_aware_mpi = PETSC_FALSE;
79   b->unknown_input_stream = PETSC_FALSE;
80   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
81   b->backend = PETSCSF_BACKEND_KOKKOS;
82   #elif defined(PETSC_HAVE_CUDA)
83   b->backend = PETSCSF_BACKEND_CUDA;
84   #elif defined(PETSC_HAVE_HIP)
85   b->backend = PETSCSF_BACKEND_HIP;
86   #endif
87 
88   #if defined(PETSC_HAVE_NVSHMEM)
89   b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
90   b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
91   PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem", &b->use_nvshmem, NULL));
92   PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem_get", &b->use_nvshmem_get, NULL));
93   #endif
94 #endif
95   b->vscat.from_n = -1;
96   b->vscat.to_n   = -1;
97   b->vscat.unit   = MPIU_SCALAR;
98   *sf             = b;
99   PetscFunctionReturn(PETSC_SUCCESS);
100 }
101 
102 /*@
103    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
104 
105    Collective
106 
107    Input Parameter:
108 .  sf - star forest
109 
110    Level: advanced
111 
112 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetGraph()`, `PetscSFDestroy()`
113 @*/
114 PetscErrorCode PetscSFReset(PetscSF sf)
115 {
116   PetscFunctionBegin;
117   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
118   PetscTryTypeMethod(sf, Reset);
119   sf->nroots   = -1;
120   sf->nleaves  = -1;
121   sf->minleaf  = PETSC_MAX_INT;
122   sf->maxleaf  = PETSC_MIN_INT;
123   sf->mine     = NULL;
124   sf->remote   = NULL;
125   sf->graphset = PETSC_FALSE;
126   PetscCall(PetscFree(sf->mine_alloc));
127   PetscCall(PetscFree(sf->remote_alloc));
128   sf->nranks = -1;
129   PetscCall(PetscFree4(sf->ranks, sf->roffset, sf->rmine, sf->rremote));
130   sf->degreeknown = PETSC_FALSE;
131   PetscCall(PetscFree(sf->degree));
132   if (sf->ingroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->ingroup));
133   if (sf->outgroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->outgroup));
134   if (sf->multi) sf->multi->multi = NULL;
135   PetscCall(PetscSFDestroy(&sf->multi));
136   PetscCall(PetscLayoutDestroy(&sf->map));
137 
138 #if defined(PETSC_HAVE_DEVICE)
139   for (PetscInt i = 0; i < 2; i++) PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, sf->rmine_d[i]));
140 #endif
141 
142   sf->setupcalled = PETSC_FALSE;
143   PetscFunctionReturn(PETSC_SUCCESS);
144 }
145 
146 /*@C
147    PetscSFSetType - Set the `PetscSF` communication implementation
148 
149    Collective
150 
151    Input Parameters:
152 +  sf - the `PetscSF` context
153 -  type - a known method
154 .vb
155     PETSCSFWINDOW - MPI-2/3 one-sided
156     PETSCSFBASIC - basic implementation using MPI-1 two-sided
157 .ve
158 
159    Options Database Key:
160 .  -sf_type <type> - Sets the method; for example `basic` or `window` use -help for a list of available methods
161 
162   Level: intermediate
163 
164    Notes:
165    See `PetscSFType` for possible values
166 
167 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`
168 @*/
169 PetscErrorCode PetscSFSetType(PetscSF sf, PetscSFType type)
170 {
171   PetscBool match;
172   PetscErrorCode (*r)(PetscSF);
173 
174   PetscFunctionBegin;
175   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
176   PetscValidCharPointer(type, 2);
177 
178   PetscCall(PetscObjectTypeCompare((PetscObject)sf, type, &match));
179   if (match) PetscFunctionReturn(PETSC_SUCCESS);
180 
181   PetscCall(PetscFunctionListFind(PetscSFList, type, &r));
182   PetscCheck(r, PETSC_COMM_SELF, PETSC_ERR_ARG_UNKNOWN_TYPE, "Unable to find requested PetscSF type %s", type);
183   /* Destroy the previous PetscSF implementation context */
184   PetscTryTypeMethod(sf, Destroy);
185   PetscCall(PetscMemzero(sf->ops, sizeof(*sf->ops)));
186   PetscCall(PetscObjectChangeTypeName((PetscObject)sf, type));
187   PetscCall((*r)(sf));
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 /*@C
192   PetscSFGetType - Get the `PetscSF` communication implementation
193 
194   Not Collective
195 
196   Input Parameter:
197 . sf  - the `PetscSF` context
198 
199   Output Parameter:
200 . type - the `PetscSF` type name
201 
202   Level: intermediate
203 
204 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetType()`, `PetscSFCreate()`
205 @*/
206 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
207 {
208   PetscFunctionBegin;
209   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
210   PetscValidPointer(type, 2);
211   *type = ((PetscObject)sf)->type_name;
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 
215 /*@C
216    PetscSFDestroy - destroy a star forest
217 
218    Collective
219 
220    Input Parameter:
221 .  sf - address of star forest
222 
223    Level: intermediate
224 
225 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFReset()`
226 @*/
227 PetscErrorCode PetscSFDestroy(PetscSF *sf)
228 {
229   PetscFunctionBegin;
230   if (!*sf) PetscFunctionReturn(PETSC_SUCCESS);
231   PetscValidHeaderSpecific((*sf), PETSCSF_CLASSID, 1);
232   if (--((PetscObject)(*sf))->refct > 0) {
233     *sf = NULL;
234     PetscFunctionReturn(PETSC_SUCCESS);
235   }
236   PetscCall(PetscSFReset(*sf));
237   PetscTryTypeMethod((*sf), Destroy);
238   PetscCall(PetscSFDestroy(&(*sf)->vscat.lsf));
239   if ((*sf)->vscat.bs > 1) PetscCallMPI(MPI_Type_free(&(*sf)->vscat.unit));
240   PetscCall(PetscHeaderDestroy(sf));
241   PetscFunctionReturn(PETSC_SUCCESS);
242 }
243 
244 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
245 {
246   PetscInt           i, nleaves;
247   PetscMPIInt        size;
248   const PetscInt    *ilocal;
249   const PetscSFNode *iremote;
250 
251   PetscFunctionBegin;
252   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(PETSC_SUCCESS);
253   PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, &iremote));
254   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
255   for (i = 0; i < nleaves; i++) {
256     const PetscInt rank   = iremote[i].rank;
257     const PetscInt remote = iremote[i].index;
258     const PetscInt leaf   = ilocal ? ilocal[i] : i;
259     PetscCheck(rank >= 0 && rank < size, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided rank (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be in [0, %d)", rank, i, size);
260     PetscCheck(remote >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided index (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be >= 0", remote, i);
261     PetscCheck(leaf >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided location (%" PetscInt_FMT ") for leaf %" PetscInt_FMT " is invalid, should be >= 0", leaf, i);
262   }
263   PetscFunctionReturn(PETSC_SUCCESS);
264 }
265 
266 /*@
267    PetscSFSetUp - set up communication structures for a `PetscSF`, after this is done it may be used to perform communication
268 
269    Collective
270 
271    Input Parameter:
272 .  sf - star forest communication object
273 
274    Level: beginner
275 
276 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetFromOptions()`, `PetscSFSetType()`
277 @*/
278 PetscErrorCode PetscSFSetUp(PetscSF sf)
279 {
280   PetscFunctionBegin;
281   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
282   PetscSFCheckGraphSet(sf, 1);
283   if (sf->setupcalled) PetscFunctionReturn(PETSC_SUCCESS);
284   PetscCall(PetscLogEventBegin(PETSCSF_SetUp, sf, 0, 0, 0));
285   PetscCall(PetscSFCheckGraphValid_Private(sf));
286   if (!((PetscObject)sf)->type_name) PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); /* Zero all sf->ops */
287   PetscTryTypeMethod(sf, SetUp);
288 #if defined(PETSC_HAVE_CUDA)
289   if (sf->backend == PETSCSF_BACKEND_CUDA) {
290     sf->ops->Malloc = PetscSFMalloc_CUDA;
291     sf->ops->Free   = PetscSFFree_CUDA;
292   }
293 #endif
294 #if defined(PETSC_HAVE_HIP)
295   if (sf->backend == PETSCSF_BACKEND_HIP) {
296     sf->ops->Malloc = PetscSFMalloc_HIP;
297     sf->ops->Free   = PetscSFFree_HIP;
298   }
299 #endif
300 
301 #
302 #if defined(PETSC_HAVE_KOKKOS)
303   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
304     sf->ops->Malloc = PetscSFMalloc_Kokkos;
305     sf->ops->Free   = PetscSFFree_Kokkos;
306   }
307 #endif
308   PetscCall(PetscLogEventEnd(PETSCSF_SetUp, sf, 0, 0, 0));
309   sf->setupcalled = PETSC_TRUE;
310   PetscFunctionReturn(PETSC_SUCCESS);
311 }
312 
313 /*@
314    PetscSFSetFromOptions - set `PetscSF` options using the options database
315 
316    Logically Collective
317 
318    Input Parameter:
319 .  sf - star forest
320 
321    Options Database Keys:
322 +  -sf_type               - implementation type, see `PetscSFSetType()`
323 .  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
324 .  -sf_use_default_stream - Assume callers of `PetscSF` computed the input root/leafdata with the default CUDA stream. `PetscSF` will also
325                             use the default stream to process data. Therefore, no stream synchronization is needed between `PetscSF` and its caller (default: true).
326                             If true, this option only works with `-use_gpu_aware_mpi 1`.
327 .  -sf_use_stream_aware_mpi  - Assume the underlying MPI is CUDA-stream aware and `PetscSF` won't sync streams for send/recv buffers passed to MPI (default: false).
328                                If true, this option only works with `-use_gpu_aware_mpi 1`.
329 
330 -  -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently `PetscSF` has these backends: cuda, hip and Kokkos.
331                               On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
332                               the only available is kokkos.
333 
334    Level: intermediate
335 
336 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetType()`
337 @*/
338 PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
339 {
340   PetscSFType deft;
341   char        type[256];
342   PetscBool   flg;
343 
344   PetscFunctionBegin;
345   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
346   PetscObjectOptionsBegin((PetscObject)sf);
347   deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
348   PetscCall(PetscOptionsFList("-sf_type", "PetscSF implementation type", "PetscSFSetType", PetscSFList, deft, type, sizeof(type), &flg));
349   PetscCall(PetscSFSetType(sf, flg ? type : deft));
350   PetscCall(PetscOptionsBool("-sf_rank_order", "sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise", "PetscSFSetRankOrder", sf->rankorder, &sf->rankorder, NULL));
351 #if defined(PETSC_HAVE_DEVICE)
352   {
353     char      backendstr[32] = {0};
354     PetscBool isCuda = PETSC_FALSE, isHip = PETSC_FALSE, isKokkos = PETSC_FALSE, set;
355     /* Change the defaults set in PetscSFCreate() with command line options */
356     PetscCall(PetscOptionsBool("-sf_unknown_input_stream", "SF root/leafdata is computed on arbitrary streams unknown to SF", "PetscSFSetFromOptions", sf->unknown_input_stream, &sf->unknown_input_stream, NULL));
357     PetscCall(PetscOptionsBool("-sf_use_stream_aware_mpi", "Assume the underlying MPI is cuda-stream aware", "PetscSFSetFromOptions", sf->use_stream_aware_mpi, &sf->use_stream_aware_mpi, NULL));
358     PetscCall(PetscOptionsString("-sf_backend", "Select the device backend SF uses", "PetscSFSetFromOptions", NULL, backendstr, sizeof(backendstr), &set));
359     PetscCall(PetscStrcasecmp("cuda", backendstr, &isCuda));
360     PetscCall(PetscStrcasecmp("kokkos", backendstr, &isKokkos));
361     PetscCall(PetscStrcasecmp("hip", backendstr, &isHip));
362   #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
363     if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
364     else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
365     else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
366     else PetscCheck(!set, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
367   #elif defined(PETSC_HAVE_KOKKOS)
368     PetscCheck(!set || isKokkos, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You can only choose kokkos", backendstr);
369   #endif
370   }
371 #endif
372   PetscTryTypeMethod(sf, SetFromOptions, PetscOptionsObject);
373   PetscOptionsEnd();
374   PetscFunctionReturn(PETSC_SUCCESS);
375 }
376 
377 /*@
378    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
379 
380    Logically Collective
381 
382    Input Parameters:
383 +  sf - star forest
384 -  flg - `PETSC_TRUE` to sort, `PETSC_FALSE` to skip sorting (lower setup cost, but non-deterministic)
385 
386    Level: advanced
387 
388 .seealso: `PetscSF`, `PetscSFType`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`
389 @*/
390 PetscErrorCode PetscSFSetRankOrder(PetscSF sf, PetscBool flg)
391 {
392   PetscFunctionBegin;
393   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
394   PetscValidLogicalCollectiveBool(sf, flg, 2);
395   PetscCheck(!sf->multi, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
396   sf->rankorder = flg;
397   PetscFunctionReturn(PETSC_SUCCESS);
398 }
399 
400 /*@C
401    PetscSFSetGraph - Set a parallel star forest
402 
403    Collective
404 
405    Input Parameters:
406 +  sf - star forest
407 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
408 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
409 .  ilocal - locations of leaves in leafdata buffers, pass `NULL` for contiguous storage (locations must be >= 0, enforced
410 during setup in debug mode)
411 .  localmode - copy mode for `ilocal`
412 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
413 during setup in debug mode)
414 -  remotemode - copy mode for `iremote`
415 
416    Level: intermediate
417 
418    Notes:
419    Leaf indices in `ilocal` must be unique, otherwise an error occurs.
420 
421    Input arrays `ilocal` and `iremote` follow the `PetscCopyMode` semantics.
422    In particular, if `localmode` or `remotemode` is `PETSC_OWN_POINTER` or `PETSC_USE_POINTER`,
423    PETSc might modify the respective array;
424    if `PETSC_USE_POINTER`, the user must delete the array after `PetscSFDestroy()`.
425    Only if `PETSC_COPY_VALUES` is used, the respective array is guaranteed to stay intact and a const array can be passed (but a cast to non-const is needed).
426 
427    Fortran Note:
428    In Fortran you must use `PETSC_COPY_VALUES` for `localmode` and `remotemode`.
429 
430    Developer Note:
431    We sort leaves to check for duplicates and contiguousness and to find minleaf/maxleaf.
432    This also allows to compare leaf sets of two `PetscSF`s easily.
433 
434 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
435 @*/
436 PetscErrorCode PetscSFSetGraph(PetscSF sf, PetscInt nroots, PetscInt nleaves, PetscInt *ilocal, PetscCopyMode localmode, PetscSFNode *iremote, PetscCopyMode remotemode)
437 {
438   PetscBool unique, contiguous;
439 
440   PetscFunctionBegin;
441   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
442   if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal, 4);
443   if (nleaves > 0) PetscValidPointer(iremote, 6);
444   PetscCheck(nroots >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nroots %" PetscInt_FMT ", cannot be negative", nroots);
445   PetscCheck(nleaves >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nleaves %" PetscInt_FMT ", cannot be negative", nleaves);
446   /* enums may be handled as unsigned by some compilers, NVHPC for example, the int cast
447    * below is to prevent NVHPC from warning about meaningless comparison of unsigned with zero */
448   PetscCheck((int)localmode >= PETSC_COPY_VALUES && localmode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong localmode %d", localmode);
449   PetscCheck((int)remotemode >= PETSC_COPY_VALUES && remotemode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong remotemode %d", remotemode);
450 
451   if (sf->nroots >= 0) { /* Reset only if graph already set */
452     PetscCall(PetscSFReset(sf));
453   }
454 
455   PetscCall(PetscLogEventBegin(PETSCSF_SetGraph, sf, 0, 0, 0));
456 
457   sf->nroots  = nroots;
458   sf->nleaves = nleaves;
459 
460   if (localmode == PETSC_COPY_VALUES && ilocal) {
461     PetscInt *tlocal = NULL;
462 
463     PetscCall(PetscMalloc1(nleaves, &tlocal));
464     PetscCall(PetscArraycpy(tlocal, ilocal, nleaves));
465     ilocal = tlocal;
466   }
467   if (remotemode == PETSC_COPY_VALUES) {
468     PetscSFNode *tremote = NULL;
469 
470     PetscCall(PetscMalloc1(nleaves, &tremote));
471     PetscCall(PetscArraycpy(tremote, iremote, nleaves));
472     iremote = tremote;
473   }
474 
475   if (nleaves && ilocal) {
476     PetscSFNode work;
477 
478     PetscCall(PetscSortIntWithDataArray(nleaves, ilocal, iremote, sizeof(PetscSFNode), &work));
479     PetscCall(PetscSortedCheckDupsInt(nleaves, ilocal, &unique));
480     unique = PetscNot(unique);
481     PetscCheck(sf->allow_multi_leaves || unique, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Input ilocal has duplicate entries which is not allowed for this PetscSF");
482     sf->minleaf = ilocal[0];
483     sf->maxleaf = ilocal[nleaves - 1];
484     contiguous  = (PetscBool)(unique && ilocal[0] == 0 && ilocal[nleaves - 1] == nleaves - 1);
485   } else {
486     sf->minleaf = 0;
487     sf->maxleaf = nleaves - 1;
488     unique      = PETSC_TRUE;
489     contiguous  = PETSC_TRUE;
490   }
491 
492   if (contiguous) {
493     if (localmode == PETSC_USE_POINTER) {
494       ilocal = NULL;
495     } else {
496       PetscCall(PetscFree(ilocal));
497     }
498   }
499   sf->mine = ilocal;
500   if (localmode == PETSC_USE_POINTER) {
501     sf->mine_alloc = NULL;
502   } else {
503     sf->mine_alloc = ilocal;
504   }
505   sf->remote = iremote;
506   if (remotemode == PETSC_USE_POINTER) {
507     sf->remote_alloc = NULL;
508   } else {
509     sf->remote_alloc = iremote;
510   }
511   PetscCall(PetscLogEventEnd(PETSCSF_SetGraph, sf, 0, 0, 0));
512   sf->graphset = PETSC_TRUE;
513   PetscFunctionReturn(PETSC_SUCCESS);
514 }
515 
516 /*@
517   PetscSFSetGraphWithPattern - Sets the graph of a `PetscSF` with a specific pattern
518 
519   Collective
520 
521   Input Parameters:
522 + sf      - The `PetscSF`
523 . map     - Layout of roots over all processes (insignificant when pattern is `PETSCSF_PATTERN_ALLTOALL`)
524 - pattern - One of `PETSCSF_PATTERN_ALLGATHER`, `PETSCSF_PATTERN_GATHER`, `PETSCSF_PATTERN_ALLTOALL`
525 
526   Level: intermediate
527 
528   Notes:
529   It is easier to explain `PetscSFPattern` using vectors. Suppose we have an MPI vector `x` and its `PetscLayout` is `map`.
530   `n` and `N` are the local and global sizes of `x` respectively.
531 
532   With `PETSCSF_PATTERN_ALLGATHER`, the routine creates a graph that if one does `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on it, it will copy `x` to
533   sequential vectors `y` on all MPI processes.
534 
535   With `PETSCSF_PATTERN_GATHER`, the routine creates a graph that if one does `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on it, it will copy `x` to a
536   sequential vector `y` on rank 0.
537 
538   In above cases, entries of `x` are roots and entries of `y` are leaves.
539 
540   With `PETSCSF_PATTERN_ALLTOALL`, map is insignificant. Suppose NP is size of `sf`'s communicator. The routine
541   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
542   of rank j. Here 0 <=i,j<NP. It is a kind of `MPI_Alltoall()` with sendcount/recvcount being 1. Note that it does
543   not mean one can not send multiple items. One just needs to create a new MPI datatype for the mulptiple data
544   items with `MPI_Type_contiguous` and use that as the <unit> argument in SF routines.
545 
546   In this case, roots and leaves are symmetric.
547 
548 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
549  @*/
550 PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf, PetscLayout map, PetscSFPattern pattern)
551 {
552   MPI_Comm    comm;
553   PetscInt    n, N, res[2];
554   PetscMPIInt rank, size;
555   PetscSFType type;
556 
557   PetscFunctionBegin;
558   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
559   if (pattern != PETSCSF_PATTERN_ALLTOALL) PetscValidPointer(map, 2);
560   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
561   PetscCheck(pattern >= PETSCSF_PATTERN_ALLGATHER && pattern <= PETSCSF_PATTERN_ALLTOALL, comm, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported PetscSFPattern %d", pattern);
562   PetscCallMPI(MPI_Comm_rank(comm, &rank));
563   PetscCallMPI(MPI_Comm_size(comm, &size));
564 
565   if (pattern == PETSCSF_PATTERN_ALLTOALL) {
566     type = PETSCSFALLTOALL;
567     PetscCall(PetscLayoutCreate(comm, &sf->map));
568     PetscCall(PetscLayoutSetLocalSize(sf->map, size));
569     PetscCall(PetscLayoutSetSize(sf->map, ((PetscInt)size) * size));
570     PetscCall(PetscLayoutSetUp(sf->map));
571   } else {
572     PetscCall(PetscLayoutGetLocalSize(map, &n));
573     PetscCall(PetscLayoutGetSize(map, &N));
574     res[0] = n;
575     res[1] = -n;
576     /* Check if n are same over all ranks so that we can optimize it */
577     PetscCall(MPIU_Allreduce(MPI_IN_PLACE, res, 2, MPIU_INT, MPI_MAX, comm));
578     if (res[0] == -res[1]) { /* same n */
579       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER : PETSCSFGATHER;
580     } else {
581       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
582     }
583     PetscCall(PetscLayoutReference(map, &sf->map));
584   }
585   PetscCall(PetscSFSetType(sf, type));
586 
587   sf->pattern = pattern;
588   sf->mine    = NULL; /* Contiguous */
589 
590   /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
591      Also set other easy stuff.
592    */
593   if (pattern == PETSCSF_PATTERN_ALLGATHER) {
594     sf->nleaves = N;
595     sf->nroots  = n;
596     sf->nranks  = size;
597     sf->minleaf = 0;
598     sf->maxleaf = N - 1;
599   } else if (pattern == PETSCSF_PATTERN_GATHER) {
600     sf->nleaves = rank ? 0 : N;
601     sf->nroots  = n;
602     sf->nranks  = rank ? 0 : size;
603     sf->minleaf = 0;
604     sf->maxleaf = rank ? -1 : N - 1;
605   } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
606     sf->nleaves = size;
607     sf->nroots  = size;
608     sf->nranks  = size;
609     sf->minleaf = 0;
610     sf->maxleaf = size - 1;
611   }
612   sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
613   sf->graphset = PETSC_TRUE;
614   PetscFunctionReturn(PETSC_SUCCESS);
615 }
616 
617 /*@
618    PetscSFCreateInverseSF - given a `PetscSF` in which all vertices have degree 1, creates the inverse map
619 
620    Collective
621 
622    Input Parameter:
623 .  sf - star forest to invert
624 
625    Output Parameter:
626 .  isf - inverse of `sf`
627 
628    Level: advanced
629 
630    Notes:
631    All roots must have degree 1.
632 
633    The local space may be a permutation, but cannot be sparse.
634 
635 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetGraph()`
636 @*/
637 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf, PetscSF *isf)
638 {
639   PetscMPIInt     rank;
640   PetscInt        i, nroots, nleaves, maxlocal, count, *newilocal;
641   const PetscInt *ilocal;
642   PetscSFNode    *roots, *leaves;
643 
644   PetscFunctionBegin;
645   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
646   PetscSFCheckGraphSet(sf, 1);
647   PetscValidPointer(isf, 2);
648 
649   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, NULL));
650   maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
651 
652   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
653   PetscCall(PetscMalloc2(nroots, &roots, maxlocal, &leaves));
654   for (i = 0; i < maxlocal; i++) {
655     leaves[i].rank  = rank;
656     leaves[i].index = i;
657   }
658   for (i = 0; i < nroots; i++) {
659     roots[i].rank  = -1;
660     roots[i].index = -1;
661   }
662   PetscCall(PetscSFReduceBegin(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
663   PetscCall(PetscSFReduceEnd(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
664 
665   /* Check whether our leaves are sparse */
666   for (i = 0, count = 0; i < nroots; i++)
667     if (roots[i].rank >= 0) count++;
668   if (count == nroots) newilocal = NULL;
669   else { /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */ PetscCall(PetscMalloc1(count, &newilocal));
670     for (i = 0, count = 0; i < nroots; i++) {
671       if (roots[i].rank >= 0) {
672         newilocal[count]   = i;
673         roots[count].rank  = roots[i].rank;
674         roots[count].index = roots[i].index;
675         count++;
676       }
677     }
678   }
679 
680   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, isf));
681   PetscCall(PetscSFSetGraph(*isf, maxlocal, count, newilocal, PETSC_OWN_POINTER, roots, PETSC_COPY_VALUES));
682   PetscCall(PetscFree2(roots, leaves));
683   PetscFunctionReturn(PETSC_SUCCESS);
684 }
685 
686 /*@
687    PetscSFDuplicate - duplicate a `PetscSF`, optionally preserving rank connectivity and graph
688 
689    Collective
690 
691    Input Parameters:
692 +  sf - communication object to duplicate
693 -  opt - `PETSCSF_DUPLICATE_CONFONLY`, `PETSCSF_DUPLICATE_RANKS`, or `PETSCSF_DUPLICATE_GRAPH` (see `PetscSFDuplicateOption`)
694 
695    Output Parameter:
696 .  newsf - new communication object
697 
698    Level: beginner
699 
700 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFSetType()`, `PetscSFSetGraph()`
701 @*/
702 PetscErrorCode PetscSFDuplicate(PetscSF sf, PetscSFDuplicateOption opt, PetscSF *newsf)
703 {
704   PetscSFType  type;
705   MPI_Datatype dtype = MPIU_SCALAR;
706 
707   PetscFunctionBegin;
708   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
709   PetscValidLogicalCollectiveEnum(sf, opt, 2);
710   PetscValidPointer(newsf, 3);
711   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sf), newsf));
712   PetscCall(PetscSFGetType(sf, &type));
713   if (type) PetscCall(PetscSFSetType(*newsf, type));
714   (*newsf)->allow_multi_leaves = sf->allow_multi_leaves; /* Dup this flag earlier since PetscSFSetGraph() below checks on this flag */
715   if (opt == PETSCSF_DUPLICATE_GRAPH) {
716     PetscSFCheckGraphSet(sf, 1);
717     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
718       PetscInt           nroots, nleaves;
719       const PetscInt    *ilocal;
720       const PetscSFNode *iremote;
721       PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
722       PetscCall(PetscSFSetGraph(*newsf, nroots, nleaves, (PetscInt *)ilocal, PETSC_COPY_VALUES, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
723     } else {
724       PetscCall(PetscSFSetGraphWithPattern(*newsf, sf->map, sf->pattern));
725     }
726   }
727   /* Since oldtype is committed, so is newtype, according to MPI */
728   if (sf->vscat.bs > 1) PetscCallMPI(MPI_Type_dup(sf->vscat.unit, &dtype));
729   (*newsf)->vscat.bs     = sf->vscat.bs;
730   (*newsf)->vscat.unit   = dtype;
731   (*newsf)->vscat.to_n   = sf->vscat.to_n;
732   (*newsf)->vscat.from_n = sf->vscat.from_n;
733   /* Do not copy lsf. Build it on demand since it is rarely used */
734 
735 #if defined(PETSC_HAVE_DEVICE)
736   (*newsf)->backend              = sf->backend;
737   (*newsf)->unknown_input_stream = sf->unknown_input_stream;
738   (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
739   (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
740 #endif
741   PetscTryTypeMethod(sf, Duplicate, opt, *newsf);
742   /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
743   PetscFunctionReturn(PETSC_SUCCESS);
744 }
745 
746 /*@C
747    PetscSFGetGraph - Get the graph specifying a parallel star forest
748 
749    Not Collective
750 
751    Input Parameter:
752 .  sf - star forest
753 
754    Output Parameters:
755 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
756 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
757 .  ilocal - locations of leaves in leafdata buffers (if returned value is `NULL`, it means leaves are in contiguous storage)
758 -  iremote - remote locations of root vertices for each leaf on the current process
759 
760    Level: intermediate
761 
762    Notes:
763      We are not currently requiring that the graph is set, thus returning `nroots` = -1 if it has not been set yet
764 
765      The returned `ilocal` and `iremote` might contain values in different order than the input ones in `PetscSFSetGraph()`
766 
767    Fortran Notes:
768      The returned `iremote` array is a copy and must be deallocated after use. Consequently, if you
769      want to update the graph, you must call `PetscSFSetGraph()` after modifying the `iremote` array.
770 
771      To check for a `NULL` `ilocal` use
772 $      if (loc(ilocal) == loc(PETSC_NULL_INTEGER)) then
773 
774 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`
775 @*/
776 PetscErrorCode PetscSFGetGraph(PetscSF sf, PetscInt *nroots, PetscInt *nleaves, const PetscInt **ilocal, const PetscSFNode **iremote)
777 {
778   PetscFunctionBegin;
779   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
780   if (sf->ops->GetGraph) {
781     PetscCall((sf->ops->GetGraph)(sf, nroots, nleaves, ilocal, iremote));
782   } else {
783     if (nroots) *nroots = sf->nroots;
784     if (nleaves) *nleaves = sf->nleaves;
785     if (ilocal) *ilocal = sf->mine;
786     if (iremote) *iremote = sf->remote;
787   }
788   PetscFunctionReturn(PETSC_SUCCESS);
789 }
790 
791 /*@
792    PetscSFGetLeafRange - Get the active leaf ranges
793 
794    Not Collective
795 
796    Input Parameter:
797 .  sf - star forest
798 
799    Output Parameters:
800 +  minleaf - minimum active leaf on this process. Returns 0 if there are no leaves.
801 -  maxleaf - maximum active leaf on this process. Returns -1 if there are no leaves.
802 
803    Level: developer
804 
805 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
806 @*/
807 PetscErrorCode PetscSFGetLeafRange(PetscSF sf, PetscInt *minleaf, PetscInt *maxleaf)
808 {
809   PetscFunctionBegin;
810   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
811   PetscSFCheckGraphSet(sf, 1);
812   if (minleaf) *minleaf = sf->minleaf;
813   if (maxleaf) *maxleaf = sf->maxleaf;
814   PetscFunctionReturn(PETSC_SUCCESS);
815 }
816 
817 /*@C
818    PetscSFViewFromOptions - View a `PetscSF` based on arguments in the options database
819 
820    Collective on A
821 
822    Input Parameters:
823 +  A - the star forest
824 .  obj - Optional object that provides the prefix for the option names
825 -  name - command line option
826 
827    Level: intermediate
828 
829   Note:
830   See `PetscObjectViewFromOptions()` for possible `PetscViewer` and `PetscViewerFormat`
831 
832 .seealso: `PetscSF`, `PetscSFView`, `PetscObjectViewFromOptions()`, `PetscSFCreate()`
833 @*/
834 PetscErrorCode PetscSFViewFromOptions(PetscSF A, PetscObject obj, const char name[])
835 {
836   PetscFunctionBegin;
837   PetscValidHeaderSpecific(A, PETSCSF_CLASSID, 1);
838   PetscCall(PetscObjectViewFromOptions((PetscObject)A, obj, name));
839   PetscFunctionReturn(PETSC_SUCCESS);
840 }
841 
842 /*@C
843    PetscSFView - view a star forest
844 
845    Collective
846 
847    Input Parameters:
848 +  sf - star forest
849 -  viewer - viewer to display graph, for example `PETSC_VIEWER_STDOUT_WORLD`
850 
851    Level: beginner
852 
853 .seealso: `PetscSF`, `PetscViewer`, `PetscSFCreate()`, `PetscSFSetGraph()`
854 @*/
855 PetscErrorCode PetscSFView(PetscSF sf, PetscViewer viewer)
856 {
857   PetscBool         iascii;
858   PetscViewerFormat format;
859 
860   PetscFunctionBegin;
861   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
862   if (!viewer) PetscCall(PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf), &viewer));
863   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
864   PetscCheckSameComm(sf, 1, viewer, 2);
865   if (sf->graphset) PetscCall(PetscSFSetUp(sf));
866   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
867   if (iascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) {
868     PetscMPIInt rank;
869     PetscInt    ii, i, j;
870 
871     PetscCall(PetscObjectPrintClassNamePrefixType((PetscObject)sf, viewer));
872     PetscCall(PetscViewerASCIIPushTab(viewer));
873     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
874       if (!sf->graphset) {
875         PetscCall(PetscViewerASCIIPrintf(viewer, "PetscSFSetGraph() has not been called yet\n"));
876         PetscCall(PetscViewerASCIIPopTab(viewer));
877         PetscFunctionReturn(PETSC_SUCCESS);
878       }
879       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
880       PetscCall(PetscViewerASCIIPushSynchronized(viewer));
881       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Number of roots=%" PetscInt_FMT ", leaves=%" PetscInt_FMT ", remote ranks=%" PetscInt_FMT "\n", rank, sf->nroots, sf->nleaves, sf->nranks));
882       for (i = 0; i < sf->nleaves; i++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %" PetscInt_FMT " <- (%" PetscInt_FMT ",%" PetscInt_FMT ")\n", rank, sf->mine ? sf->mine[i] : i, sf->remote[i].rank, sf->remote[i].index));
883       PetscCall(PetscViewerFlush(viewer));
884       PetscCall(PetscViewerGetFormat(viewer, &format));
885       if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
886         PetscMPIInt *tmpranks, *perm;
887         PetscCall(PetscMalloc2(sf->nranks, &tmpranks, sf->nranks, &perm));
888         PetscCall(PetscArraycpy(tmpranks, sf->ranks, sf->nranks));
889         for (i = 0; i < sf->nranks; i++) perm[i] = i;
890         PetscCall(PetscSortMPIIntWithArray(sf->nranks, tmpranks, perm));
891         PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Roots referenced by my leaves, by rank\n", rank));
892         for (ii = 0; ii < sf->nranks; ii++) {
893           i = perm[ii];
894           PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %d: %" PetscInt_FMT " edges\n", rank, sf->ranks[i], sf->roffset[i + 1] - sf->roffset[i]));
895           for (j = sf->roffset[i]; j < sf->roffset[i + 1]; j++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d]    %" PetscInt_FMT " <- %" PetscInt_FMT "\n", rank, sf->rmine[j], sf->rremote[j]));
896         }
897         PetscCall(PetscFree2(tmpranks, perm));
898       }
899       PetscCall(PetscViewerFlush(viewer));
900       PetscCall(PetscViewerASCIIPopSynchronized(viewer));
901     }
902     PetscCall(PetscViewerASCIIPopTab(viewer));
903   }
904   PetscTryTypeMethod(sf, View, viewer);
905   PetscFunctionReturn(PETSC_SUCCESS);
906 }
907 
908 /*@C
909    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
910 
911    Not Collective
912 
913    Input Parameter:
914 .  sf - star forest
915 
916    Output Parameters:
917 +  nranks - number of ranks referenced by local part
918 .  ranks - [`nranks`] array of ranks
919 .  roffset - [`nranks`+1] offset in `rmine`/`rremote` for each rank
920 .  rmine - [`roffset`[`nranks`]] concatenated array holding local indices referencing each remote rank
921 -  rremote - [`roffset`[`nranks`]] concatenated array holding remote indices referenced for each remote rank
922 
923    Level: developer
924 
925 .seealso: `PetscSF`, `PetscSFGetLeafRanks()`
926 @*/
927 PetscErrorCode PetscSFGetRootRanks(PetscSF sf, PetscInt *nranks, const PetscMPIInt **ranks, const PetscInt **roffset, const PetscInt **rmine, const PetscInt **rremote)
928 {
929   PetscFunctionBegin;
930   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
931   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
932   if (sf->ops->GetRootRanks) {
933     PetscCall((sf->ops->GetRootRanks)(sf, nranks, ranks, roffset, rmine, rremote));
934   } else {
935     /* The generic implementation */
936     if (nranks) *nranks = sf->nranks;
937     if (ranks) *ranks = sf->ranks;
938     if (roffset) *roffset = sf->roffset;
939     if (rmine) *rmine = sf->rmine;
940     if (rremote) *rremote = sf->rremote;
941   }
942   PetscFunctionReturn(PETSC_SUCCESS);
943 }
944 
945 /*@C
946    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
947 
948    Not Collective
949 
950    Input Parameter:
951 .  sf - star forest
952 
953    Output Parameters:
954 +  niranks - number of leaf ranks referencing roots on this process
955 .  iranks - [`niranks`] array of ranks
956 .  ioffset - [`niranks`+1] offset in `irootloc` for each rank
957 -  irootloc - [`ioffset`[`niranks`]] concatenated array holding local indices of roots referenced by each leaf rank
958 
959    Level: developer
960 
961 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
962 @*/
963 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf, PetscInt *niranks, const PetscMPIInt **iranks, const PetscInt **ioffset, const PetscInt **irootloc)
964 {
965   PetscFunctionBegin;
966   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
967   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
968   if (sf->ops->GetLeafRanks) {
969     PetscCall((sf->ops->GetLeafRanks)(sf, niranks, iranks, ioffset, irootloc));
970   } else {
971     PetscSFType type;
972     PetscCall(PetscSFGetType(sf, &type));
973     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
974   }
975   PetscFunctionReturn(PETSC_SUCCESS);
976 }
977 
978 static PetscBool InList(PetscMPIInt needle, PetscMPIInt n, const PetscMPIInt *list)
979 {
980   PetscInt i;
981   for (i = 0; i < n; i++) {
982     if (needle == list[i]) return PETSC_TRUE;
983   }
984   return PETSC_FALSE;
985 }
986 
987 /*@C
988    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by `PetscSF` implementations.
989 
990    Collective
991 
992    Input Parameters:
993 +  sf - `PetscSF` to set up; `PetscSFSetGraph()` must have been called
994 -  dgroup - `MPI_Group` of ranks to be distinguished (e.g., for self or shared memory exchange)
995 
996    Level: developer
997 
998 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
999 @*/
1000 PetscErrorCode PetscSFSetUpRanks(PetscSF sf, MPI_Group dgroup)
1001 {
1002   PetscHMapI    table;
1003   PetscHashIter pos;
1004   PetscMPIInt   size, groupsize, *groupranks;
1005   PetscInt     *rcount, *ranks;
1006   PetscInt      i, irank = -1, orank = -1;
1007 
1008   PetscFunctionBegin;
1009   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1010   PetscSFCheckGraphSet(sf, 1);
1011   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
1012   PetscCall(PetscHMapICreateWithSize(10, &table));
1013   for (i = 0; i < sf->nleaves; i++) {
1014     /* Log 1-based rank */
1015     PetscCall(PetscHMapISetWithMode(table, sf->remote[i].rank + 1, 1, ADD_VALUES));
1016   }
1017   PetscCall(PetscHMapIGetSize(table, &sf->nranks));
1018   PetscCall(PetscMalloc4(sf->nranks, &sf->ranks, sf->nranks + 1, &sf->roffset, sf->nleaves, &sf->rmine, sf->nleaves, &sf->rremote));
1019   PetscCall(PetscMalloc2(sf->nranks, &rcount, sf->nranks, &ranks));
1020   PetscHashIterBegin(table, pos);
1021   for (i = 0; i < sf->nranks; i++) {
1022     PetscHashIterGetKey(table, pos, ranks[i]);
1023     PetscHashIterGetVal(table, pos, rcount[i]);
1024     PetscHashIterNext(table, pos);
1025     ranks[i]--; /* Convert back to 0-based */
1026   }
1027   PetscCall(PetscHMapIDestroy(&table));
1028 
1029   /* We expect that dgroup is reliably "small" while nranks could be large */
1030   {
1031     MPI_Group    group = MPI_GROUP_NULL;
1032     PetscMPIInt *dgroupranks;
1033     PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
1034     PetscCallMPI(MPI_Group_size(dgroup, &groupsize));
1035     PetscCall(PetscMalloc1(groupsize, &dgroupranks));
1036     PetscCall(PetscMalloc1(groupsize, &groupranks));
1037     for (i = 0; i < groupsize; i++) dgroupranks[i] = i;
1038     if (groupsize) PetscCallMPI(MPI_Group_translate_ranks(dgroup, groupsize, dgroupranks, group, groupranks));
1039     PetscCallMPI(MPI_Group_free(&group));
1040     PetscCall(PetscFree(dgroupranks));
1041   }
1042 
1043   /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
1044   for (sf->ndranks = 0, i = sf->nranks; sf->ndranks < i;) {
1045     for (i--; sf->ndranks < i; i--) { /* Scan i backward looking for distinguished rank */
1046       if (InList(ranks[i], groupsize, groupranks)) break;
1047     }
1048     for (; sf->ndranks <= i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
1049       if (!InList(ranks[sf->ndranks], groupsize, groupranks)) break;
1050     }
1051     if (sf->ndranks < i) { /* Swap ranks[sf->ndranks] with ranks[i] */
1052       PetscInt tmprank, tmpcount;
1053 
1054       tmprank             = ranks[i];
1055       tmpcount            = rcount[i];
1056       ranks[i]            = ranks[sf->ndranks];
1057       rcount[i]           = rcount[sf->ndranks];
1058       ranks[sf->ndranks]  = tmprank;
1059       rcount[sf->ndranks] = tmpcount;
1060       sf->ndranks++;
1061     }
1062   }
1063   PetscCall(PetscFree(groupranks));
1064   PetscCall(PetscSortIntWithArray(sf->ndranks, ranks, rcount));
1065   PetscCall(PetscSortIntWithArray(sf->nranks - sf->ndranks, ranks + sf->ndranks, rcount + sf->ndranks));
1066   sf->roffset[0] = 0;
1067   for (i = 0; i < sf->nranks; i++) {
1068     PetscCall(PetscMPIIntCast(ranks[i], sf->ranks + i));
1069     sf->roffset[i + 1] = sf->roffset[i] + rcount[i];
1070     rcount[i]          = 0;
1071   }
1072   for (i = 0, irank = -1, orank = -1; i < sf->nleaves; i++) {
1073     /* short circuit */
1074     if (orank != sf->remote[i].rank) {
1075       /* Search for index of iremote[i].rank in sf->ranks */
1076       PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->ndranks, sf->ranks, &irank));
1077       if (irank < 0) {
1078         PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->nranks - sf->ndranks, sf->ranks + sf->ndranks, &irank));
1079         if (irank >= 0) irank += sf->ndranks;
1080       }
1081       orank = sf->remote[i].rank;
1082     }
1083     PetscCheck(irank >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Could not find rank %" PetscInt_FMT " in array", sf->remote[i].rank);
1084     sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
1085     sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
1086     rcount[irank]++;
1087   }
1088   PetscCall(PetscFree2(rcount, ranks));
1089   PetscFunctionReturn(PETSC_SUCCESS);
1090 }
1091 
1092 /*@C
1093    PetscSFGetGroups - gets incoming and outgoing process groups
1094 
1095    Collective
1096 
1097    Input Parameter:
1098 .  sf - star forest
1099 
1100    Output Parameters:
1101 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1102 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1103 
1104    Level: developer
1105 
1106 .seealso: `PetscSF`, `PetscSFGetWindow()`, `PetscSFRestoreWindow()`
1107 @*/
1108 PetscErrorCode PetscSFGetGroups(PetscSF sf, MPI_Group *incoming, MPI_Group *outgoing)
1109 {
1110   MPI_Group group = MPI_GROUP_NULL;
1111 
1112   PetscFunctionBegin;
1113   PetscCheck(sf->nranks >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUpRanks() before obtaining groups");
1114   if (sf->ingroup == MPI_GROUP_NULL) {
1115     PetscInt        i;
1116     const PetscInt *indegree;
1117     PetscMPIInt     rank, *outranks, *inranks;
1118     PetscSFNode    *remote;
1119     PetscSF         bgcount;
1120 
1121     /* Compute the number of incoming ranks */
1122     PetscCall(PetscMalloc1(sf->nranks, &remote));
1123     for (i = 0; i < sf->nranks; i++) {
1124       remote[i].rank  = sf->ranks[i];
1125       remote[i].index = 0;
1126     }
1127     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, &bgcount));
1128     PetscCall(PetscSFSetGraph(bgcount, 1, sf->nranks, NULL, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
1129     PetscCall(PetscSFComputeDegreeBegin(bgcount, &indegree));
1130     PetscCall(PetscSFComputeDegreeEnd(bgcount, &indegree));
1131     /* Enumerate the incoming ranks */
1132     PetscCall(PetscMalloc2(indegree[0], &inranks, sf->nranks, &outranks));
1133     PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
1134     for (i = 0; i < sf->nranks; i++) outranks[i] = rank;
1135     PetscCall(PetscSFGatherBegin(bgcount, MPI_INT, outranks, inranks));
1136     PetscCall(PetscSFGatherEnd(bgcount, MPI_INT, outranks, inranks));
1137     PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
1138     PetscCallMPI(MPI_Group_incl(group, indegree[0], inranks, &sf->ingroup));
1139     PetscCallMPI(MPI_Group_free(&group));
1140     PetscCall(PetscFree2(inranks, outranks));
1141     PetscCall(PetscSFDestroy(&bgcount));
1142   }
1143   *incoming = sf->ingroup;
1144 
1145   if (sf->outgroup == MPI_GROUP_NULL) {
1146     PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
1147     PetscCallMPI(MPI_Group_incl(group, sf->nranks, sf->ranks, &sf->outgroup));
1148     PetscCallMPI(MPI_Group_free(&group));
1149   }
1150   *outgoing = sf->outgroup;
1151   PetscFunctionReturn(PETSC_SUCCESS);
1152 }
1153 
1154 /*@
1155    PetscSFGetMultiSF - gets the inner `PetscSF` implementing gathers and scatters
1156 
1157    Collective
1158 
1159    Input Parameter:
1160 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1161 
1162    Output Parameter:
1163 .  multi - star forest with split roots, such that each root has degree exactly 1
1164 
1165    Level: developer
1166 
1167    Note:
1168    In most cases, users should use `PetscSFGatherBegin()` and `PetscSFScatterBegin()` instead of manipulating multi
1169    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1170    edge, it is a candidate for future optimization that might involve its removal.
1171 
1172 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`, `PetscSFComputeMultiRootOriginalNumbering()`
1173 @*/
1174 PetscErrorCode PetscSFGetMultiSF(PetscSF sf, PetscSF *multi)
1175 {
1176   PetscFunctionBegin;
1177   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1178   PetscValidPointer(multi, 2);
1179   if (sf->nroots < 0) { /* Graph has not been set yet; why do we need this? */
1180     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
1181     *multi           = sf->multi;
1182     sf->multi->multi = sf->multi;
1183     PetscFunctionReturn(PETSC_SUCCESS);
1184   }
1185   if (!sf->multi) {
1186     const PetscInt *indegree;
1187     PetscInt        i, *inoffset, *outones, *outoffset, maxlocal;
1188     PetscSFNode    *remote;
1189     maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
1190     PetscCall(PetscSFComputeDegreeBegin(sf, &indegree));
1191     PetscCall(PetscSFComputeDegreeEnd(sf, &indegree));
1192     PetscCall(PetscMalloc3(sf->nroots + 1, &inoffset, maxlocal, &outones, maxlocal, &outoffset));
1193     inoffset[0] = 0;
1194     for (i = 0; i < sf->nroots; i++) inoffset[i + 1] = inoffset[i] + indegree[i];
1195     for (i = 0; i < maxlocal; i++) outones[i] = 1;
1196     PetscCall(PetscSFFetchAndOpBegin(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
1197     PetscCall(PetscSFFetchAndOpEnd(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
1198     for (i = 0; i < sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
1199     if (PetscDefined(USE_DEBUG)) {                               /* Check that the expected number of increments occurred */
1200       for (i = 0; i < sf->nroots; i++) PetscCheck(inoffset[i] + indegree[i] == inoffset[i + 1], PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incorrect result after PetscSFFetchAndOp");
1201     }
1202     PetscCall(PetscMalloc1(sf->nleaves, &remote));
1203     for (i = 0; i < sf->nleaves; i++) {
1204       remote[i].rank  = sf->remote[i].rank;
1205       remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
1206     }
1207     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
1208     sf->multi->multi = sf->multi;
1209     PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
1210     if (sf->rankorder) { /* Sort the ranks */
1211       PetscMPIInt  rank;
1212       PetscInt    *inranks, *newoffset, *outranks, *newoutoffset, *tmpoffset, maxdegree;
1213       PetscSFNode *newremote;
1214       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
1215       for (i = 0, maxdegree = 0; i < sf->nroots; i++) maxdegree = PetscMax(maxdegree, indegree[i]);
1216       PetscCall(PetscMalloc5(sf->multi->nroots, &inranks, sf->multi->nroots, &newoffset, maxlocal, &outranks, maxlocal, &newoutoffset, maxdegree, &tmpoffset));
1217       for (i = 0; i < maxlocal; i++) outranks[i] = rank;
1218       PetscCall(PetscSFReduceBegin(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
1219       PetscCall(PetscSFReduceEnd(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
1220       /* Sort the incoming ranks at each vertex, build the inverse map */
1221       for (i = 0; i < sf->nroots; i++) {
1222         PetscInt j;
1223         for (j = 0; j < indegree[i]; j++) tmpoffset[j] = j;
1224         PetscCall(PetscSortIntWithArray(indegree[i], inranks + inoffset[i], tmpoffset));
1225         for (j = 0; j < indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
1226       }
1227       PetscCall(PetscSFBcastBegin(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
1228       PetscCall(PetscSFBcastEnd(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
1229       PetscCall(PetscMalloc1(sf->nleaves, &newremote));
1230       for (i = 0; i < sf->nleaves; i++) {
1231         newremote[i].rank  = sf->remote[i].rank;
1232         newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
1233       }
1234       PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, newremote, PETSC_OWN_POINTER));
1235       PetscCall(PetscFree5(inranks, newoffset, outranks, newoutoffset, tmpoffset));
1236     }
1237     PetscCall(PetscFree3(inoffset, outones, outoffset));
1238   }
1239   *multi = sf->multi;
1240   PetscFunctionReturn(PETSC_SUCCESS);
1241 }
1242 
1243 /*@C
1244    PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots of a `PetscSF`, does not remap indices
1245 
1246    Collective
1247 
1248    Input Parameters:
1249 +  sf - original star forest
1250 .  nselected  - number of selected roots on this process
1251 -  selected   - indices of the selected roots on this process
1252 
1253    Output Parameter:
1254 .  esf - new star forest
1255 
1256    Level: advanced
1257 
1258    Note:
1259    To use the new `PetscSF`, it may be necessary to know the indices of the leaves that are still participating. This can
1260    be done by calling PetscSFGetGraph().
1261 
1262 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1263 @*/
1264 PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *esf)
1265 {
1266   PetscInt           i, j, n, nroots, nleaves, esf_nleaves, *new_ilocal, minleaf, maxleaf, maxlocal;
1267   const PetscInt    *ilocal;
1268   signed char       *rootdata, *leafdata, *leafmem;
1269   const PetscSFNode *iremote;
1270   PetscSFNode       *new_iremote;
1271   MPI_Comm           comm;
1272 
1273   PetscFunctionBegin;
1274   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1275   PetscSFCheckGraphSet(sf, 1);
1276   if (nselected) PetscValidIntPointer(selected, 3);
1277   PetscValidPointer(esf, 4);
1278 
1279   PetscCall(PetscSFSetUp(sf));
1280   PetscCall(PetscLogEventBegin(PETSCSF_EmbedSF, sf, 0, 0, 0));
1281   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1282   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
1283 
1284   if (PetscDefined(USE_DEBUG)) { /* Error out if selected[] has dups or  out of range indices */
1285     PetscBool dups;
1286     PetscCall(PetscCheckDupsInt(nselected, selected, &dups));
1287     PetscCheck(!dups, comm, PETSC_ERR_ARG_WRONG, "selected[] has dups");
1288     for (i = 0; i < nselected; i++) PetscCheck(selected[i] >= 0 && selected[i] < nroots, comm, PETSC_ERR_ARG_OUTOFRANGE, "selected root indice %" PetscInt_FMT " is out of [0,%" PetscInt_FMT ")", selected[i], nroots);
1289   }
1290 
1291   if (sf->ops->CreateEmbeddedRootSF) PetscUseTypeMethod(sf, CreateEmbeddedRootSF, nselected, selected, esf);
1292   else {
1293     /* A generic version of creating embedded sf */
1294     PetscCall(PetscSFGetLeafRange(sf, &minleaf, &maxleaf));
1295     maxlocal = maxleaf - minleaf + 1;
1296     PetscCall(PetscCalloc2(nroots, &rootdata, maxlocal, &leafmem));
1297     leafdata = leafmem - minleaf;
1298     /* Tag selected roots and bcast to leaves */
1299     for (i = 0; i < nselected; i++) rootdata[selected[i]] = 1;
1300     PetscCall(PetscSFBcastBegin(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
1301     PetscCall(PetscSFBcastEnd(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
1302 
1303     /* Build esf with leaves that are still connected */
1304     esf_nleaves = 0;
1305     for (i = 0; i < nleaves; i++) {
1306       j = ilocal ? ilocal[i] : i;
1307       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1308          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1309       */
1310       esf_nleaves += (leafdata[j] ? 1 : 0);
1311     }
1312     PetscCall(PetscMalloc1(esf_nleaves, &new_ilocal));
1313     PetscCall(PetscMalloc1(esf_nleaves, &new_iremote));
1314     for (i = n = 0; i < nleaves; i++) {
1315       j = ilocal ? ilocal[i] : i;
1316       if (leafdata[j]) {
1317         new_ilocal[n]        = j;
1318         new_iremote[n].rank  = iremote[i].rank;
1319         new_iremote[n].index = iremote[i].index;
1320         ++n;
1321       }
1322     }
1323     PetscCall(PetscSFCreate(comm, esf));
1324     PetscCall(PetscSFSetFromOptions(*esf));
1325     PetscCall(PetscSFSetGraph(*esf, nroots, esf_nleaves, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1326     PetscCall(PetscFree2(rootdata, leafmem));
1327   }
1328   PetscCall(PetscLogEventEnd(PETSCSF_EmbedSF, sf, 0, 0, 0));
1329   PetscFunctionReturn(PETSC_SUCCESS);
1330 }
1331 
1332 /*@C
1333   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves of a `PetscSF`, does not remap indices
1334 
1335   Collective
1336 
1337   Input Parameters:
1338 + sf - original star forest
1339 . nselected  - number of selected leaves on this process
1340 - selected   - indices of the selected leaves on this process
1341 
1342   Output Parameter:
1343 .  newsf - new star forest
1344 
1345   Level: advanced
1346 
1347 .seealso: `PetscSF`, `PetscSFCreateEmbeddedRootSF()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1348 @*/
1349 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *newsf)
1350 {
1351   const PetscSFNode *iremote;
1352   PetscSFNode       *new_iremote;
1353   const PetscInt    *ilocal;
1354   PetscInt           i, nroots, *leaves, *new_ilocal;
1355   MPI_Comm           comm;
1356 
1357   PetscFunctionBegin;
1358   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1359   PetscSFCheckGraphSet(sf, 1);
1360   if (nselected) PetscValidIntPointer(selected, 3);
1361   PetscValidPointer(newsf, 4);
1362 
1363   /* Uniq selected[] and put results in leaves[] */
1364   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1365   PetscCall(PetscMalloc1(nselected, &leaves));
1366   PetscCall(PetscArraycpy(leaves, selected, nselected));
1367   PetscCall(PetscSortedRemoveDupsInt(&nselected, leaves));
1368   PetscCheck(!nselected || !(leaves[0] < 0 || leaves[nselected - 1] >= sf->nleaves), comm, PETSC_ERR_ARG_OUTOFRANGE, "Min/Max leaf indices %" PetscInt_FMT "/%" PetscInt_FMT " are not in [0,%" PetscInt_FMT ")", leaves[0], leaves[nselected - 1], sf->nleaves);
1369 
1370   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1371   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) PetscUseTypeMethod(sf, CreateEmbeddedLeafSF, nselected, leaves, newsf);
1372   else {
1373     PetscCall(PetscSFGetGraph(sf, &nroots, NULL, &ilocal, &iremote));
1374     PetscCall(PetscMalloc1(nselected, &new_ilocal));
1375     PetscCall(PetscMalloc1(nselected, &new_iremote));
1376     for (i = 0; i < nselected; ++i) {
1377       const PetscInt l     = leaves[i];
1378       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1379       new_iremote[i].rank  = iremote[l].rank;
1380       new_iremote[i].index = iremote[l].index;
1381     }
1382     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, newsf));
1383     PetscCall(PetscSFSetGraph(*newsf, nroots, nselected, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1384   }
1385   PetscCall(PetscFree(leaves));
1386   PetscFunctionReturn(PETSC_SUCCESS);
1387 }
1388 
1389 /*@C
1390    PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to `PetscSFBcastEnd()`
1391 
1392    Collective
1393 
1394    Input Parameters:
1395 +  sf - star forest on which to communicate
1396 .  unit - data type associated with each node
1397 .  rootdata - buffer to broadcast
1398 -  op - operation to use for reduction
1399 
1400    Output Parameter:
1401 .  leafdata - buffer to be reduced with values from each leaf's respective root
1402 
1403    Level: intermediate
1404 
1405    Note:
1406     When PETSc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1407     are host pointers or device pointers, which may incur a noticeable cost. If you already knew the info, you should
1408     use `PetscSFBcastWithMemTypeBegin()` instead.
1409 
1410 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastWithMemTypeBegin()`
1411 @*/
1412 PetscErrorCode PetscSFBcastBegin(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
1413 {
1414   PetscMemType rootmtype, leafmtype;
1415 
1416   PetscFunctionBegin;
1417   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1418   PetscCall(PetscSFSetUp(sf));
1419   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
1420   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1421   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1422   PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
1423   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
1424   PetscFunctionReturn(PETSC_SUCCESS);
1425 }
1426 
1427 /*@C
1428    PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call
1429    to `PetscSFBcastEnd()`
1430 
1431    Collective
1432 
1433    Input Parameters:
1434 +  sf - star forest on which to communicate
1435 .  unit - data type associated with each node
1436 .  rootmtype - memory type of rootdata
1437 .  rootdata - buffer to broadcast
1438 .  leafmtype - memory type of leafdata
1439 -  op - operation to use for reduction
1440 
1441    Output Parameter:
1442 .  leafdata - buffer to be reduced with values from each leaf's respective root
1443 
1444    Level: intermediate
1445 
1446 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastBegin()`
1447 @*/
1448 PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
1449 {
1450   PetscFunctionBegin;
1451   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1452   PetscCall(PetscSFSetUp(sf));
1453   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
1454   PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
1455   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
1456   PetscFunctionReturn(PETSC_SUCCESS);
1457 }
1458 
1459 /*@C
1460    PetscSFBcastEnd - end a broadcast and reduce operation started with `PetscSFBcastBegin()` or `PetscSFBcastWithMemTypeBegin()`
1461 
1462    Collective
1463 
1464    Input Parameters:
1465 +  sf - star forest
1466 .  unit - data type
1467 .  rootdata - buffer to broadcast
1468 -  op - operation to use for reduction
1469 
1470    Output Parameter:
1471 .  leafdata - buffer to be reduced with values from each leaf's respective root
1472 
1473    Level: intermediate
1474 
1475 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFReduceEnd()`
1476 @*/
1477 PetscErrorCode PetscSFBcastEnd(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
1478 {
1479   PetscFunctionBegin;
1480   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1481   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastEnd, sf, 0, 0, 0));
1482   PetscUseTypeMethod(sf, BcastEnd, unit, rootdata, leafdata, op);
1483   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastEnd, sf, 0, 0, 0));
1484   PetscFunctionReturn(PETSC_SUCCESS);
1485 }
1486 
1487 /*@C
1488    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to `PetscSFReduceEnd()`
1489 
1490    Collective
1491 
1492    Input Parameters:
1493 +  sf - star forest
1494 .  unit - data type
1495 .  leafdata - values to reduce
1496 -  op - reduction operation
1497 
1498    Output Parameter:
1499 .  rootdata - result of reduction of values from all leaves of each root
1500 
1501    Level: intermediate
1502 
1503    Note:
1504     When PETSc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1505     are host pointers or device pointers, which may incur a noticeable cost. If you already knew the info, you should
1506     use `PetscSFReduceWithMemTypeBegin()` instead.
1507 
1508 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceWithMemTypeBegin()`, `PetscSFReduceEnd()`
1509 @*/
1510 PetscErrorCode PetscSFReduceBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1511 {
1512   PetscMemType rootmtype, leafmtype;
1513 
1514   PetscFunctionBegin;
1515   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1516   PetscCall(PetscSFSetUp(sf));
1517   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1518   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1519   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1520   PetscCall((sf->ops->ReduceBegin)(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1521   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1522   PetscFunctionReturn(PETSC_SUCCESS);
1523 }
1524 
1525 /*@C
1526    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to `PetscSFReduceEnd()`
1527 
1528    Collective
1529 
1530    Input Parameters:
1531 +  sf - star forest
1532 .  unit - data type
1533 .  leafmtype - memory type of leafdata
1534 .  leafdata - values to reduce
1535 .  rootmtype - memory type of rootdata
1536 -  op - reduction operation
1537 
1538    Output Parameter:
1539 .  rootdata - result of reduction of values from all leaves of each root
1540 
1541    Level: intermediate
1542 
1543 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceBegin()`, `PetscSFReduceEnd()`
1544 @*/
1545 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
1546 {
1547   PetscFunctionBegin;
1548   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1549   PetscCall(PetscSFSetUp(sf));
1550   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1551   PetscCall((sf->ops->ReduceBegin)(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1552   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1553   PetscFunctionReturn(PETSC_SUCCESS);
1554 }
1555 
1556 /*@C
1557    PetscSFReduceEnd - end a reduction operation started with `PetscSFReduceBegin()` or `PetscSFReduceWithMemTypeBegin()`
1558 
1559    Collective
1560 
1561    Input Parameters:
1562 +  sf - star forest
1563 .  unit - data type
1564 .  leafdata - values to reduce
1565 -  op - reduction operation
1566 
1567    Output Parameter:
1568 .  rootdata - result of reduction of values from all leaves of each root
1569 
1570    Level: intermediate
1571 
1572 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFBcastEnd()`, `PetscSFReduceBegin()`, `PetscSFReduceWithMemTypeBegin()`
1573 @*/
1574 PetscErrorCode PetscSFReduceEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1575 {
1576   PetscFunctionBegin;
1577   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1578   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceEnd, sf, 0, 0, 0));
1579   PetscUseTypeMethod(sf, ReduceEnd, unit, leafdata, rootdata, op);
1580   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceEnd, sf, 0, 0, 0));
1581   PetscFunctionReturn(PETSC_SUCCESS);
1582 }
1583 
1584 /*@C
1585    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value,
1586    to be completed with `PetscSFFetchAndOpEnd()`
1587 
1588    Collective
1589 
1590    Input Parameters:
1591 +  sf - star forest
1592 .  unit - data type
1593 .  leafdata - leaf values to use in reduction
1594 -  op - operation to use for reduction
1595 
1596    Output Parameters:
1597 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1598 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1599 
1600    Level: advanced
1601 
1602    Note:
1603    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1604    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1605    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1606    integers.
1607 
1608 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`
1609 @*/
1610 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1611 {
1612   PetscMemType rootmtype, leafmtype, leafupdatemtype;
1613 
1614   PetscFunctionBegin;
1615   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1616   PetscCall(PetscSFSetUp(sf));
1617   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1618   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1619   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1620   PetscCall(PetscGetMemType(leafupdate, &leafupdatemtype));
1621   PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
1622   PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
1623   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1624   PetscFunctionReturn(PETSC_SUCCESS);
1625 }
1626 
1627 /*@C
1628    PetscSFFetchAndOpWithMemTypeBegin - begin operation with explicit memory types that fetches values from root and updates atomically by
1629    applying operation using my leaf value, to be completed with `PetscSFFetchAndOpEnd()`
1630 
1631    Collective
1632 
1633    Input Parameters:
1634 +  sf - star forest
1635 .  unit - data type
1636 .  rootmtype - memory type of rootdata
1637 .  leafmtype - memory type of leafdata
1638 .  leafdata - leaf values to use in reduction
1639 .  leafupdatemtype - memory type of leafupdate
1640 -  op - operation to use for reduction
1641 
1642    Output Parameters:
1643 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1644 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1645 
1646    Level: advanced
1647 
1648    Note:
1649    See `PetscSFFetchAndOpBegin()` for more details.
1650 
1651 .seealso: `PetscSF`, `PetscSFFetchAndOpBegin()`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`, `PetscSFFetchAndOpEnd()`
1652 @*/
1653 PetscErrorCode PetscSFFetchAndOpWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, PetscMemType leafupdatemtype, void *leafupdate, MPI_Op op)
1654 {
1655   PetscFunctionBegin;
1656   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1657   PetscCall(PetscSFSetUp(sf));
1658   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1659   PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
1660   PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
1661   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1662   PetscFunctionReturn(PETSC_SUCCESS);
1663 }
1664 
1665 /*@C
1666    PetscSFFetchAndOpEnd - end operation started in matching call to `PetscSFFetchAndOpBegin()` or `PetscSFFetchAndOpWithMemTypeBegin()`
1667    to fetch values from roots and update atomically by applying operation using my leaf value
1668 
1669    Collective
1670 
1671    Input Parameters:
1672 +  sf - star forest
1673 .  unit - data type
1674 .  leafdata - leaf values to use in reduction
1675 -  op - operation to use for reduction
1676 
1677    Output Parameters:
1678 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1679 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1680 
1681    Level: advanced
1682 
1683 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFReduceEnd()`, `PetscSFSetGraph()`, `PetscSFFetchAndOpBegin()`, `PetscSFFetchAndOpWithMemTypeBegin()`
1684 @*/
1685 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1686 {
1687   PetscFunctionBegin;
1688   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1689   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
1690   PetscUseTypeMethod(sf, FetchAndOpEnd, unit, rootdata, leafdata, leafupdate, op);
1691   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
1692   PetscFunctionReturn(PETSC_SUCCESS);
1693 }
1694 
1695 /*@C
1696    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with `PetscSFComputeDegreeEnd()`
1697 
1698    Collective
1699 
1700    Input Parameter:
1701 .  sf - star forest
1702 
1703    Output Parameter:
1704 .  degree - degree of each root vertex
1705 
1706    Level: advanced
1707 
1708    Note:
1709    The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence there is no need to call `PetscFree()` on it.
1710 
1711 .seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeEnd()`
1712 @*/
1713 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf, const PetscInt **degree)
1714 {
1715   PetscFunctionBegin;
1716   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1717   PetscSFCheckGraphSet(sf, 1);
1718   PetscValidPointer(degree, 2);
1719   if (!sf->degreeknown) {
1720     PetscInt i, nroots = sf->nroots, maxlocal;
1721     PetscCheck(!sf->degree, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1722     maxlocal = sf->maxleaf - sf->minleaf + 1;
1723     PetscCall(PetscMalloc1(nroots, &sf->degree));
1724     PetscCall(PetscMalloc1(PetscMax(maxlocal, 1), &sf->degreetmp)); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1725     for (i = 0; i < nroots; i++) sf->degree[i] = 0;
1726     for (i = 0; i < maxlocal; i++) sf->degreetmp[i] = 1;
1727     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
1728   }
1729   *degree = NULL;
1730   PetscFunctionReturn(PETSC_SUCCESS);
1731 }
1732 
1733 /*@C
1734    PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with `PetscSFComputeDegreeBegin()`
1735 
1736    Collective
1737 
1738    Input Parameter:
1739 .  sf - star forest
1740 
1741    Output Parameter:
1742 .  degree - degree of each root vertex
1743 
1744    Level: developer
1745 
1746    Note:
1747    The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence there is no need to call `PetscFree()` on it.
1748 
1749 .seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeBegin()`
1750 @*/
1751 PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf, const PetscInt **degree)
1752 {
1753   PetscFunctionBegin;
1754   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1755   PetscSFCheckGraphSet(sf, 1);
1756   PetscValidPointer(degree, 2);
1757   if (!sf->degreeknown) {
1758     PetscCheck(sf->degreetmp, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
1759     PetscCall(PetscSFReduceEnd(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
1760     PetscCall(PetscFree(sf->degreetmp));
1761     sf->degreeknown = PETSC_TRUE;
1762   }
1763   *degree = sf->degree;
1764   PetscFunctionReturn(PETSC_SUCCESS);
1765 }
1766 
1767 /*@C
1768    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-`PetscSF` returned by `PetscSFGetMultiSF()`).
1769    Each multi-root is assigned index of the corresponding original root.
1770 
1771    Collective
1772 
1773    Input Parameters:
1774 +  sf - star forest
1775 -  degree - degree of each root vertex, computed with `PetscSFComputeDegreeBegin()`/`PetscSFComputeDegreeEnd()`
1776 
1777    Output Parameters:
1778 +  nMultiRoots - (optional) number of multi-roots (roots of multi-`PetscSF`)
1779 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is `nMultiRoots`
1780 
1781    Level: developer
1782 
1783    Note:
1784    The returned array `multiRootsOrigNumbering` is newly allocated and should be destroyed with `PetscFree()` when no longer needed.
1785 
1786 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFComputeDegreeEnd()`, `PetscSFGetMultiSF()`
1787 @*/
1788 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1789 {
1790   PetscSF  msf;
1791   PetscInt i, j, k, nroots, nmroots;
1792 
1793   PetscFunctionBegin;
1794   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1795   PetscCall(PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL));
1796   if (nroots) PetscValidIntPointer(degree, 2);
1797   if (nMultiRoots) PetscValidIntPointer(nMultiRoots, 3);
1798   PetscValidPointer(multiRootsOrigNumbering, 4);
1799   PetscCall(PetscSFGetMultiSF(sf, &msf));
1800   PetscCall(PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL));
1801   PetscCall(PetscMalloc1(nmroots, multiRootsOrigNumbering));
1802   for (i = 0, j = 0, k = 0; i < nroots; i++) {
1803     if (!degree[i]) continue;
1804     for (j = 0; j < degree[i]; j++, k++) (*multiRootsOrigNumbering)[k] = i;
1805   }
1806   PetscCheck(k == nmroots, PETSC_COMM_SELF, PETSC_ERR_PLIB, "sanity check fail");
1807   if (nMultiRoots) *nMultiRoots = nmroots;
1808   PetscFunctionReturn(PETSC_SUCCESS);
1809 }
1810 
1811 /*@C
1812    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with `PetscSFGatherEnd()`
1813 
1814    Collective
1815 
1816    Input Parameters:
1817 +  sf - star forest
1818 .  unit - data type
1819 -  leafdata - leaf data to gather to roots
1820 
1821    Output Parameter:
1822 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1823 
1824    Level: intermediate
1825 
1826 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterBegin()`
1827 @*/
1828 PetscErrorCode PetscSFGatherBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1829 {
1830   PetscSF multi = NULL;
1831 
1832   PetscFunctionBegin;
1833   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1834   PetscCall(PetscSFSetUp(sf));
1835   PetscCall(PetscSFGetMultiSF(sf, &multi));
1836   PetscCall(PetscSFReduceBegin(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1837   PetscFunctionReturn(PETSC_SUCCESS);
1838 }
1839 
1840 /*@C
1841    PetscSFGatherEnd - ends pointwise gather operation that was started with `PetscSFGatherBegin()`
1842 
1843    Collective
1844 
1845    Input Parameters:
1846 +  sf - star forest
1847 .  unit - data type
1848 -  leafdata - leaf data to gather to roots
1849 
1850    Output Parameter:
1851 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1852 
1853    Level: intermediate
1854 
1855 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterEnd()`
1856 @*/
1857 PetscErrorCode PetscSFGatherEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1858 {
1859   PetscSF multi = NULL;
1860 
1861   PetscFunctionBegin;
1862   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1863   PetscCall(PetscSFGetMultiSF(sf, &multi));
1864   PetscCall(PetscSFReduceEnd(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1865   PetscFunctionReturn(PETSC_SUCCESS);
1866 }
1867 
1868 /*@C
1869    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with `PetscSFScatterEnd()`
1870 
1871    Collective
1872 
1873    Input Parameters:
1874 +  sf - star forest
1875 .  unit - data type
1876 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1877 
1878    Output Parameter:
1879 .  leafdata - leaf data to be update with personal data from each respective root
1880 
1881    Level: intermediate
1882 
1883 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterEnd()`
1884 @*/
1885 PetscErrorCode PetscSFScatterBegin(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1886 {
1887   PetscSF multi = NULL;
1888 
1889   PetscFunctionBegin;
1890   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1891   PetscCall(PetscSFSetUp(sf));
1892   PetscCall(PetscSFGetMultiSF(sf, &multi));
1893   PetscCall(PetscSFBcastBegin(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1894   PetscFunctionReturn(PETSC_SUCCESS);
1895 }
1896 
1897 /*@C
1898    PetscSFScatterEnd - ends pointwise scatter operation that was started with `PetscSFScatterBegin()`
1899 
1900    Collective
1901 
1902    Input Parameters:
1903 +  sf - star forest
1904 .  unit - data type
1905 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1906 
1907    Output Parameter:
1908 .  leafdata - leaf data to be update with personal data from each respective root
1909 
1910    Level: intermediate
1911 
1912 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterBegin()`
1913 @*/
1914 PetscErrorCode PetscSFScatterEnd(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1915 {
1916   PetscSF multi = NULL;
1917 
1918   PetscFunctionBegin;
1919   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1920   PetscCall(PetscSFGetMultiSF(sf, &multi));
1921   PetscCall(PetscSFBcastEnd(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1922   PetscFunctionReturn(PETSC_SUCCESS);
1923 }
1924 
1925 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1926 {
1927   PetscInt        i, n, nleaves;
1928   const PetscInt *ilocal = NULL;
1929   PetscHSetI      seen;
1930 
1931   PetscFunctionBegin;
1932   if (PetscDefined(USE_DEBUG)) {
1933     PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, NULL));
1934     PetscCall(PetscHSetICreate(&seen));
1935     for (i = 0; i < nleaves; i++) {
1936       const PetscInt leaf = ilocal ? ilocal[i] : i;
1937       PetscCall(PetscHSetIAdd(seen, leaf));
1938     }
1939     PetscCall(PetscHSetIGetSize(seen, &n));
1940     PetscCheck(n == nleaves, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided leaves have repeated values: all leaves must be unique");
1941     PetscCall(PetscHSetIDestroy(&seen));
1942   }
1943   PetscFunctionReturn(PETSC_SUCCESS);
1944 }
1945 
1946 /*@
1947   PetscSFCompose - Compose a new `PetscSF` by putting the second `PetscSF` under the first one in a top (roots) down (leaves) view
1948 
1949   Input Parameters:
1950 + sfA - The first `PetscSF`
1951 - sfB - The second `PetscSF`
1952 
1953   Output Parameters:
1954 . sfBA - The composite `PetscSF`
1955 
1956   Level: developer
1957 
1958   Notes:
1959   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
1960   forests, i.e. the same leaf is not connected with different roots.
1961 
1962   `sfA`'s leaf space and `sfB`'s root space might be partially overlapped. The composition builds
1963   a graph with `sfA`'s roots and `sfB`'s leaves only when there is a path between them. Unconnected
1964   nodes (roots or leaves) are not in `sfBA`. Doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on the new `PetscSF` is equivalent to doing a
1965   `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on `sfA`, then a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on `sfB`, on connected nodes.
1966 
1967 .seealso: `PetscSF`, `PetscSFComposeInverse()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`
1968 @*/
1969 PetscErrorCode PetscSFCompose(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
1970 {
1971   const PetscSFNode *remotePointsA, *remotePointsB;
1972   PetscSFNode       *remotePointsBA = NULL, *reorderedRemotePointsA = NULL, *leafdataB;
1973   const PetscInt    *localPointsA, *localPointsB;
1974   PetscInt          *localPointsBA;
1975   PetscInt           i, numRootsA, numLeavesA, numRootsB, numLeavesB, minleaf, maxleaf, numLeavesBA;
1976   PetscBool          denseB;
1977 
1978   PetscFunctionBegin;
1979   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
1980   PetscSFCheckGraphSet(sfA, 1);
1981   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
1982   PetscSFCheckGraphSet(sfB, 2);
1983   PetscCheckSameComm(sfA, 1, sfB, 2);
1984   PetscValidPointer(sfBA, 3);
1985   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
1986   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
1987 
1988   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
1989   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
1990   /* Make sure that PetscSFBcast{Begin, End}(sfB, ...) works with root data of size
1991      numRootsB; otherwise, garbage will be broadcasted.
1992      Example (comm size = 1):
1993      sfA: 0 <- (0, 0)
1994      sfB: 100 <- (0, 0)
1995           101 <- (0, 1)
1996      Here, we have remotePointsA = [(0, 0)], but for remotePointsA to be a valid tartget
1997      of sfB, it has to be recasted as [(0, 0), (-1, -1)] so that points 100 and 101 would
1998      receive (0, 0) and (-1, -1), respectively, when PetscSFBcast(sfB, ...) is called on
1999      remotePointsA; if not recasted, point 101 would receive a garbage value.             */
2000   PetscCall(PetscMalloc1(numRootsB, &reorderedRemotePointsA));
2001   for (i = 0; i < numRootsB; i++) {
2002     reorderedRemotePointsA[i].rank  = -1;
2003     reorderedRemotePointsA[i].index = -1;
2004   }
2005   for (i = 0; i < numLeavesA; i++) {
2006     PetscInt localp = localPointsA ? localPointsA[i] : i;
2007 
2008     if (localp >= numRootsB) continue;
2009     reorderedRemotePointsA[localp] = remotePointsA[i];
2010   }
2011   remotePointsA = reorderedRemotePointsA;
2012   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2013   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &leafdataB));
2014   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2015     leafdataB[i].rank  = -1;
2016     leafdataB[i].index = -1;
2017   }
2018   PetscCall(PetscSFBcastBegin(sfB, MPIU_2INT, remotePointsA, leafdataB - minleaf, MPI_REPLACE));
2019   PetscCall(PetscSFBcastEnd(sfB, MPIU_2INT, remotePointsA, leafdataB - minleaf, MPI_REPLACE));
2020   PetscCall(PetscFree(reorderedRemotePointsA));
2021 
2022   denseB = (PetscBool)!localPointsB;
2023   for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2024     if (leafdataB[localPointsB ? localPointsB[i] - minleaf : i].rank == -1) denseB = PETSC_FALSE;
2025     else numLeavesBA++;
2026   }
2027   if (denseB) {
2028     localPointsBA  = NULL;
2029     remotePointsBA = leafdataB;
2030   } else {
2031     PetscCall(PetscMalloc1(numLeavesBA, &localPointsBA));
2032     PetscCall(PetscMalloc1(numLeavesBA, &remotePointsBA));
2033     for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2034       const PetscInt l = localPointsB ? localPointsB[i] : i;
2035 
2036       if (leafdataB[l - minleaf].rank == -1) continue;
2037       remotePointsBA[numLeavesBA] = leafdataB[l - minleaf];
2038       localPointsBA[numLeavesBA]  = l;
2039       numLeavesBA++;
2040     }
2041     PetscCall(PetscFree(leafdataB));
2042   }
2043   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2044   PetscCall(PetscSFSetFromOptions(*sfBA));
2045   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2046   PetscFunctionReturn(PETSC_SUCCESS);
2047 }
2048 
2049 /*@
2050   PetscSFComposeInverse - Compose a new `PetscSF` by putting the inverse of the second `PetscSF` under the first one
2051 
2052   Input Parameters:
2053 + sfA - The first `PetscSF`
2054 - sfB - The second `PetscSF`
2055 
2056   Output Parameters:
2057 . sfBA - The composite `PetscSF`.
2058 
2059   Level: developer
2060 
2061   Notes:
2062   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
2063   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
2064   second `PetscSF` must have a degree of 1, i.e., no roots have more than one leaf connected.
2065 
2066   `sfA`'s leaf space and `sfB`'s leaf space might be partially overlapped. The composition builds
2067   a graph with `sfA`'s roots and `sfB`'s roots only when there is a path between them. Unconnected
2068   roots are not in `sfBA`. Doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on the new `PetscSF` is equivalent to doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()`
2069   on `sfA`, then
2070   a `PetscSFReduceBegin()`/`PetscSFReduceEnd()` on `sfB`, on connected roots.
2071 
2072 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFCreateInverseSF()`
2073 @*/
2074 PetscErrorCode PetscSFComposeInverse(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
2075 {
2076   const PetscSFNode *remotePointsA, *remotePointsB;
2077   PetscSFNode       *remotePointsBA;
2078   const PetscInt    *localPointsA, *localPointsB;
2079   PetscSFNode       *reorderedRemotePointsA = NULL;
2080   PetscInt           i, numRootsA, numLeavesA, numLeavesBA, numRootsB, numLeavesB, minleaf, maxleaf, *localPointsBA;
2081   MPI_Op             op;
2082 #if defined(PETSC_USE_64BIT_INDICES)
2083   PetscBool iswin;
2084 #endif
2085 
2086   PetscFunctionBegin;
2087   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
2088   PetscSFCheckGraphSet(sfA, 1);
2089   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
2090   PetscSFCheckGraphSet(sfB, 2);
2091   PetscCheckSameComm(sfA, 1, sfB, 2);
2092   PetscValidPointer(sfBA, 3);
2093   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
2094   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
2095 
2096   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
2097   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
2098 
2099   /* TODO: Check roots of sfB have degree of 1 */
2100   /* Once we implement it, we can replace the MPI_MAXLOC
2101      with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
2102      We use MPI_MAXLOC only to have a deterministic output from this routine if
2103      the root condition is not meet.
2104    */
2105   op = MPI_MAXLOC;
2106 #if defined(PETSC_USE_64BIT_INDICES)
2107   /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
2108   PetscCall(PetscObjectTypeCompare((PetscObject)sfB, PETSCSFWINDOW, &iswin));
2109   if (iswin) op = MPI_REPLACE;
2110 #endif
2111 
2112   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2113   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &reorderedRemotePointsA));
2114   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2115     reorderedRemotePointsA[i].rank  = -1;
2116     reorderedRemotePointsA[i].index = -1;
2117   }
2118   if (localPointsA) {
2119     for (i = 0; i < numLeavesA; i++) {
2120       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
2121       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
2122     }
2123   } else {
2124     for (i = 0; i < numLeavesA; i++) {
2125       if (i > maxleaf || i < minleaf) continue;
2126       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
2127     }
2128   }
2129 
2130   PetscCall(PetscMalloc1(numRootsB, &localPointsBA));
2131   PetscCall(PetscMalloc1(numRootsB, &remotePointsBA));
2132   for (i = 0; i < numRootsB; i++) {
2133     remotePointsBA[i].rank  = -1;
2134     remotePointsBA[i].index = -1;
2135   }
2136 
2137   PetscCall(PetscSFReduceBegin(sfB, MPIU_2INT, reorderedRemotePointsA - minleaf, remotePointsBA, op));
2138   PetscCall(PetscSFReduceEnd(sfB, MPIU_2INT, reorderedRemotePointsA - minleaf, remotePointsBA, op));
2139   PetscCall(PetscFree(reorderedRemotePointsA));
2140   for (i = 0, numLeavesBA = 0; i < numRootsB; i++) {
2141     if (remotePointsBA[i].rank == -1) continue;
2142     remotePointsBA[numLeavesBA].rank  = remotePointsBA[i].rank;
2143     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
2144     localPointsBA[numLeavesBA]        = i;
2145     numLeavesBA++;
2146   }
2147   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2148   PetscCall(PetscSFSetFromOptions(*sfBA));
2149   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2150   PetscFunctionReturn(PETSC_SUCCESS);
2151 }
2152 
2153 /*
2154   PetscSFCreateLocalSF_Private - Creates a local `PetscSF` that only has intra-process edges of the global `PetscSF`
2155 
2156   Input Parameters:
2157 . sf - The global `PetscSF`
2158 
2159   Output Parameters:
2160 . out - The local `PetscSF`
2161 
2162 .seealso: `PetscSF`, `PetscSFCreate()`
2163  */
2164 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf, PetscSF *out)
2165 {
2166   MPI_Comm           comm;
2167   PetscMPIInt        myrank;
2168   const PetscInt    *ilocal;
2169   const PetscSFNode *iremote;
2170   PetscInt           i, j, nroots, nleaves, lnleaves, *lilocal;
2171   PetscSFNode       *liremote;
2172   PetscSF            lsf;
2173 
2174   PetscFunctionBegin;
2175   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2176   if (sf->ops->CreateLocalSF) PetscUseTypeMethod(sf, CreateLocalSF, out);
2177   else {
2178     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2179     PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
2180     PetscCallMPI(MPI_Comm_rank(comm, &myrank));
2181 
2182     /* Find out local edges and build a local SF */
2183     PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
2184     for (i = lnleaves = 0; i < nleaves; i++) {
2185       if (iremote[i].rank == (PetscInt)myrank) lnleaves++;
2186     }
2187     PetscCall(PetscMalloc1(lnleaves, &lilocal));
2188     PetscCall(PetscMalloc1(lnleaves, &liremote));
2189 
2190     for (i = j = 0; i < nleaves; i++) {
2191       if (iremote[i].rank == (PetscInt)myrank) {
2192         lilocal[j]        = ilocal ? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2193         liremote[j].rank  = 0;                      /* rank in PETSC_COMM_SELF */
2194         liremote[j].index = iremote[i].index;
2195         j++;
2196       }
2197     }
2198     PetscCall(PetscSFCreate(PETSC_COMM_SELF, &lsf));
2199     PetscCall(PetscSFSetFromOptions(lsf));
2200     PetscCall(PetscSFSetGraph(lsf, nroots, lnleaves, lilocal, PETSC_OWN_POINTER, liremote, PETSC_OWN_POINTER));
2201     PetscCall(PetscSFSetUp(lsf));
2202     *out = lsf;
2203   }
2204   PetscFunctionReturn(PETSC_SUCCESS);
2205 }
2206 
2207 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2208 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata)
2209 {
2210   PetscMemType rootmtype, leafmtype;
2211 
2212   PetscFunctionBegin;
2213   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2214   PetscCall(PetscSFSetUp(sf));
2215   PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
2216   PetscCall(PetscGetMemType(rootdata, &rootmtype));
2217   PetscCall(PetscGetMemType(leafdata, &leafmtype));
2218   PetscUseTypeMethod(sf, BcastToZero, unit, rootmtype, rootdata, leafmtype, leafdata);
2219   PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
2220   PetscFunctionReturn(PETSC_SUCCESS);
2221 }
2222 
2223 /*@
2224   PetscSFConcatenate - concatenate multiple `PetscSF` into one
2225 
2226   Input Parameters:
2227 + comm - the communicator
2228 . nsfs - the number of input `PetscSF`
2229 . sfs  - the array of input `PetscSF`
2230 . rootMode - the root mode specifying how roots are handled
2231 - leafOffsets - the array of local leaf offsets, one for each input `PetscSF`, or `NULL` for contiguous storage
2232 
2233   Output Parameters:
2234 . newsf - The resulting `PetscSF`
2235 
2236   Level: advanced
2237 
2238   Notes:
2239   The communicator of all `PetscSF`s in `sfs` must be comm.
2240 
2241   Leaves are always concatenated locally, keeping them ordered by the input `PetscSF` index and original local order.
2242 
2243   The offsets in `leafOffsets` are added to the original leaf indices.
2244 
2245   If all input SFs use contiguous leaf storage (`ilocal` = `NULL`), `leafOffsets` can be passed as `NULL` as well.
2246   In this case, `NULL` is also passed as `ilocal` to the resulting `PetscSF`.
2247 
2248   If any input `PetscSF` has non-null `ilocal`, `leafOffsets` is needed to distinguish leaves from different input `PetscSF`s.
2249   In this case, user is responsible to provide correct offsets so that the resulting leaves are unique (otherwise an error occurs).
2250 
2251   All root modes retain the essential connectivity condition.
2252   If two leaves of the same input `PetscSF` are connected (sharing the same root), they are also connected in the output `PetscSF`.
2253   Parameter `rootMode` controls how the input root spaces are combined.
2254   For `PETSCSF_CONCATENATE_ROOTMODE_SHARED`, the root space is considered the same for each input `PetscSF` (checked in debug mode)
2255   and is also the same in the output `PetscSF`.
2256   For `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, the input root spaces are taken as separate and joined.
2257   `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` joins the root spaces locally;
2258   roots of sfs[0], sfs[1], sfs[2], ... are joined on each rank separately, ordered by input `PetscSF` and original local index, and renumbered contiguously.
2259   `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL` joins the root spaces globally;
2260   roots of sfs[0], sfs[1], sfs[2, ... are joined globally, ordered by input `PetscSF` index and original global index, and renumbered contiguously;
2261   the original root ranks are ignored.
2262   For both `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`,
2263   the output `PetscSF`'s root layout is such that the local number of roots is a sum of the input `PetscSF`'s local numbers of roots on each rank
2264   to keep the load balancing.
2265   However, for `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, roots can move to different ranks.
2266 
2267    Example:
2268    We can use src/vec/is/sf/tests/ex18.c to compare the root modes. By running
2269 .vb
2270   make -C $PETSC_DIR/src/vec/is/sf/tests ex18
2271   for m in {local,global,shared}; do
2272     mpirun -n 2 $PETSC_DIR/src/vec/is/sf/tests/ex18 -nsfs 2 -n 2 -root_mode $m -sf_view
2273   done
2274 .ve
2275    we generate two identical `PetscSF`s sf_0 and sf_1,
2276 .vb
2277   PetscSF Object: sf_0 2 MPI processes
2278     type: basic
2279     rank #leaves #roots
2280     [ 0]       4      2
2281     [ 1]       4      2
2282     leaves      roots       roots in global numbering
2283     ( 0,  0) <- ( 0,  0)  =   0
2284     ( 0,  1) <- ( 0,  1)  =   1
2285     ( 0,  2) <- ( 1,  0)  =   2
2286     ( 0,  3) <- ( 1,  1)  =   3
2287     ( 1,  0) <- ( 0,  0)  =   0
2288     ( 1,  1) <- ( 0,  1)  =   1
2289     ( 1,  2) <- ( 1,  0)  =   2
2290     ( 1,  3) <- ( 1,  1)  =   3
2291 .ve
2292    and pass them to `PetscSFConcatenate()` along with different choices of `rootMode`, yielding different result_sf:
2293 .vb
2294   rootMode = local:
2295   PetscSF Object: result_sf 2 MPI processes
2296     type: basic
2297     rank #leaves #roots
2298     [ 0]       8      4
2299     [ 1]       8      4
2300     leaves      roots       roots in global numbering
2301     ( 0,  0) <- ( 0,  0)  =   0
2302     ( 0,  1) <- ( 0,  1)  =   1
2303     ( 0,  2) <- ( 1,  0)  =   4
2304     ( 0,  3) <- ( 1,  1)  =   5
2305     ( 0,  4) <- ( 0,  2)  =   2
2306     ( 0,  5) <- ( 0,  3)  =   3
2307     ( 0,  6) <- ( 1,  2)  =   6
2308     ( 0,  7) <- ( 1,  3)  =   7
2309     ( 1,  0) <- ( 0,  0)  =   0
2310     ( 1,  1) <- ( 0,  1)  =   1
2311     ( 1,  2) <- ( 1,  0)  =   4
2312     ( 1,  3) <- ( 1,  1)  =   5
2313     ( 1,  4) <- ( 0,  2)  =   2
2314     ( 1,  5) <- ( 0,  3)  =   3
2315     ( 1,  6) <- ( 1,  2)  =   6
2316     ( 1,  7) <- ( 1,  3)  =   7
2317 
2318   rootMode = global:
2319   PetscSF Object: result_sf 2 MPI processes
2320     type: basic
2321     rank #leaves #roots
2322     [ 0]       8      4
2323     [ 1]       8      4
2324     leaves      roots       roots in global numbering
2325     ( 0,  0) <- ( 0,  0)  =   0
2326     ( 0,  1) <- ( 0,  1)  =   1
2327     ( 0,  2) <- ( 0,  2)  =   2
2328     ( 0,  3) <- ( 0,  3)  =   3
2329     ( 0,  4) <- ( 1,  0)  =   4
2330     ( 0,  5) <- ( 1,  1)  =   5
2331     ( 0,  6) <- ( 1,  2)  =   6
2332     ( 0,  7) <- ( 1,  3)  =   7
2333     ( 1,  0) <- ( 0,  0)  =   0
2334     ( 1,  1) <- ( 0,  1)  =   1
2335     ( 1,  2) <- ( 0,  2)  =   2
2336     ( 1,  3) <- ( 0,  3)  =   3
2337     ( 1,  4) <- ( 1,  0)  =   4
2338     ( 1,  5) <- ( 1,  1)  =   5
2339     ( 1,  6) <- ( 1,  2)  =   6
2340     ( 1,  7) <- ( 1,  3)  =   7
2341 
2342   rootMode = shared:
2343   PetscSF Object: result_sf 2 MPI processes
2344     type: basic
2345     rank #leaves #roots
2346     [ 0]       8      2
2347     [ 1]       8      2
2348     leaves      roots       roots in global numbering
2349     ( 0,  0) <- ( 0,  0)  =   0
2350     ( 0,  1) <- ( 0,  1)  =   1
2351     ( 0,  2) <- ( 1,  0)  =   2
2352     ( 0,  3) <- ( 1,  1)  =   3
2353     ( 0,  4) <- ( 0,  0)  =   0
2354     ( 0,  5) <- ( 0,  1)  =   1
2355     ( 0,  6) <- ( 1,  0)  =   2
2356     ( 0,  7) <- ( 1,  1)  =   3
2357     ( 1,  0) <- ( 0,  0)  =   0
2358     ( 1,  1) <- ( 0,  1)  =   1
2359     ( 1,  2) <- ( 1,  0)  =   2
2360     ( 1,  3) <- ( 1,  1)  =   3
2361     ( 1,  4) <- ( 0,  0)  =   0
2362     ( 1,  5) <- ( 0,  1)  =   1
2363     ( 1,  6) <- ( 1,  0)  =   2
2364     ( 1,  7) <- ( 1,  1)  =   3
2365 .ve
2366 
2367 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFConcatenateRootMode`
2368 @*/
2369 PetscErrorCode PetscSFConcatenate(MPI_Comm comm, PetscInt nsfs, PetscSF sfs[], PetscSFConcatenateRootMode rootMode, PetscInt leafOffsets[], PetscSF *newsf)
2370 {
2371   PetscInt     i, s, nLeaves, nRoots;
2372   PetscInt    *leafArrayOffsets;
2373   PetscInt    *ilocal_new;
2374   PetscSFNode *iremote_new;
2375   PetscBool    all_ilocal_null = PETSC_FALSE;
2376   PetscLayout  glayout         = NULL;
2377   PetscInt    *gremote         = NULL;
2378   PetscMPIInt  rank, size;
2379 
2380   PetscFunctionBegin;
2381   if (PetscDefined(USE_DEBUG)) {
2382     PetscSF dummy; /* just to have a PetscObject on comm for input validation */
2383 
2384     PetscCall(PetscSFCreate(comm, &dummy));
2385     PetscValidLogicalCollectiveInt(dummy, nsfs, 2);
2386     PetscValidPointer(sfs, 3);
2387     for (i = 0; i < nsfs; i++) {
2388       PetscValidHeaderSpecific(sfs[i], PETSCSF_CLASSID, 3);
2389       PetscCheckSameComm(dummy, 1, sfs[i], 3);
2390     }
2391     PetscValidLogicalCollectiveEnum(dummy, rootMode, 4);
2392     if (leafOffsets) PetscValidIntPointer(leafOffsets, 5);
2393     PetscValidPointer(newsf, 6);
2394     PetscCall(PetscSFDestroy(&dummy));
2395   }
2396   if (!nsfs) {
2397     PetscCall(PetscSFCreate(comm, newsf));
2398     PetscCall(PetscSFSetGraph(*newsf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
2399     PetscFunctionReturn(PETSC_SUCCESS);
2400   }
2401   PetscCallMPI(MPI_Comm_rank(comm, &rank));
2402   PetscCallMPI(MPI_Comm_size(comm, &size));
2403 
2404   /* Calculate leaf array offsets */
2405   PetscCall(PetscMalloc1(nsfs + 1, &leafArrayOffsets));
2406   leafArrayOffsets[0] = 0;
2407   for (s = 0; s < nsfs; s++) {
2408     PetscInt nl;
2409 
2410     PetscCall(PetscSFGetGraph(sfs[s], NULL, &nl, NULL, NULL));
2411     leafArrayOffsets[s + 1] = leafArrayOffsets[s] + nl;
2412   }
2413   nLeaves = leafArrayOffsets[nsfs];
2414 
2415   /* Calculate number of roots */
2416   switch (rootMode) {
2417   case PETSCSF_CONCATENATE_ROOTMODE_SHARED: {
2418     PetscCall(PetscSFGetGraph(sfs[0], &nRoots, NULL, NULL, NULL));
2419     if (PetscDefined(USE_DEBUG)) {
2420       for (s = 1; s < nsfs; s++) {
2421         PetscInt nr;
2422 
2423         PetscCall(PetscSFGetGraph(sfs[s], &nr, NULL, NULL, NULL));
2424         PetscCheck(nr == nRoots, comm, PETSC_ERR_ARG_SIZ, "rootMode = %s but sfs[%" PetscInt_FMT "] has a different number of roots (%" PetscInt_FMT ") than sfs[0] (%" PetscInt_FMT ")", PetscSFConcatenateRootModes[rootMode], s, nr, nRoots);
2425       }
2426     }
2427   } break;
2428   case PETSCSF_CONCATENATE_ROOTMODE_GLOBAL: {
2429     /* Calculate also global layout in this case */
2430     PetscInt    *nls;
2431     PetscLayout *lts;
2432     PetscInt   **inds;
2433     PetscInt     j;
2434     PetscInt     rootOffset = 0;
2435 
2436     PetscCall(PetscCalloc3(nsfs, &lts, nsfs, &nls, nsfs, &inds));
2437     PetscCall(PetscLayoutCreate(comm, &glayout));
2438     glayout->bs = 1;
2439     glayout->n  = 0;
2440     glayout->N  = 0;
2441     for (s = 0; s < nsfs; s++) {
2442       PetscCall(PetscSFGetGraphLayout(sfs[s], &lts[s], &nls[s], NULL, &inds[s]));
2443       glayout->n += lts[s]->n;
2444       glayout->N += lts[s]->N;
2445     }
2446     PetscCall(PetscLayoutSetUp(glayout));
2447     PetscCall(PetscMalloc1(nLeaves, &gremote));
2448     for (s = 0, j = 0; s < nsfs; s++) {
2449       for (i = 0; i < nls[s]; i++, j++) gremote[j] = inds[s][i] + rootOffset;
2450       rootOffset += lts[s]->N;
2451       PetscCall(PetscLayoutDestroy(&lts[s]));
2452       PetscCall(PetscFree(inds[s]));
2453     }
2454     PetscCall(PetscFree3(lts, nls, inds));
2455     nRoots = glayout->N;
2456   } break;
2457   case PETSCSF_CONCATENATE_ROOTMODE_LOCAL:
2458     /* nRoots calculated later in this case */
2459     break;
2460   default:
2461     SETERRQ(comm, PETSC_ERR_ARG_WRONG, "Invalid PetscSFConcatenateRootMode %d", rootMode);
2462   }
2463 
2464   if (!leafOffsets) {
2465     all_ilocal_null = PETSC_TRUE;
2466     for (s = 0; s < nsfs; s++) {
2467       const PetscInt *ilocal;
2468 
2469       PetscCall(PetscSFGetGraph(sfs[s], NULL, NULL, &ilocal, NULL));
2470       if (ilocal) {
2471         all_ilocal_null = PETSC_FALSE;
2472         break;
2473       }
2474     }
2475     PetscCheck(all_ilocal_null, PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "leafOffsets can be passed as NULL only if all SFs have ilocal = NULL");
2476   }
2477 
2478   /* Renumber and concatenate local leaves */
2479   ilocal_new = NULL;
2480   if (!all_ilocal_null) {
2481     PetscCall(PetscMalloc1(nLeaves, &ilocal_new));
2482     for (i = 0; i < nLeaves; i++) ilocal_new[i] = -1;
2483     for (s = 0; s < nsfs; s++) {
2484       const PetscInt *ilocal;
2485       PetscInt       *ilocal_l = &ilocal_new[leafArrayOffsets[s]];
2486       PetscInt        i, nleaves_l;
2487 
2488       PetscCall(PetscSFGetGraph(sfs[s], NULL, &nleaves_l, &ilocal, NULL));
2489       for (i = 0; i < nleaves_l; i++) ilocal_l[i] = (ilocal ? ilocal[i] : i) + leafOffsets[s];
2490     }
2491   }
2492 
2493   /* Renumber and concatenate remote roots */
2494   if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL || rootMode == PETSCSF_CONCATENATE_ROOTMODE_SHARED) {
2495     PetscInt rootOffset = 0;
2496 
2497     PetscCall(PetscMalloc1(nLeaves, &iremote_new));
2498     for (i = 0; i < nLeaves; i++) {
2499       iremote_new[i].rank  = -1;
2500       iremote_new[i].index = -1;
2501     }
2502     for (s = 0; s < nsfs; s++) {
2503       PetscInt           i, nl, nr;
2504       PetscSF            tmp_sf;
2505       const PetscSFNode *iremote;
2506       PetscSFNode       *tmp_rootdata;
2507       PetscSFNode       *tmp_leafdata = &iremote_new[leafArrayOffsets[s]];
2508 
2509       PetscCall(PetscSFGetGraph(sfs[s], &nr, &nl, NULL, &iremote));
2510       PetscCall(PetscSFCreate(comm, &tmp_sf));
2511       /* create helper SF with contiguous leaves */
2512       PetscCall(PetscSFSetGraph(tmp_sf, nr, nl, NULL, PETSC_USE_POINTER, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
2513       PetscCall(PetscSFSetUp(tmp_sf));
2514       PetscCall(PetscMalloc1(nr, &tmp_rootdata));
2515       if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) {
2516         for (i = 0; i < nr; i++) {
2517           tmp_rootdata[i].index = i + rootOffset;
2518           tmp_rootdata[i].rank  = (PetscInt)rank;
2519         }
2520         rootOffset += nr;
2521       } else {
2522         for (i = 0; i < nr; i++) {
2523           tmp_rootdata[i].index = i;
2524           tmp_rootdata[i].rank  = (PetscInt)rank;
2525         }
2526       }
2527       PetscCall(PetscSFBcastBegin(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
2528       PetscCall(PetscSFBcastEnd(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
2529       PetscCall(PetscSFDestroy(&tmp_sf));
2530       PetscCall(PetscFree(tmp_rootdata));
2531     }
2532     if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) nRoots = rootOffset; // else nRoots already calculated above
2533 
2534     /* Build the new SF */
2535     PetscCall(PetscSFCreate(comm, newsf));
2536     PetscCall(PetscSFSetGraph(*newsf, nRoots, nLeaves, ilocal_new, PETSC_OWN_POINTER, iremote_new, PETSC_OWN_POINTER));
2537   } else {
2538     /* Build the new SF */
2539     PetscCall(PetscSFCreate(comm, newsf));
2540     PetscCall(PetscSFSetGraphLayout(*newsf, glayout, nLeaves, ilocal_new, PETSC_OWN_POINTER, gremote));
2541   }
2542   PetscCall(PetscSFSetUp(*newsf));
2543   PetscCall(PetscSFViewFromOptions(*newsf, NULL, "-sf_concat_view"));
2544   PetscCall(PetscLayoutDestroy(&glayout));
2545   PetscCall(PetscFree(gremote));
2546   PetscCall(PetscFree(leafArrayOffsets));
2547   PetscFunctionReturn(PETSC_SUCCESS);
2548 }
2549