/* xref: /petsc/src/vec/is/sf/interface/sf.c (revision 66c9fbdd036b1e887ebf0d2bef6dbdcafd086d45) */
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petsc/private/viewerimpl.h>
4 #include <petscctable.h>
5 
6 #if defined(PETSC_HAVE_CUDA)
7   #include <cuda_runtime.h>
8 #endif
9 
10 #if defined(PETSC_HAVE_HIP)
11   #include <hip/hip_runtime.h>
12 #endif
13 
#if defined(PETSC_CLANG_STATIC_ANALYZER)
void PetscSFCheckGraphSet(PetscSF, int);
#else
  #if defined(PETSC_USE_DEBUG)
    /* Debug builds: error out if the graph of sf has not been set yet.
       NOTE: no trailing semicolon in the expansion (CERT PRE11-C); the call-site semicolon
       completes the statement, so the macro is safe inside un-braced if/else and does not
       produce an empty-statement ';;'. */
    #define PetscSFCheckGraphSet(sf, arg) PetscCheck((sf)->graphset, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %d \"%s\" before %s()", (arg), #sf, PETSC_FUNCTION_NAME)
  #else
    /* Optimized builds: the check compiles away to a no-op statement */
    #define PetscSFCheckGraphSet(sf, arg) \
      do { \
      } while (0)
  #endif
#endif
25 
/* Printable names for the PetscSFDuplicateOption enum (used by options processing and viewers) */
const char *const PetscSFDuplicateOptions[] = {"CONFONLY", "RANKS", "GRAPH", "PetscSFDuplicateOption", "PETSCSF_DUPLICATE_", NULL};
27 
28 /*@
29    PetscSFCreate - create a star forest communication context
30 
31    Collective
32 
33    Input Parameter:
34 .  comm - communicator on which the star forest will operate
35 
36    Output Parameter:
37 .  sf - new star forest context
38 
39    Options Database Keys:
40 .  -sf_type type - value of type may be
41 .vb
42     basic     -Use MPI persistent Isend/Irecv for communication (Default)
43     window    -Use MPI-3 one-sided window for communication
44     neighbor  -Use MPI-3 neighborhood collectives for communication
45 .ve
46 
47    Level: intermediate
48 
49    Note:
50    When one knows the communication graph is one of the predefined graphs, such as `MPI_Alltoall()`, `MPI_Allgatherv()`,
51    `MPI_Gatherv()`, one can create a `PetscSF` and then set its graph with `PetscSFSetGraphWithPattern()`. These special
52    `SF`s are optimized and they have better performance than general `SF`s.
53 
54 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFSetGraphWithPattern()`, `PetscSFDestroy()`
55 @*/
PetscErrorCode PetscSFCreate(MPI_Comm comm, PetscSF *sf)
{
  PetscSF b;

  PetscFunctionBegin;
  PetscValidPointer(sf, 2);
  PetscCall(PetscSFInitializePackage());

  PetscCall(PetscHeaderCreate(b, PETSCSF_CLASSID, "PetscSF", "Star Forest", "PetscSF", comm, PetscSFDestroy, PetscSFView));

  /* Negative counts and an inverted [minleaf, maxleaf] range mark the graph as not yet set */
  b->nroots    = -1;
  b->nleaves   = -1;
  b->minleaf   = PETSC_MAX_INT;
  b->maxleaf   = PETSC_MIN_INT;
  b->nranks    = -1;
  b->rankorder = PETSC_TRUE;
  b->ingroup   = MPI_GROUP_NULL;
  b->outgroup  = MPI_GROUP_NULL;
  b->graphset  = PETSC_FALSE;
#if defined(PETSC_HAVE_DEVICE)
  b->use_gpu_aware_mpi    = use_gpu_aware_mpi; /* inherit the library-wide default */
  b->use_stream_aware_mpi = PETSC_FALSE;
  b->unknown_input_stream = PETSC_FALSE;
  #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
  b->backend = PETSCSF_BACKEND_KOKKOS;
  #elif defined(PETSC_HAVE_CUDA)
  b->backend = PETSCSF_BACKEND_CUDA;
  #elif defined(PETSC_HAVE_HIP)
  b->backend = PETSCSF_BACKEND_HIP;
  #endif

  #if defined(PETSC_HAVE_NVSHMEM)
  b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
  b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem", &b->use_nvshmem, NULL));
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem_get", &b->use_nvshmem_get, NULL));
  #endif
#endif
  /* -1 marks the embedded VecScatter sizes as not yet set */
  b->vscat.from_n = -1;
  b->vscat.to_n   = -1;
  b->vscat.unit   = MPIU_SCALAR;
  *sf             = b;
  PetscFunctionReturn(0);
}
100 
101 /*@
102    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
103 
104    Collective
105 
106    Input Parameter:
107 .  sf - star forest
108 
109    Level: advanced
110 
111 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetGraph()`, `PetscSFDestroy()`
112 @*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* Let the implementation release its own data before we tear down the generic graph state */
  PetscTryTypeMethod(sf, Reset);
  /* Return the graph fields to their PetscSFCreate() defaults */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  /* mine/remote may alias mine_alloc/remote_alloc (owned storage); free only the owned copies */
  PetscCall(PetscFree(sf->mine_alloc));
  PetscCall(PetscFree(sf->remote_alloc));
  sf->nranks = -1;
  PetscCall(PetscFree4(sf->ranks, sf->roffset, sf->rmine, sf->rremote));
  sf->degreeknown = PETSC_FALSE;
  PetscCall(PetscFree(sf->degree));
  if (sf->ingroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->ingroup));
  if (sf->outgroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->outgroup));
  /* Break the back-reference before destroying the multi-SF so it does not point at freed memory */
  if (sf->multi) sf->multi->multi = NULL;
  PetscCall(PetscSFDestroy(&sf->multi));
  PetscCall(PetscLayoutDestroy(&sf->map));

#if defined(PETSC_HAVE_DEVICE)
  for (PetscInt i = 0; i < 2; i++) PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, sf->rmine_d[i]));
#endif

  /* A subsequent PetscSFSetUp() must rebuild the communication structures */
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
144 
145 /*@C
146    PetscSFSetType - Set the `PetscSF` communication implementation
147 
148    Collective
149 
150    Input Parameters:
151 +  sf - the `PetscSF` context
152 -  type - a known method
153 .vb
154     PETSCSFWINDOW - MPI-2/3 one-sided
155     PETSCSFBASIC - basic implementation using MPI-1 two-sided
156 .ve
157 
158    Options Database Key:
159 .  -sf_type <type> - Sets the communication method; for example, basic or window. Use -help for a list of available methods (for instance, window, basic, neighbor)
160 
161   Level: intermediate
162 
163    Notes:
164    See "include/petscsf.h" for a list of available methods
165 
166 .seealso: `PetscSFType`, `PetscSFCreate()`
167 @*/
168 PetscErrorCode PetscSFSetType(PetscSF sf, PetscSFType type)
169 {
170   PetscBool match;
171   PetscErrorCode (*r)(PetscSF);
172 
173   PetscFunctionBegin;
174   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
175   PetscValidCharPointer(type, 2);
176 
177   PetscCall(PetscObjectTypeCompare((PetscObject)sf, type, &match));
178   if (match) PetscFunctionReturn(0);
179 
180   PetscCall(PetscFunctionListFind(PetscSFList, type, &r));
181   PetscCheck(r, PETSC_COMM_SELF, PETSC_ERR_ARG_UNKNOWN_TYPE, "Unable to find requested PetscSF type %s", type);
182   /* Destroy the previous PetscSF implementation context */
183   PetscTryTypeMethod(sf, Destroy);
184   PetscCall(PetscMemzero(sf->ops, sizeof(*sf->ops)));
185   PetscCall(PetscObjectChangeTypeName((PetscObject)sf, type));
186   PetscCall((*r)(sf));
187   PetscFunctionReturn(0);
188 }
189 
190 /*@C
191   PetscSFGetType - Get the `PetscSF` communication implementation
192 
193   Not Collective
194 
195   Input Parameter:
196 . sf  - the `PetscSF` context
197 
198   Output Parameter:
199 . type - the `PetscSF` type name
200 
201   Level: intermediate
202 
203 .seealso: `PetscSFType`, `PetscSFSetType()`, `PetscSFCreate()`
204 @*/
PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidPointer(type, 2);
  /* Returns the stored type name; NULL if PetscSFSetType() has not been called yet */
  *type = ((PetscObject)sf)->type_name;
  PetscFunctionReturn(0);
}
213 
214 /*@C
215    PetscSFDestroy - destroy star forest
216 
217    Collective
218 
219    Input Parameter:
220 .  sf - address of star forest
221 
222    Level: intermediate
223 
224 .seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFReset()`
225 @*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0);
  PetscValidHeaderSpecific((*sf), PETSCSF_CLASSID, 1);
  /* If other references remain, just drop ours and zero the caller's pointer */
  if (--((PetscObject)(*sf))->refct > 0) {
    *sf = NULL;
    PetscFunctionReturn(0);
  }
  PetscCall(PetscSFReset(*sf));
  /* Implementation-specific destruction beyond what Reset releases */
  PetscTryTypeMethod((*sf), Destroy);
  PetscCall(PetscSFDestroy(&(*sf)->vscat.lsf)); /* destroy the on-demand local SF, if built */
  /* vscat.unit is a duplicated datatype only when bs > 1 (cf. PetscSFDuplicate()) */
  if ((*sf)->vscat.bs > 1) PetscCallMPI(MPI_Type_free(&(*sf)->vscat.unit));
  PetscCall(PetscHeaderDestroy(sf));
  PetscFunctionReturn(0);
}
242 
243 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
244 {
245   PetscInt           i, nleaves;
246   PetscMPIInt        size;
247   const PetscInt    *ilocal;
248   const PetscSFNode *iremote;
249 
250   PetscFunctionBegin;
251   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
252   PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, &iremote));
253   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
254   for (i = 0; i < nleaves; i++) {
255     const PetscInt rank   = iremote[i].rank;
256     const PetscInt remote = iremote[i].index;
257     const PetscInt leaf   = ilocal ? ilocal[i] : i;
258     PetscCheck(rank >= 0 && rank < size, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided rank (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be in [0, %d)", rank, i, size);
259     PetscCheck(remote >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided index (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be >= 0", remote, i);
260     PetscCheck(leaf >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided location (%" PetscInt_FMT ") for leaf %" PetscInt_FMT " is invalid, should be >= 0", leaf, i);
261   }
262   PetscFunctionReturn(0);
263 }
264 
265 /*@
266    PetscSFSetUp - set up communication structures
267 
268    Collective
269 
270    Input Parameter:
271 .  sf - star forest communication object
272 
273    Level: beginner
274 
275 .seealso: `PetscSFType`, `PetscSFSetFromOptions()`, `PetscSFSetType()`
276 @*/
277 PetscErrorCode PetscSFSetUp(PetscSF sf)
278 {
279   PetscFunctionBegin;
280   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
281   PetscSFCheckGraphSet(sf, 1);
282   if (sf->setupcalled) PetscFunctionReturn(0);
283   PetscCall(PetscLogEventBegin(PETSCSF_SetUp, sf, 0, 0, 0));
284   PetscCall(PetscSFCheckGraphValid_Private(sf));
285   if (!((PetscObject)sf)->type_name) PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); /* Zero all sf->ops */
286   PetscTryTypeMethod(sf, SetUp);
287 #if defined(PETSC_HAVE_CUDA)
288   if (sf->backend == PETSCSF_BACKEND_CUDA) {
289     sf->ops->Malloc = PetscSFMalloc_CUDA;
290     sf->ops->Free   = PetscSFFree_CUDA;
291   }
292 #endif
293 #if defined(PETSC_HAVE_HIP)
294   if (sf->backend == PETSCSF_BACKEND_HIP) {
295     sf->ops->Malloc = PetscSFMalloc_HIP;
296     sf->ops->Free   = PetscSFFree_HIP;
297   }
298 #endif
299 
300 #
301 #if defined(PETSC_HAVE_KOKKOS)
302   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
303     sf->ops->Malloc = PetscSFMalloc_Kokkos;
304     sf->ops->Free   = PetscSFFree_Kokkos;
305   }
306 #endif
307   PetscCall(PetscLogEventEnd(PETSCSF_SetUp, sf, 0, 0, 0));
308   sf->setupcalled = PETSC_TRUE;
309   PetscFunctionReturn(0);
310 }
311 
312 /*@
313    PetscSFSetFromOptions - set `PetscSF` options using the options database
314 
315    Logically Collective
316 
317    Input Parameter:
318 .  sf - star forest
319 
320    Options Database Keys:
321 +  -sf_type               - implementation type, see PetscSFSetType()
322 .  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
323 .  -sf_use_default_stream - Assume callers of SF computed the input root/leafdata with the default cuda stream. SF will also
324                             use the default stream to process data. Therefore, no stream synchronization is needed between SF and its caller (default: true).
325                             If true, this option only works with -use_gpu_aware_mpi 1.
326 .  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
327                                If true, this option only works with -use_gpu_aware_mpi 1.
328 
329 -  -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently SF has these backends: cuda, hip and Kokkos.
330                               On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
331                               the only available is kokkos.
332 
333    Level: intermediate
334 
335 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetType()`
336 @*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType deft; /* default type: the current type if set, otherwise basic */
  char        type[256];
  PetscBool   flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscObjectOptionsBegin((PetscObject)sf);
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  PetscCall(PetscOptionsFList("-sf_type", "PetscSF implementation type", "PetscSFSetType", PetscSFList, deft, type, sizeof(type), &flg));
  /* Always (re)set the type so the implementation gets a chance to read its own options below */
  PetscCall(PetscSFSetType(sf, flg ? type : deft));
  PetscCall(PetscOptionsBool("-sf_rank_order", "sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise", "PetscSFSetRankOrder", sf->rankorder, &sf->rankorder, NULL));
#if defined(PETSC_HAVE_DEVICE)
  {
    char      backendstr[32] = {0};
    PetscBool isCuda = PETSC_FALSE, isHip = PETSC_FALSE, isKokkos = PETSC_FALSE, set;
    /* Change the defaults set in PetscSFCreate() with command line options */
    PetscCall(PetscOptionsBool("-sf_unknown_input_stream", "SF root/leafdata is computed on arbitrary streams unknown to SF", "PetscSFSetFromOptions", sf->unknown_input_stream, &sf->unknown_input_stream, NULL));
    PetscCall(PetscOptionsBool("-sf_use_stream_aware_mpi", "Assume the underlying MPI is cuda-stream aware", "PetscSFSetFromOptions", sf->use_stream_aware_mpi, &sf->use_stream_aware_mpi, NULL));
    PetscCall(PetscOptionsString("-sf_backend", "Select the device backend SF uses", "PetscSFSetFromOptions", NULL, backendstr, sizeof(backendstr), &set));
    /* Backend name is matched case-insensitively */
    PetscCall(PetscStrcasecmp("cuda", backendstr, &isCuda));
    PetscCall(PetscStrcasecmp("kokkos", backendstr, &isKokkos));
    PetscCall(PetscStrcasecmp("hip", backendstr, &isHip));
  #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
    if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
    else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
    else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
    else PetscCheck(!set, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
  #elif defined(PETSC_HAVE_KOKKOS)
    /* Kokkos is the only backend available here; reject anything else the user asked for */
    PetscCheck(!set || isKokkos, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You can only choose kokkos", backendstr);
  #endif
  }
#endif
  /* Let the implementation read its type-specific options */
  PetscTryTypeMethod(sf, SetFromOptions, PetscOptionsObject);
  PetscOptionsEnd();
  PetscFunctionReturn(0);
}
375 
376 /*@
377    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
378 
379    Logically Collective
380 
381    Input Parameters:
382 +  sf - star forest
383 -  flg - `PETSC_TRUE` to sort, `PETSC_FALSE` to skip sorting (lower setup cost, but non-deterministic)
384 
385    Level: advanced
386 
387 .seealso: `PetscSFType`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`
388 @*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf, PetscBool flg)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveBool(sf, flg, 2);
  /* The multi-SF encodes the ordering; once it has been built the ordering can no longer change */
  PetscCheck(!sf->multi, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(0);
}
398 
399 /*@C
400    PetscSFSetGraph - Set a parallel star forest
401 
402    Collective
403 
404    Input Parameters:
405 +  sf - star forest
406 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
407 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
408 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
409 during setup in debug mode)
410 .  localmode - copy mode for ilocal
411 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
412 during setup in debug mode)
413 -  remotemode - copy mode for iremote
414 
415    Level: intermediate
416 
417    Notes:
418    Leaf indices in ilocal must be unique, otherwise an error occurs.
419 
420    Input arrays ilocal and iremote follow the PetscCopyMode semantics.
421    In particular, if localmode/remotemode is `PETSC_OWN_POINTER` or `PETSC_USE_POINTER`,
422    PETSc might modify the respective array;
423    if `PETSC_USE_POINTER`, the user must delete the array after PetscSFDestroy().
424    Only if `PETSC_COPY_VALUES` is used, the respective array is guaranteed to stay intact and a const array can be passed (but a cast to non-const is needed).
425 
426    Fortran Note:
427    In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode.
428 
429    Developer Note:
430    We sort leaves to check for duplicates and contiguousness and to find minleaf/maxleaf.
431    This also allows to compare leaf sets of two SFs easily.
432 
433 .seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
434 @*/
PetscErrorCode PetscSFSetGraph(PetscSF sf, PetscInt nroots, PetscInt nleaves, PetscInt *ilocal, PetscCopyMode localmode, PetscSFNode *iremote, PetscCopyMode remotemode)
{
  PetscBool unique, contiguous;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal, 4);
  if (nleaves > 0) PetscValidPointer(iremote, 6);
  PetscCheck(nroots >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nroots %" PetscInt_FMT ", cannot be negative", nroots);
  PetscCheck(nleaves >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nleaves %" PetscInt_FMT ", cannot be negative", nleaves);
  /* enums may be handled as unsigned by some compilers, NVHPC for example, the int cast
   * below is to prevent NVHPC from warning about meaningless comparison of unsigned with zero */
  PetscCheck((int)localmode >= PETSC_COPY_VALUES && localmode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong localmode %d", localmode);
  PetscCheck((int)remotemode >= PETSC_COPY_VALUES && remotemode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong remotemode %d", remotemode);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    PetscCall(PetscSFReset(sf));
  }

  PetscCall(PetscLogEventBegin(PETSCSF_SetGraph, sf, 0, 0, 0));

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  /* With PETSC_COPY_VALUES, work on private copies so the caller's arrays stay intact;
     from here on ilocal/iremote point at storage this function may sort and may own */
  if (localmode == PETSC_COPY_VALUES && ilocal) {
    PetscInt *tlocal = NULL;

    PetscCall(PetscMalloc1(nleaves, &tlocal));
    PetscCall(PetscArraycpy(tlocal, ilocal, nleaves));
    ilocal = tlocal;
  }
  if (remotemode == PETSC_COPY_VALUES) {
    PetscSFNode *tremote = NULL;

    PetscCall(PetscMalloc1(nleaves, &tremote));
    PetscCall(PetscArraycpy(tremote, iremote, nleaves));
    iremote = tremote;
  }

  if (nleaves && ilocal) {
    PetscSFNode work;

    /* Sort leaves (dragging iremote along) to detect duplicates and contiguity and to
       obtain minleaf/maxleaf cheaply */
    PetscCall(PetscSortIntWithDataArray(nleaves, ilocal, iremote, sizeof(PetscSFNode), &work));
    PetscCall(PetscSortedCheckDupsInt(nleaves, ilocal, &unique));
    unique = PetscNot(unique); /* invert: the check reports whether duplicates exist */
    PetscCheck(sf->allow_multi_leaves || unique, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Input ilocal has duplicate entries which is not allowed for this PetscSF");
    sf->minleaf = ilocal[0];
    sf->maxleaf = ilocal[nleaves - 1];
    contiguous  = (PetscBool)(unique && ilocal[0] == 0 && ilocal[nleaves - 1] == nleaves - 1);
  } else {
    /* NULL ilocal means the leaves already occupy [0, nleaves) contiguously */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
    unique      = PETSC_TRUE;
    contiguous  = PETSC_TRUE;
  }

  /* Contiguous leaves are represented implicitly by sf->mine == NULL; drop any explicit ilocal */
  if (contiguous) {
    if (localmode == PETSC_USE_POINTER) {
      ilocal = NULL;
    } else {
      PetscCall(PetscFree(ilocal));
    }
  }
  sf->mine = ilocal;
  if (localmode == PETSC_USE_POINTER) {
    sf->mine_alloc = NULL; /* not owned by this SF; the caller remains responsible for freeing */
  } else {
    sf->mine_alloc = ilocal;
  }
  sf->remote = iremote;
  if (remotemode == PETSC_USE_POINTER) {
    sf->remote_alloc = NULL;
  } else {
    sf->remote_alloc = iremote;
  }
  PetscCall(PetscLogEventEnd(PETSCSF_SetGraph, sf, 0, 0, 0));
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
514 
515 /*@
516   PetscSFSetGraphWithPattern - Sets the graph of a `PetscSF` with a specific pattern
517 
518   Collective
519 
520   Input Parameters:
521 + sf      - The `PetscSF`
522 . map     - Layout of roots over all processes (insignificant when pattern is `PETSCSF_PATTERN_ALLTOALL`)
523 - pattern - One of `PETSCSF_PATTERN_ALLGATHER`, `PETSCSF_PATTERN_GATHER`, `PETSCSF_PATTERN_ALLTOALL`
524 
525   Level: intermediate
526 
527   Notes:
528   It is easier to explain `PetscSFPattern` using vectors. Suppose we have an MPI vector x and its layout is map.
529   n and N are local and global sizes of x respectively.
530 
531   With `PETSCSF_PATTERN_ALLGATHER`, the routine creates a graph that if one does Bcast on it, it will copy x to
532   sequential vectors y on all ranks.
533 
534   With `PETSCSF_PATTERN_GATHER`, the routine creates a graph that if one does Bcast on it, it will copy x to a
535   sequential vector y on rank 0.
536 
537   In above cases, entries of x are roots and entries of y are leaves.
538 
539   With `PETSCSF_PATTERN_ALLTOALL`, map is insignificant. Suppose NP is size of sf's communicator. The routine
540   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
541   of rank j. Here 0 <=i,j<NP. It is a kind of `MPI_Alltoall()` with sendcount/recvcount being 1. Note that it does
542   not mean one cannot send multiple items. One just needs to create a new MPI datatype for the multiple data
543   items with `MPI_Type_contiguous` and use that as the <unit> argument in SF routines.
544 
545   In this case, roots and leaves are symmetric.
546 
547 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
548  @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf, PetscLayout map, PetscSFPattern pattern)
{
  MPI_Comm    comm;
  PetscInt    n, N, res[2];
  PetscMPIInt rank, size;
  PetscSFType type;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  if (pattern != PETSCSF_PATTERN_ALLTOALL) PetscValidPointer(map, 2);
  PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
  PetscCheck(pattern >= PETSCSF_PATTERN_ALLGATHER && pattern <= PETSCSF_PATTERN_ALLTOALL, comm, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported PetscSFPattern %d", pattern);
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* The user-provided map is ignored; build a layout with `size` roots per rank */
    type = PETSCSFALLTOALL;
    PetscCall(PetscLayoutCreate(comm, &sf->map));
    PetscCall(PetscLayoutSetLocalSize(sf->map, size));
    PetscCall(PetscLayoutSetSize(sf->map, ((PetscInt)size) * size));
    PetscCall(PetscLayoutSetUp(sf->map));
  } else {
    PetscCall(PetscLayoutGetLocalSize(map, &n));
    PetscCall(PetscLayoutGetSize(map, &N));
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    PetscCall(MPIU_Allreduce(MPI_IN_PLACE, res, 2, MPIU_INT, MPI_MAX, comm));
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER : PETSCSFGATHER;
    } else {
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    PetscCall(PetscLayoutReference(map, &sf->map));
  }
  PetscCall(PetscSFSetType(sf, type));

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    /* Every rank receives all N roots as leaves */
    sf->nleaves = N;
    sf->nroots  = n;
    sf->nranks  = size;
    sf->minleaf = 0;
    sf->maxleaf = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves (all N of them); other ranks contribute roots only */
    sf->nleaves = rank ? 0 : N;
    sf->nroots  = n;
    sf->nranks  = rank ? 0 : size;
    sf->minleaf = 0;
    sf->maxleaf = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* Symmetric: every rank has `size` roots and `size` leaves */
    sf->nleaves = size;
    sf->nroots  = size;
    sf->nranks  = size;
    sf->minleaf = 0;
    sf->maxleaf = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
615 
616 /*@
617    PetscSFCreateInverseSF - given a `PetscSF` in which all vertices have degree 1, creates the inverse map
618 
619    Collective
620 
621    Input Parameter:
622 .  sf - star forest to invert
623 
624    Output Parameter:
625 .  isf - inverse of sf
626 
627    Level: advanced
628 
629    Notes:
630    All roots must have degree 1.
631 
632    The local space may be a permutation, but cannot be sparse.
633 
634 .seealso: `PetscSFType`, `PetscSFSetGraph()`
635 @*/
636 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf, PetscSF *isf)
637 {
638   PetscMPIInt     rank;
639   PetscInt        i, nroots, nleaves, maxlocal, count, *newilocal;
640   const PetscInt *ilocal;
641   PetscSFNode    *roots, *leaves;
642 
643   PetscFunctionBegin;
644   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
645   PetscSFCheckGraphSet(sf, 1);
646   PetscValidPointer(isf, 2);
647 
648   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, NULL));
649   maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
650 
651   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
652   PetscCall(PetscMalloc2(nroots, &roots, maxlocal, &leaves));
653   for (i = 0; i < maxlocal; i++) {
654     leaves[i].rank  = rank;
655     leaves[i].index = i;
656   }
657   for (i = 0; i < nroots; i++) {
658     roots[i].rank  = -1;
659     roots[i].index = -1;
660   }
661   PetscCall(PetscSFReduceBegin(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
662   PetscCall(PetscSFReduceEnd(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
663 
664   /* Check whether our leaves are sparse */
665   for (i = 0, count = 0; i < nroots; i++)
666     if (roots[i].rank >= 0) count++;
667   if (count == nroots) newilocal = NULL;
668   else { /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */ PetscCall(PetscMalloc1(count, &newilocal));
669     for (i = 0, count = 0; i < nroots; i++) {
670       if (roots[i].rank >= 0) {
671         newilocal[count]   = i;
672         roots[count].rank  = roots[i].rank;
673         roots[count].index = roots[i].index;
674         count++;
675       }
676     }
677   }
678 
679   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, isf));
680   PetscCall(PetscSFSetGraph(*isf, maxlocal, count, newilocal, PETSC_OWN_POINTER, roots, PETSC_COPY_VALUES));
681   PetscCall(PetscFree2(roots, leaves));
682   PetscFunctionReturn(0);
683 }
684 
685 /*@
686    PetscSFDuplicate - duplicate a `PetscSF`, optionally preserving rank connectivity and graph
687 
688    Collective
689 
690    Input Parameters:
691 +  sf - communication object to duplicate
692 -  opt - `PETSCSF_DUPLICATE_CONFONLY`, `PETSCSF_DUPLICATE_RANKS`, or `PETSCSF_DUPLICATE_GRAPH` (see `PetscSFDuplicateOption`)
693 
694    Output Parameter:
695 .  newsf - new communication object
696 
697    Level: beginner
698 
699 .seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFSetType()`, `PetscSFSetGraph()`
700 @*/
PetscErrorCode PetscSFDuplicate(PetscSF sf, PetscSFDuplicateOption opt, PetscSF *newsf)
{
  PetscSFType  type;
  MPI_Datatype dtype = MPIU_SCALAR;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveEnum(sf, opt, 2);
  PetscValidPointer(newsf, 3);
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sf), newsf));
  PetscCall(PetscSFGetType(sf, &type));
  if (type) PetscCall(PetscSFSetType(*newsf, type));
  (*newsf)->allow_multi_leaves = sf->allow_multi_leaves; /* Dup this flag earlier since PetscSFSetGraph() below checks on this flag */
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf, 1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graph: deep-copy ilocal/iremote into the new SF */
      PetscInt           nroots, nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
      PetscCall(PetscSFSetGraph(*newsf, nroots, nleaves, (PetscInt *)ilocal, PETSC_COPY_VALUES, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
    } else {
      /* Patterned graph: rebuild from the stored layout and pattern */
      PetscCall(PetscSFSetGraphWithPattern(*newsf, sf->map, sf->pattern));
    }
  }
  /* Since oldtype is committed, so is newtype, according to MPI */
  if (sf->vscat.bs > 1) PetscCallMPI(MPI_Type_dup(sf->vscat.unit, &dtype));
  (*newsf)->vscat.bs     = sf->vscat.bs;
  (*newsf)->vscat.unit   = dtype;
  (*newsf)->vscat.to_n   = sf->vscat.to_n;
  (*newsf)->vscat.from_n = sf->vscat.from_n;
  /* Do not copy lsf. Build it on demand since it is rarely used */

#if defined(PETSC_HAVE_DEVICE)
  (*newsf)->backend              = sf->backend;
  (*newsf)->unknown_input_stream = sf->unknown_input_stream;
  (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
  (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
#endif
  /* Let the implementation copy its type-specific state */
  PetscTryTypeMethod(sf, Duplicate, opt, *newsf);
  /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
  PetscFunctionReturn(0);
}
744 
745 /*@C
746    PetscSFGetGraph - Get the graph specifying a parallel star forest
747 
748    Not Collective
749 
750    Input Parameter:
751 .  sf - star forest
752 
753    Output Parameters:
754 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
755 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
756 .  ilocal - locations of leaves in leafdata buffers (if returned value is NULL, it means leaves are in contiguous storage)
757 -  iremote - remote locations of root vertices for each leaf on the current process
758 
759    Level: intermediate
760 
761    Notes:
762      We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
763 
764      The returned ilocal/iremote might contain values in different order than the input ones in `PetscSFSetGraph()`,
765      see its manpage for details.
766 
767    Fortran Notes:
768      The returned iremote array is a copy and must be deallocated after use. Consequently, if you
769      want to update the graph, you must call `PetscSFSetGraph()` after modifying the iremote array.
770 
771      To check for a NULL ilocal use
772 $      if (loc(ilocal) == loc(PETSC_NULL_INTEGER)) then
773 
774 .seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`
775 @*/
776 PetscErrorCode PetscSFGetGraph(PetscSF sf, PetscInt *nroots, PetscInt *nleaves, const PetscInt **ilocal, const PetscSFNode **iremote)
777 {
778   PetscFunctionBegin;
779   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
780   if (sf->ops->GetGraph) {
781     PetscCall((sf->ops->GetGraph)(sf, nroots, nleaves, ilocal, iremote));
782   } else {
783     if (nroots) *nroots = sf->nroots;
784     if (nleaves) *nleaves = sf->nleaves;
785     if (ilocal) *ilocal = sf->mine;
786     if (iremote) *iremote = sf->remote;
787   }
788   PetscFunctionReturn(0);
789 }
790 
791 /*@
792    PetscSFGetLeafRange - Get the active leaf ranges
793 
794    Not Collective
795 
796    Input Parameter:
797 .  sf - star forest
798 
799    Output Parameters:
800 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
801 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
802 
803    Level: developer
804 
805 .seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
806 @*/
807 PetscErrorCode PetscSFGetLeafRange(PetscSF sf, PetscInt *minleaf, PetscInt *maxleaf)
808 {
809   PetscFunctionBegin;
810   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
811   PetscSFCheckGraphSet(sf, 1);
812   if (minleaf) *minleaf = sf->minleaf;
813   if (maxleaf) *maxleaf = sf->maxleaf;
814   PetscFunctionReturn(0);
815 }
816 
817 /*@C
818    PetscSFViewFromOptions - View a `PetscSF` based on arguments in the options database
819 
820    Collective on A
821 
822    Input Parameters:
823 +  A - the star forest
824 .  obj - Optional object that provides the prefix for the option names
825 -  name - command line option
826 
827    Level: intermediate
828 
829 .seealso: `PetscSF`, `PetscSFView`, `PetscObjectViewFromOptions()`, `PetscSFCreate()`
830 @*/
PetscErrorCode PetscSFViewFromOptions(PetscSF A, PetscObject obj, const char name[])
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, PETSCSF_CLASSID, 1);
  /* Thin wrapper: delegate to the generic PetscObject options-driven viewing machinery,
     which consults the options database (prefixed by obj, if given) for the viewer to use */
  PetscCall(PetscObjectViewFromOptions((PetscObject)A, obj, name));
  PetscFunctionReturn(0);
}
838 
839 /*@C
840    PetscSFView - view a star forest
841 
842    Collective
843 
844    Input Parameters:
845 +  sf - star forest
846 -  viewer - viewer to display graph, for example `PETSC_VIEWER_STDOUT_WORLD`
847 
848    Level: beginner
849 
850 .seealso: `PetscSF`, `PetscViewer`, `PetscSFCreate()`, `PetscSFSetGraph()`
851 @*/
PetscErrorCode PetscSFView(PetscSF sf, PetscViewer viewer)
{
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* Default to stdout on the SF's communicator when no viewer is supplied */
  if (!viewer) PetscCall(PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf), &viewer));
  PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
  PetscCheckSameComm(sf, 1, viewer, 2);
  /* Set up so the rank/offset tables printed below exist; skip if no graph has been set yet */
  if (sf->graphset) PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
  if (iascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) {
    PetscMPIInt rank;
    PetscInt    ii, i, j;

    PetscCall(PetscObjectPrintClassNamePrefixType((PetscObject)sf, viewer));
    PetscCall(PetscViewerASCIIPushTab(viewer));
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "PetscSFSetGraph() has not been called yet\n"));
        /* Must pop the tab pushed above before the early return */
        PetscCall(PetscViewerASCIIPopTab(viewer));
        PetscFunctionReturn(0);
      }
      PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
      /* Synchronized printing keeps the per-rank output in rank order */
      PetscCall(PetscViewerASCIIPushSynchronized(viewer));
      PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Number of roots=%" PetscInt_FMT ", leaves=%" PetscInt_FMT ", remote ranks=%" PetscInt_FMT "\n", rank, sf->nroots, sf->nleaves, sf->nranks));
      /* A NULL sf->mine means leaves are in contiguous storage, i.e. leaf i lives at index i */
      for (i = 0; i < sf->nleaves; i++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %" PetscInt_FMT " <- (%" PetscInt_FMT ",%" PetscInt_FMT ")\n", rank, sf->mine ? sf->mine[i] : i, sf->remote[i].rank, sf->remote[i].index));
      PetscCall(PetscViewerFlush(viewer));
      PetscCall(PetscViewerGetFormat(viewer, &format));
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks, *perm;
        /* Sort a copy of the ranks (via a permutation) so edges are reported in increasing rank order */
        PetscCall(PetscMalloc2(sf->nranks, &tmpranks, sf->nranks, &perm));
        PetscCall(PetscArraycpy(tmpranks, sf->ranks, sf->nranks));
        for (i = 0; i < sf->nranks; i++) perm[i] = i;
        PetscCall(PetscSortMPIIntWithArray(sf->nranks, tmpranks, perm));
        PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Roots referenced by my leaves, by rank\n", rank));
        for (ii = 0; ii < sf->nranks; ii++) {
          i = perm[ii];
          PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %d: %" PetscInt_FMT " edges\n", rank, sf->ranks[i], sf->roffset[i + 1] - sf->roffset[i]));
          for (j = sf->roffset[i]; j < sf->roffset[i + 1]; j++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d]    %" PetscInt_FMT " <- %" PetscInt_FMT "\n", rank, sf->rmine[j], sf->rremote[j]));
        }
        PetscCall(PetscFree2(tmpranks, perm));
      }
      PetscCall(PetscViewerFlush(viewer));
      PetscCall(PetscViewerASCIIPopSynchronized(viewer));
    }
    PetscCall(PetscViewerASCIIPopTab(viewer));
  }
  /* Let the implementation print any type-specific information */
  PetscTryTypeMethod(sf, View, viewer);
  PetscFunctionReturn(0);
}
904 
905 /*@C
906    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
907 
908    Not Collective
909 
910    Input Parameter:
911 .  sf - star forest
912 
913    Output Parameters:
914 +  nranks - number of ranks referenced by local part
915 .  ranks - array of ranks
916 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
917 .  rmine - concatenated array holding local indices referencing each remote rank
918 -  rremote - concatenated array holding remote indices referenced for each remote rank
919 
920    Level: developer
921 
922 .seealso: `PetscSF`, `PetscSFGetLeafRanks()`
923 @*/
924 PetscErrorCode PetscSFGetRootRanks(PetscSF sf, PetscInt *nranks, const PetscMPIInt **ranks, const PetscInt **roffset, const PetscInt **rmine, const PetscInt **rremote)
925 {
926   PetscFunctionBegin;
927   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
928   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
929   if (sf->ops->GetRootRanks) {
930     PetscCall((sf->ops->GetRootRanks)(sf, nranks, ranks, roffset, rmine, rremote));
931   } else {
932     /* The generic implementation */
933     if (nranks) *nranks = sf->nranks;
934     if (ranks) *ranks = sf->ranks;
935     if (roffset) *roffset = sf->roffset;
936     if (rmine) *rmine = sf->rmine;
937     if (rremote) *rremote = sf->rremote;
938   }
939   PetscFunctionReturn(0);
940 }
941 
942 /*@C
943    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
944 
945    Not Collective
946 
947    Input Parameter:
948 .  sf - star forest
949 
950    Output Parameters:
951 +  niranks - number of leaf ranks referencing roots on this process
952 .  iranks - array of ranks
953 .  ioffset - offset in irootloc for each rank (length niranks+1)
954 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
955 
956    Level: developer
957 
958 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
959 @*/
960 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf, PetscInt *niranks, const PetscMPIInt **iranks, const PetscInt **ioffset, const PetscInt **irootloc)
961 {
962   PetscFunctionBegin;
963   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
964   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
965   if (sf->ops->GetLeafRanks) {
966     PetscCall((sf->ops->GetLeafRanks)(sf, niranks, iranks, ioffset, irootloc));
967   } else {
968     PetscSFType type;
969     PetscCall(PetscSFGetType(sf, &type));
970     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
971   }
972   PetscFunctionReturn(0);
973 }
974 
975 static PetscBool InList(PetscMPIInt needle, PetscMPIInt n, const PetscMPIInt *list)
976 {
977   PetscInt i;
978   for (i = 0; i < n; i++) {
979     if (needle == list[i]) return PETSC_TRUE;
980   }
981   return PETSC_FALSE;
982 }
983 
984 /*@C
985    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by `PetscSF` implementations.
986 
987    Collective
988 
989    Input Parameters:
990 +  sf - `PetscSF` to set up; `PetscSFSetGraph()` must have been called
991 -  dgroup - `MPI_Group` of ranks to be distinguished (e.g., for self or shared memory exchange)
992 
993    Level: developer
994 
995 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
996 @*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf, MPI_Group dgroup)
{
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size, groupsize, *groupranks;
  PetscInt          *rcount, *ranks;
  PetscInt           i, irank = -1, orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
  /* Count leaves per remote rank: table maps (rank+1) -> number of leaves referencing it */
  PetscCall(PetscTableCreate(10, size, &table));
  for (i = 0; i < sf->nleaves; i++) {
    /* Log 1-based rank */
    PetscCall(PetscTableAdd(table, sf->remote[i].rank + 1, 1, ADD_VALUES));
  }
  PetscCall(PetscTableGetCount(table, &sf->nranks));
  PetscCall(PetscMalloc4(sf->nranks, &sf->ranks, sf->nranks + 1, &sf->roffset, sf->nleaves, &sf->rmine, sf->nleaves, &sf->rremote));
  PetscCall(PetscMalloc2(sf->nranks, &rcount, sf->nranks, &ranks));
  /* Drain the table into the parallel arrays ranks[]/rcount[] (unsorted at this point) */
  PetscCall(PetscTableGetHeadPosition(table, &pos));
  for (i = 0; i < sf->nranks; i++) {
    PetscCall(PetscTableGetNext(table, &pos, &ranks[i], &rcount[i]));
    ranks[i]--; /* Convert back to 0-based */
  }
  PetscCall(PetscTableDestroy(&table));

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group    group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate dgroup's member ranks into ranks in the SF's communicator (groupranks) */
    PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
    PetscCallMPI(MPI_Group_size(dgroup, &groupsize));
    PetscCall(PetscMalloc1(groupsize, &dgroupranks));
    PetscCall(PetscMalloc1(groupsize, &groupranks));
    for (i = 0; i < groupsize; i++) dgroupranks[i] = i;
    if (groupsize) PetscCallMPI(MPI_Group_translate_ranks(dgroup, groupsize, dgroupranks, group, groupranks));
    PetscCallMPI(MPI_Group_free(&group));
    PetscCall(PetscFree(dgroupranks));
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  /* Two cursors move toward each other, swapping misplaced entries, quicksort-partition style */
  for (sf->ndranks = 0, i = sf->nranks; sf->ndranks < i;) {
    for (i--; sf->ndranks < i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i], groupsize, groupranks)) break;
    }
    for (; sf->ndranks <= i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks], groupsize, groupranks)) break;
    }
    if (sf->ndranks < i) { /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank, tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  PetscCall(PetscFree(groupranks));
  /* Sort each partition by rank so the PetscFindMPIInt() binary searches below work */
  PetscCall(PetscSortIntWithArray(sf->ndranks, ranks, rcount));
  PetscCall(PetscSortIntWithArray(sf->nranks - sf->ndranks, ranks + sf->ndranks, rcount + sf->ndranks));
  /* Build CSR-style offsets into rmine/rremote; rcount[] is reset and reused as a fill cursor */
  sf->roffset[0] = 0;
  for (i = 0; i < sf->nranks; i++) {
    PetscCall(PetscMPIIntCast(ranks[i], sf->ranks + i));
    sf->roffset[i + 1] = sf->roffset[i] + rcount[i];
    rcount[i]          = 0;
  }
  /* Scatter each leaf into its rank's segment; orank caches the last rank to skip repeated searches */
  for (i = 0, irank = -1, orank = -1; i < sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks */
      PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->ndranks, sf->ranks, &irank));
      if (irank < 0) {
        PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->nranks - sf->ndranks, sf->ranks + sf->ndranks, &irank));
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    PetscCheck(irank >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Could not find rank %" PetscInt_FMT " in array", sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  PetscCall(PetscFree2(rcount, ranks));
  PetscFunctionReturn(0);
}
1086 
1087 /*@C
1088    PetscSFGetGroups - gets incoming and outgoing process groups
1089 
1090    Collective
1091 
1092    Input Parameter:
1093 .  sf - star forest
1094 
1095    Output Parameters:
1096 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1097 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1098 
1099    Level: developer
1100 
1101 .seealso: `PetscSF`, `PetscSFGetWindow()`, `PetscSFRestoreWindow()`
1102 @*/
PetscErrorCode PetscSFGetGroups(PetscSF sf, MPI_Group *incoming, MPI_Group *outgoing)
{
  MPI_Group group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  PetscCheck(sf->nranks >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUpRanks() before obtaining groups");
  /* Both groups are computed lazily and cached on the SF; callers must not free them */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt        i;
    const PetscInt *indegree;
    PetscMPIInt     rank, *outranks, *inranks;
    PetscSFNode    *remote;
    PetscSF         bgcount;

    /* Compute the number of incoming ranks */
    /* Auxiliary SF bgcount: each process owns one root; my leaves point at the single root of each rank I reference */
    PetscCall(PetscMalloc1(sf->nranks, &remote));
    for (i = 0; i < sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, &bgcount));
    PetscCall(PetscSFSetGraph(bgcount, 1, sf->nranks, NULL, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
    /* indegree[0] = number of ranks whose leaves reference me, i.e. size of my incoming group */
    PetscCall(PetscSFComputeDegreeBegin(bgcount, &indegree));
    PetscCall(PetscSFComputeDegreeEnd(bgcount, &indegree));
    /* Enumerate the incoming ranks */
    PetscCall(PetscMalloc2(indegree[0], &inranks, sf->nranks, &outranks));
    PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
    for (i = 0; i < sf->nranks; i++) outranks[i] = rank;
    /* Gather the rank id of every process referencing my root into inranks[] */
    PetscCall(PetscSFGatherBegin(bgcount, MPI_INT, outranks, inranks));
    PetscCall(PetscSFGatherEnd(bgcount, MPI_INT, outranks, inranks));
    PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
    PetscCallMPI(MPI_Group_incl(group, indegree[0], inranks, &sf->ingroup));
    PetscCallMPI(MPI_Group_free(&group));
    PetscCall(PetscFree2(inranks, outranks));
    PetscCall(PetscSFDestroy(&bgcount));
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) {
    /* The outgoing group is simply the set of ranks my leaves reference, already known locally */
    PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
    PetscCallMPI(MPI_Group_incl(group, sf->nranks, sf->ranks, &sf->outgroup));
    PetscCallMPI(MPI_Group_free(&group));
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1148 
1149 /*@
1150    PetscSFGetMultiSF - gets the inner `PetscSF` implementing gathers and scatters
1151 
1152    Collective
1153 
1154    Input Parameter:
1155 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1156 
1157    Output Parameter:
1158 .  multi - star forest with split roots, such that each root has degree exactly 1
1159 
1160    Level: developer
1161 
1162    Note:
1163    In most cases, users should use `PetscSFGatherBegin()` and `PetscSFScatterBegin()` instead of manipulating multi
1164    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1165    edge, it is a candidate for future optimization that might involve its removal.
1166 
1167 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`, `PetscSFComputeMultiRootOriginalNumbering()`
1168 @*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf, PetscSF *multi)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidPointer(multi, 2);
  if (sf->nroots < 0) { /* Graph has not been set yet; why do we need this? */
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
    *multi           = sf->multi;
    sf->multi->multi = sf->multi; /* the multi-SF is its own multi-SF */
    PetscFunctionReturn(0);
  }
  /* Build lazily and cache on the SF */
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt        i, *inoffset, *outones, *outoffset, maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
    PetscCall(PetscSFComputeDegreeBegin(sf, &indegree));
    PetscCall(PetscSFComputeDegreeEnd(sf, &indegree));
    /* inoffset[] is the prefix sum of root degrees: root i owns multi-root slots [inoffset[i], inoffset[i+1]) */
    PetscCall(PetscMalloc3(sf->nroots + 1, &inoffset, maxlocal, &outones, maxlocal, &outoffset));
    inoffset[0] = 0;
    for (i = 0; i < sf->nroots; i++) inoffset[i + 1] = inoffset[i] + indegree[i];
    for (i = 0; i < maxlocal; i++) outones[i] = 1;
    /* Each leaf atomically fetches a unique slot within its root's range (fetch-and-add of 1) */
    PetscCall(PetscSFFetchAndOpBegin(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
    PetscCall(PetscSFFetchAndOpEnd(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
    for (i = 0; i < sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
    if (PetscDefined(USE_DEBUG)) {                               /* Check that the expected number of increments occurred */
      for (i = 0; i < sf->nroots; i++) PetscCheck(inoffset[i] + indegree[i] == inoffset[i + 1], PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incorrect result after PetscSFFetchAndOp");
    }
    /* Retarget each leaf at its private multi-root slot */
    PetscCall(PetscMalloc1(sf->nleaves, &remote));
    for (i = 0; i < sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
    sf->multi->multi = sf->multi;
    PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
    if (sf->rankorder) { /* Sort the ranks */
      PetscMPIInt  rank;
      PetscInt    *inranks, *newoffset, *outranks, *newoutoffset, *tmpoffset, maxdegree;
      PetscSFNode *newremote;
      PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
      for (i = 0, maxdegree = 0; i < sf->nroots; i++) maxdegree = PetscMax(maxdegree, indegree[i]);
      PetscCall(PetscMalloc5(sf->multi->nroots, &inranks, sf->multi->nroots, &newoffset, maxlocal, &outranks, maxlocal, &newoutoffset, maxdegree, &tmpoffset));
      /* Collect, at each multi-root, the rank of the leaf that claimed it */
      for (i = 0; i < maxlocal; i++) outranks[i] = rank;
      PetscCall(PetscSFReduceBegin(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i = 0; i < sf->nroots; i++) {
        PetscInt j;
        for (j = 0; j < indegree[i]; j++) tmpoffset[j] = j;
        PetscCall(PetscSortIntWithArray(indegree[i], inranks + inoffset[i], tmpoffset));
        for (j = 0; j < indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Tell each leaf its rank-sorted slot, then rebuild the graph with those slots */
      PetscCall(PetscSFBcastBegin(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
      PetscCall(PetscMalloc1(sf->nleaves, &newremote));
      for (i = 0; i < sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, newremote, PETSC_OWN_POINTER));
      PetscCall(PetscFree5(inranks, newoffset, outranks, newoutoffset, tmpoffset));
    }
    PetscCall(PetscFree3(inoffset, outones, outoffset));
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1237 
1238 /*@C
1239    PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots, does not remap indices
1240 
1241    Collective
1242 
1243    Input Parameters:
1244 +  sf - original star forest
1245 .  nselected  - number of selected roots on this process
1246 -  selected   - indices of the selected roots on this process
1247 
1248    Output Parameter:
1249 .  esf - new star forest
1250 
1251    Level: advanced
1252 
1253    Note:
1254    To use the new `PetscSF`, it may be necessary to know the indices of the leaves that are still participating. This can
   be done by calling `PetscSFGetGraph()`.
1256 
1257 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1258 @*/
PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *esf)
{
  PetscInt           i, j, n, nroots, nleaves, esf_nleaves, *new_ilocal, minleaf, maxleaf, maxlocal;
  const PetscInt    *ilocal;
  signed char       *rootdata, *leafdata, *leafmem;
  const PetscSFNode *iremote;
  PetscSFNode       *new_iremote;
  MPI_Comm           comm;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  if (nselected) PetscValidIntPointer(selected, 3);
  PetscValidPointer(esf, 4);

  PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscLogEventBegin(PETSCSF_EmbedSF, sf, 0, 0, 0));
  PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
  PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));

  if (PetscDefined(USE_DEBUG)) { /* Error out if selected[] has dups or  out of range indices */
    PetscBool dups;
    PetscCall(PetscCheckDupsInt(nselected, selected, &dups));
    PetscCheck(!dups, comm, PETSC_ERR_ARG_WRONG, "selected[] has dups");
    for (i = 0; i < nselected; i++) PetscCheck(selected[i] >= 0 && selected[i] < nroots, comm, PETSC_ERR_ARG_OUTOFRANGE, "selected root indice %" PetscInt_FMT " is out of [0,%" PetscInt_FMT ")", selected[i], nroots);
  }

  if (sf->ops->CreateEmbeddedRootSF) PetscUseTypeMethod(sf, CreateEmbeddedRootSF, nselected, selected, esf);
  else {
    /* A generic version of creating embedded sf */
    PetscCall(PetscSFGetLeafRange(sf, &minleaf, &maxleaf));
    maxlocal = maxleaf - minleaf + 1;
    PetscCall(PetscCalloc2(nroots, &rootdata, maxlocal, &leafmem));
    /* leafdata is shifted so it can be indexed directly by leaf index in [minleaf, maxleaf] */
    leafdata = leafmem - minleaf;
    /* Tag selected roots and bcast to leaves */
    for (i = 0; i < nselected; i++) rootdata[selected[i]] = 1;
    PetscCall(PetscSFBcastBegin(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));

    /* Build esf with leaves that are still connected */
    esf_nleaves = 0;
    for (i = 0; i < nleaves; i++) {
      j = ilocal ? ilocal[i] : i;
      /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
         with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
      */
      esf_nleaves += (leafdata[j] ? 1 : 0);
    }
    PetscCall(PetscMalloc1(esf_nleaves, &new_ilocal));
    PetscCall(PetscMalloc1(esf_nleaves, &new_iremote));
    /* Second pass: copy the surviving edges; indices are not remapped */
    for (i = n = 0; i < nleaves; i++) {
      j = ilocal ? ilocal[i] : i;
      if (leafdata[j]) {
        new_ilocal[n]        = j;
        new_iremote[n].rank  = iremote[i].rank;
        new_iremote[n].index = iremote[i].index;
        ++n;
      }
    }
    PetscCall(PetscSFCreate(comm, esf));
    PetscCall(PetscSFSetFromOptions(*esf));
    PetscCall(PetscSFSetGraph(*esf, nroots, esf_nleaves, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
    PetscCall(PetscFree2(rootdata, leafmem));
  }
  PetscCall(PetscLogEventEnd(PETSCSF_EmbedSF, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1326 
1327 /*@C
1328   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1329 
1330   Collective
1331 
1332   Input Parameters:
1333 + sf - original star forest
1334 . nselected  - number of selected leaves on this process
1335 - selected   - indices of the selected leaves on this process
1336 
1337   Output Parameter:
1338 .  newsf - new star forest
1339 
1340   Level: advanced
1341 
1342 .seealso: `PetscSF`, `PetscSFCreateEmbeddedRootSF()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1343 @*/
1344 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *newsf)
1345 {
1346   const PetscSFNode *iremote;
1347   PetscSFNode       *new_iremote;
1348   const PetscInt    *ilocal;
1349   PetscInt           i, nroots, *leaves, *new_ilocal;
1350   MPI_Comm           comm;
1351 
1352   PetscFunctionBegin;
1353   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1354   PetscSFCheckGraphSet(sf, 1);
1355   if (nselected) PetscValidIntPointer(selected, 3);
1356   PetscValidPointer(newsf, 4);
1357 
1358   /* Uniq selected[] and put results in leaves[] */
1359   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1360   PetscCall(PetscMalloc1(nselected, &leaves));
1361   PetscCall(PetscArraycpy(leaves, selected, nselected));
1362   PetscCall(PetscSortedRemoveDupsInt(&nselected, leaves));
1363   PetscCheck(!nselected || !(leaves[0] < 0 || leaves[nselected - 1] >= sf->nleaves), comm, PETSC_ERR_ARG_OUTOFRANGE, "Min/Max leaf indices %" PetscInt_FMT "/%" PetscInt_FMT " are not in [0,%" PetscInt_FMT ")", leaves[0], leaves[nselected - 1], sf->nleaves);
1364 
1365   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1366   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) PetscUseTypeMethod(sf, CreateEmbeddedLeafSF, nselected, leaves, newsf);
1367   else {
1368     PetscCall(PetscSFGetGraph(sf, &nroots, NULL, &ilocal, &iremote));
1369     PetscCall(PetscMalloc1(nselected, &new_ilocal));
1370     PetscCall(PetscMalloc1(nselected, &new_iremote));
1371     for (i = 0; i < nselected; ++i) {
1372       const PetscInt l     = leaves[i];
1373       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1374       new_iremote[i].rank  = iremote[l].rank;
1375       new_iremote[i].index = iremote[l].index;
1376     }
1377     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, newsf));
1378     PetscCall(PetscSFSetGraph(*newsf, nroots, nselected, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1379   }
1380   PetscCall(PetscFree(leaves));
1381   PetscFunctionReturn(0);
1382 }
1383 
1384 /*@C
1385    PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to `PetscSFBcastEnd()`
1386 
1387    Collective
1388 
1389    Input Parameters:
1390 +  sf - star forest on which to communicate
1391 .  unit - data type associated with each node
1392 .  rootdata - buffer to broadcast
1393 -  op - operation to use for reduction
1394 
1395    Output Parameter:
1396 .  leafdata - buffer to be reduced with values from each leaf's respective root
1397 
1398    Level: intermediate
1399 
1400    Notes:
1401     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
    are host pointers or device pointers, which may incur a noticeable cost. If you already know the info, you should
1403     use `PetscSFBcastWithMemTypeBegin()` instead.
1404 
1405 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastWithMemTypeBegin()`
1406 @*/
1407 PetscErrorCode PetscSFBcastBegin(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
1408 {
1409   PetscMemType rootmtype, leafmtype;
1410 
1411   PetscFunctionBegin;
1412   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1413   PetscCall(PetscSFSetUp(sf));
1414   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
1415   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1416   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1417   PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
1418   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
1419   PetscFunctionReturn(0);
1420 }
1421 
1422 /*@C
1423    PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call to `PetscSFBcastEnd()`
1424 
1425    Collective
1426 
1427    Input Parameters:
1428 +  sf - star forest on which to communicate
1429 .  unit - data type associated with each node
1430 .  rootmtype - memory type of rootdata
1431 .  rootdata - buffer to broadcast
1432 .  leafmtype - memory type of leafdata
1433 -  op - operation to use for reduction
1434 
1435    Output Parameter:
1436 .  leafdata - buffer to be reduced with values from each leaf's respective root
1437 
1438    Level: intermediate
1439 
1440 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastBegin()`
1441 @*/
PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  /* vscat logs its own higher-level events, so suppress the SF-level event in that case */
  if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
  /* Caller supplied the memory types, so skip the PetscGetMemType() probe done by PetscSFBcastBegin() */
  PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
  if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1452 
1453 /*@C
1454    PetscSFBcastEnd - end a broadcast & reduce operation started with `PetscSFBcastBegin()`
1455 
1456    Collective
1457 
1458    Input Parameters:
1459 +  sf - star forest
1460 .  unit - data type
1461 .  rootdata - buffer to broadcast
1462 -  op - operation to use for reduction
1463 
1464    Output Parameter:
1465 .  leafdata - buffer to be reduced with values from each leaf's respective root
1466 
1467    Level: intermediate
1468 
1469 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFReduceEnd()`
1470 @*/
PetscErrorCode PetscSFBcastEnd(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* vscat logs its own higher-level events, so suppress the SF-level event in that case */
  if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastEnd, sf, 0, 0, 0));
  /* Completes the communication started by PetscSFBcastBegin()/PetscSFBcastWithMemTypeBegin() */
  PetscUseTypeMethod(sf, BcastEnd, unit, rootdata, leafdata, op);
  if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastEnd, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1480 
1481 /*@C
1482    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to `PetscSFReduceEnd()`
1483 
1484    Collective
1485 
1486    Input Parameters:
1487 +  sf - star forest
1488 .  unit - data type
1489 .  leafdata - values to reduce
1490 -  op - reduction operation
1491 
1492    Output Parameter:
1493 .  rootdata - result of reduction of values from all leaves of each root
1494 
1495    Level: intermediate
1496 
1497    Notes:
1498     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1499     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1500     use `PetscSFReduceWithMemTypeBegin()` instead.
1501 
1502 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceWithMemTypeBegin()`
1503 @*/
1504 PetscErrorCode PetscSFReduceBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1505 {
1506   PetscMemType rootmtype, leafmtype;
1507 
1508   PetscFunctionBegin;
1509   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1510   PetscCall(PetscSFSetUp(sf));
1511   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1512   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1513   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1514   PetscCall((sf->ops->ReduceBegin)(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1515   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1516   PetscFunctionReturn(0);
1517 }
1518 
1519 /*@C
1520    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to `PetscSFReduceEnd()`
1521 
1522    Collective
1523 
1524    Input Parameters:
1525 +  sf - star forest
1526 .  unit - data type
1527 .  leafmtype - memory type of leafdata
1528 .  leafdata - values to reduce
1529 .  rootmtype - memory type of rootdata
1530 -  op - reduction operation
1531 
1532    Output Parameter:
1533 .  rootdata - result of reduction of values from all leaves of each root
1534 
1535    Level: intermediate
1536 
1537 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceBegin()`
1538 @*/
1539 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
1540 {
1541   PetscFunctionBegin;
1542   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1543   PetscCall(PetscSFSetUp(sf));
1544   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1545   PetscCall((sf->ops->ReduceBegin)(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1546   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1547   PetscFunctionReturn(0);
1548 }
1549 
/*@C
   PetscSFReduceEnd - end a reduction operation started with `PetscSFReduceBegin()`

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
.  leafdata - values to reduce
-  op - reduction operation

   Output Parameter:
.  rootdata - result of reduction of values from all leaves of each root

   Level: intermediate

.seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFBcastEnd()`
@*/
PetscErrorCode PetscSFReduceEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* No PetscSFSetUp() here; the matching Begin call has already set up the SF */
  if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceEnd, sf, 0, 0, 0));
  PetscUseTypeMethod(sf, ReduceEnd, unit, leafdata, rootdata, op);
  if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceEnd, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1577 
/*@C
   PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value,
   to be completed with `PetscSFFetchAndOpEnd()`

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Parameters:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

   Note:
   The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
   might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
   not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
   integers.

.seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`
@*/
PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscMemType rootmtype, leafmtype, leafupdatemtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
  /* Detect where each of the three user buffers lives (host vs device) */
  PetscCall(PetscGetMemType(rootdata, &rootmtype));
  PetscCall(PetscGetMemType(leafdata, &leafmtype));
  PetscCall(PetscGetMemType(leafupdate, &leafupdatemtype));
  /* Implementations only handle leafdata and leafupdate residing in the same memory space */
  PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
  PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
  PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1620 
/*@C
   PetscSFFetchAndOpWithMemTypeBegin - begin operation with explicit memory types that fetches values from root and updates atomically by
   applying operation using my leaf value, to be completed with `PetscSFFetchAndOpEnd()`

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
.  rootmtype - memory type of rootdata
.  leafmtype - memory type of leafdata
.  leafdata - leaf values to use in reduction
.  leafupdatemtype - memory type of leafupdate
-  op - operation to use for reduction

   Output Parameters:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

   Note:
   See `PetscSFFetchAndOpBegin()` for more details.

.seealso: `PetscSF`, `PetscSFFetchAndOpBegin()`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`
@*/
PetscErrorCode PetscSFFetchAndOpWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, PetscMemType leafupdatemtype, void *leafupdate, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
  /* Caller supplies the memory types; implementations only handle leafdata and leafupdate in the same space */
  PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
  PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
  PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1658 
/*@C
   PetscSFFetchAndOpEnd - end operation started in matching call to `PetscSFFetchAndOpBegin()` to fetch values from roots and update atomically by applying operation using my leaf value

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Parameters:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

.seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFReduceEnd()`, `PetscSFSetGraph()`
@*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* No PetscSFSetUp() here; the matching Begin call has already set up the SF */
  PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
  PetscUseTypeMethod(sf, FetchAndOpEnd, unit, rootdata, leafdata, leafupdate, op);
  PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1687 
/*@C
   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with `PetscSFComputeDegreeEnd()`

   Collective

   Input Parameter:
.  sf - star forest

   Output Parameter:
.  degree - degree of each root vertex

   Level: advanced

   Note:
   The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence no need to call `PetscFree()` on it.

.seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeEnd()`
@*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf, const PetscInt **degree)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  PetscValidPointer(degree, 2);
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal;
    /* degree is allocated by Begin and freed only at destroy time; a non-NULL degreetmp/degree here means a Begin without a matching End */
    PetscCheck(!sf->degree, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    maxlocal = sf->maxleaf - sf->minleaf + 1; /* size of the contiguous leaf range [minleaf, maxleaf] */
    PetscCall(PetscMalloc1(nroots, &sf->degree));
    PetscCall(PetscMalloc1(PetscMax(maxlocal, 1), &sf->degreetmp)); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i = 0; i < nroots; i++) sf->degree[i] = 0;
    for (i = 0; i < maxlocal; i++) sf->degreetmp[i] = 1;
    /* Each leaf contributes 1; summing at the roots yields the degree. The pointer is shifted by
       -minleaf so that leaf index minleaf maps to degreetmp[0]. */
    PetscCall(PetscSFReduceBegin(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
  }
  /* The result is not valid until PetscSFComputeDegreeEnd(); hand back NULL to catch premature use */
  *degree = NULL;
  PetscFunctionReturn(0);
}
1725 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with `PetscSFComputeDegreeBegin()`

   Collective

   Input Parameter:
.  sf - star forest

   Output Parameter:
.  degree - degree of each root vertex

   Level: advanced

   Note:
   The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence no need to call `PetscFree()` on it.

.seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeBegin()`
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf, const PetscInt **degree)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  PetscValidPointer(degree, 2);
  if (!sf->degreeknown) {
    /* degreetmp was allocated (with at least one entry) by PetscSFComputeDegreeBegin() */
    PetscCheck(sf->degreetmp, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    /* Same shifted pointer as in Begin so the Reduce pair matches */
    PetscCall(PetscSFReduceEnd(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
    PetscCall(PetscFree(sf->degreetmp));
    sf->degreeknown = PETSC_TRUE; /* cache result; later Begin/End pairs become no-ops returning sf->degree */
  }
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1759 
1760 /*@C
1761    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by `PetscSFGetMultiSF()`).
1762    Each multi-root is assigned index of the corresponding original root.
1763 
1764    Collective
1765 
1766    Input Parameters:
1767 +  sf - star forest
1768 -  degree - degree of each root vertex, computed with `PetscSFComputeDegreeBegin()`/`PetscSFComputeDegreeEnd()`
1769 
1770    Output Parameters:
1771 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1772 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1773 
1774    Level: developer
1775 
1776    Note:
1777    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with `PetscFree()` when no longer needed.
1778 
1779 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFComputeDegreeEnd()`, `PetscSFGetMultiSF()`
1780 @*/
1781 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1782 {
1783   PetscSF  msf;
1784   PetscInt i, j, k, nroots, nmroots;
1785 
1786   PetscFunctionBegin;
1787   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1788   PetscCall(PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL));
1789   if (nroots) PetscValidIntPointer(degree, 2);
1790   if (nMultiRoots) PetscValidIntPointer(nMultiRoots, 3);
1791   PetscValidPointer(multiRootsOrigNumbering, 4);
1792   PetscCall(PetscSFGetMultiSF(sf, &msf));
1793   PetscCall(PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL));
1794   PetscCall(PetscMalloc1(nmroots, multiRootsOrigNumbering));
1795   for (i = 0, j = 0, k = 0; i < nroots; i++) {
1796     if (!degree[i]) continue;
1797     for (j = 0; j < degree[i]; j++, k++) (*multiRootsOrigNumbering)[k] = i;
1798   }
1799   PetscCheck(k == nmroots, PETSC_COMM_SELF, PETSC_ERR_PLIB, "sanity check fail");
1800   if (nMultiRoots) *nMultiRoots = nmroots;
1801   PetscFunctionReturn(0);
1802 }
1803 
1804 /*@C
1805    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with `PetscSFGatherEnd()`
1806 
1807    Collective
1808 
1809    Input Parameters:
1810 +  sf - star forest
1811 .  unit - data type
1812 -  leafdata - leaf data to gather to roots
1813 
1814    Output Parameter:
1815 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1816 
1817    Level: intermediate
1818 
1819 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterBegin()`
1820 @*/
1821 PetscErrorCode PetscSFGatherBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1822 {
1823   PetscSF multi = NULL;
1824 
1825   PetscFunctionBegin;
1826   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1827   PetscCall(PetscSFSetUp(sf));
1828   PetscCall(PetscSFGetMultiSF(sf, &multi));
1829   PetscCall(PetscSFReduceBegin(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1830   PetscFunctionReturn(0);
1831 }
1832 
1833 /*@C
1834    PetscSFGatherEnd - ends pointwise gather operation that was started with `PetscSFGatherBegin()`
1835 
1836    Collective
1837 
1838    Input Parameters:
1839 +  sf - star forest
1840 .  unit - data type
1841 -  leafdata - leaf data to gather to roots
1842 
1843    Output Parameter:
1844 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1845 
1846    Level: intermediate
1847 
1848 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterEnd()`
1849 @*/
1850 PetscErrorCode PetscSFGatherEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1851 {
1852   PetscSF multi = NULL;
1853 
1854   PetscFunctionBegin;
1855   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1856   PetscCall(PetscSFGetMultiSF(sf, &multi));
1857   PetscCall(PetscSFReduceEnd(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1858   PetscFunctionReturn(0);
1859 }
1860 
1861 /*@C
1862    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with `PetscSFScatterEnd()`
1863 
1864    Collective
1865 
1866    Input Parameters:
1867 +  sf - star forest
1868 .  unit - data type
1869 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1870 
1871    Output Parameter:
1872 .  leafdata - leaf data to be update with personal data from each respective root
1873 
1874    Level: intermediate
1875 
1876 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterBegin()`
1877 @*/
1878 PetscErrorCode PetscSFScatterBegin(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1879 {
1880   PetscSF multi = NULL;
1881 
1882   PetscFunctionBegin;
1883   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1884   PetscCall(PetscSFSetUp(sf));
1885   PetscCall(PetscSFGetMultiSF(sf, &multi));
1886   PetscCall(PetscSFBcastBegin(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1887   PetscFunctionReturn(0);
1888 }
1889 
1890 /*@C
1891    PetscSFScatterEnd - ends pointwise scatter operation that was started with `PetscSFScatterBegin()`
1892 
1893    Collective
1894 
1895    Input Parameters:
1896 +  sf - star forest
1897 .  unit - data type
1898 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1899 
1900    Output Parameter:
1901 .  leafdata - leaf data to be update with personal data from each respective root
1902 
1903    Level: intermediate
1904 
1905 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterEnd()`
1906 @*/
1907 PetscErrorCode PetscSFScatterEnd(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1908 {
1909   PetscSF multi = NULL;
1910 
1911   PetscFunctionBegin;
1912   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1913   PetscCall(PetscSFGetMultiSF(sf, &multi));
1914   PetscCall(PetscSFBcastEnd(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1915   PetscFunctionReturn(0);
1916 }
1917 
1918 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1919 {
1920   PetscInt        i, n, nleaves;
1921   const PetscInt *ilocal = NULL;
1922   PetscHSetI      seen;
1923 
1924   PetscFunctionBegin;
1925   if (PetscDefined(USE_DEBUG)) {
1926     PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, NULL));
1927     PetscCall(PetscHSetICreate(&seen));
1928     for (i = 0; i < nleaves; i++) {
1929       const PetscInt leaf = ilocal ? ilocal[i] : i;
1930       PetscCall(PetscHSetIAdd(seen, leaf));
1931     }
1932     PetscCall(PetscHSetIGetSize(seen, &n));
1933     PetscCheck(n == nleaves, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided leaves have repeated values: all leaves must be unique");
1934     PetscCall(PetscHSetIDestroy(&seen));
1935   }
1936   PetscFunctionReturn(0);
1937 }
1938 
1939 /*@
1940   PetscSFCompose - Compose a new `PetscSF` by putting the second `PetscSF` under the first one in a top (roots) down (leaves) view
1941 
1942   Input Parameters:
1943 + sfA - The first `PetscSF`
1944 - sfB - The second `PetscSF`
1945 
1946   Output Parameters:
1947 . sfBA - The composite `PetscSF`
1948 
1949   Level: developer
1950 
1951   Notes:
1952   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
1953   forests, i.e. the same leaf is not connected with different roots.
1954 
1955   sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
1956   a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
1957   nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
1958   Bcast on sfA, then a Bcast on sfB, on connected nodes.
1959 
1960 .seealso: `PetscSF`, `PetscSFComposeInverse()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`
1961 @*/
1962 PetscErrorCode PetscSFCompose(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
1963 {
1964   const PetscSFNode *remotePointsA, *remotePointsB;
1965   PetscSFNode       *remotePointsBA = NULL, *reorderedRemotePointsA = NULL, *leafdataB;
1966   const PetscInt    *localPointsA, *localPointsB;
1967   PetscInt          *localPointsBA;
1968   PetscInt           i, numRootsA, numLeavesA, numRootsB, numLeavesB, minleaf, maxleaf, numLeavesBA;
1969   PetscBool          denseB;
1970 
1971   PetscFunctionBegin;
1972   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
1973   PetscSFCheckGraphSet(sfA, 1);
1974   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
1975   PetscSFCheckGraphSet(sfB, 2);
1976   PetscCheckSameComm(sfA, 1, sfB, 2);
1977   PetscValidPointer(sfBA, 3);
1978   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
1979   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
1980 
1981   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
1982   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
1983   /* Make sure that PetscSFBcast{Begin, End}(sfB, ...) works with root data of size       */
1984   /* numRootsB; otherwise, garbage will be broadcasted.                                   */
1985   /* Example (comm size = 1):                                                             */
1986   /* sfA: 0 <- (0, 0)                                                                     */
1987   /* sfB: 100 <- (0, 0)                                                                   */
1988   /*      101 <- (0, 1)                                                                   */
1989   /* Here, we have remotePointsA = [(0, 0)], but for remotePointsA to be a valid tartget  */
1990   /* of sfB, it has to be recasted as [(0, 0), (-1, -1)] so that points 100 and 101 would */
1991   /* receive (0, 0) and (-1, -1), respectively, when PetscSFBcast(sfB, ...) is called on  */
1992   /* remotePointsA; if not recasted, point 101 would receive a garbage value.             */
1993   PetscCall(PetscMalloc1(numRootsB, &reorderedRemotePointsA));
1994   for (i = 0; i < numRootsB; i++) {
1995     reorderedRemotePointsA[i].rank  = -1;
1996     reorderedRemotePointsA[i].index = -1;
1997   }
1998   for (i = 0; i < numLeavesA; i++) {
1999     PetscInt localp = localPointsA ? localPointsA[i] : i;
2000 
2001     if (localp >= numRootsB) continue;
2002     reorderedRemotePointsA[localp] = remotePointsA[i];
2003   }
2004   remotePointsA = reorderedRemotePointsA;
2005   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2006   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &leafdataB));
2007   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2008     leafdataB[i].rank  = -1;
2009     leafdataB[i].index = -1;
2010   }
2011   PetscCall(PetscSFBcastBegin(sfB, MPIU_2INT, remotePointsA, leafdataB - minleaf, MPI_REPLACE));
2012   PetscCall(PetscSFBcastEnd(sfB, MPIU_2INT, remotePointsA, leafdataB - minleaf, MPI_REPLACE));
2013   PetscCall(PetscFree(reorderedRemotePointsA));
2014 
2015   denseB = (PetscBool)!localPointsB;
2016   for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2017     if (leafdataB[localPointsB ? localPointsB[i] - minleaf : i].rank == -1) denseB = PETSC_FALSE;
2018     else numLeavesBA++;
2019   }
2020   if (denseB) {
2021     localPointsBA  = NULL;
2022     remotePointsBA = leafdataB;
2023   } else {
2024     PetscCall(PetscMalloc1(numLeavesBA, &localPointsBA));
2025     PetscCall(PetscMalloc1(numLeavesBA, &remotePointsBA));
2026     for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2027       const PetscInt l = localPointsB ? localPointsB[i] : i;
2028 
2029       if (leafdataB[l - minleaf].rank == -1) continue;
2030       remotePointsBA[numLeavesBA] = leafdataB[l - minleaf];
2031       localPointsBA[numLeavesBA]  = l;
2032       numLeavesBA++;
2033     }
2034     PetscCall(PetscFree(leafdataB));
2035   }
2036   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2037   PetscCall(PetscSFSetFromOptions(*sfBA));
2038   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2039   PetscFunctionReturn(0);
2040 }
2041 
2042 /*@
2043   PetscSFComposeInverse - Compose a new `PetscSF` by putting the inverse of the second `PetscSF` under the first one
2044 
2045   Input Parameters:
2046 + sfA - The first `PetscSF`
2047 - sfB - The second `PetscSF`
2048 
2049   Output Parameters:
2050 . sfBA - The composite `PetscSF`.
2051 
2052   Level: developer
2053 
2054   Notes:
2055   Currently, the two SFs must be defined on congruent communicators and they must be true star
2056   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
2057   second SF must have a degree of 1, i.e., no roots have more than one leaf connected.
2058 
2059   sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
2060   a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
2061   roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
2062   a Reduce on sfB, on connected roots.
2063 
2064 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFCreateInverseSF()`
2065 @*/
2066 PetscErrorCode PetscSFComposeInverse(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
2067 {
2068   const PetscSFNode *remotePointsA, *remotePointsB;
2069   PetscSFNode       *remotePointsBA;
2070   const PetscInt    *localPointsA, *localPointsB;
2071   PetscSFNode       *reorderedRemotePointsA = NULL;
2072   PetscInt           i, numRootsA, numLeavesA, numLeavesBA, numRootsB, numLeavesB, minleaf, maxleaf, *localPointsBA;
2073   MPI_Op             op;
2074 #if defined(PETSC_USE_64BIT_INDICES)
2075   PetscBool iswin;
2076 #endif
2077 
2078   PetscFunctionBegin;
2079   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
2080   PetscSFCheckGraphSet(sfA, 1);
2081   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
2082   PetscSFCheckGraphSet(sfB, 2);
2083   PetscCheckSameComm(sfA, 1, sfB, 2);
2084   PetscValidPointer(sfBA, 3);
2085   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
2086   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
2087 
2088   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
2089   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
2090 
2091   /* TODO: Check roots of sfB have degree of 1 */
2092   /* Once we implement it, we can replace the MPI_MAXLOC
2093      with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
2094      We use MPI_MAXLOC only to have a deterministic output from this routine if
2095      the root condition is not meet.
2096    */
2097   op = MPI_MAXLOC;
2098 #if defined(PETSC_USE_64BIT_INDICES)
2099   /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
2100   PetscCall(PetscObjectTypeCompare((PetscObject)sfB, PETSCSFWINDOW, &iswin));
2101   if (iswin) op = MPI_REPLACE;
2102 #endif
2103 
2104   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2105   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &reorderedRemotePointsA));
2106   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2107     reorderedRemotePointsA[i].rank  = -1;
2108     reorderedRemotePointsA[i].index = -1;
2109   }
2110   if (localPointsA) {
2111     for (i = 0; i < numLeavesA; i++) {
2112       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
2113       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
2114     }
2115   } else {
2116     for (i = 0; i < numLeavesA; i++) {
2117       if (i > maxleaf || i < minleaf) continue;
2118       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
2119     }
2120   }
2121 
2122   PetscCall(PetscMalloc1(numRootsB, &localPointsBA));
2123   PetscCall(PetscMalloc1(numRootsB, &remotePointsBA));
2124   for (i = 0; i < numRootsB; i++) {
2125     remotePointsBA[i].rank  = -1;
2126     remotePointsBA[i].index = -1;
2127   }
2128 
2129   PetscCall(PetscSFReduceBegin(sfB, MPIU_2INT, reorderedRemotePointsA - minleaf, remotePointsBA, op));
2130   PetscCall(PetscSFReduceEnd(sfB, MPIU_2INT, reorderedRemotePointsA - minleaf, remotePointsBA, op));
2131   PetscCall(PetscFree(reorderedRemotePointsA));
2132   for (i = 0, numLeavesBA = 0; i < numRootsB; i++) {
2133     if (remotePointsBA[i].rank == -1) continue;
2134     remotePointsBA[numLeavesBA].rank  = remotePointsBA[i].rank;
2135     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
2136     localPointsBA[numLeavesBA]        = i;
2137     numLeavesBA++;
2138   }
2139   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2140   PetscCall(PetscSFSetFromOptions(*sfBA));
2141   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2142   PetscFunctionReturn(0);
2143 }
2144 
2145 /*
2146   PetscSFCreateLocalSF_Private - Creates a local `PetscSF` that only has intra-process edges of the global `PetscSF`
2147 
2148   Input Parameters:
2149 . sf - The global `PetscSF`
2150 
2151   Output Parameters:
2152 . out - The local `PetscSF`
2153 
2154 .seealso: `PetscSF`, `PetscSFCreate()`
2155  */
2156 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf, PetscSF *out)
2157 {
2158   MPI_Comm           comm;
2159   PetscMPIInt        myrank;
2160   const PetscInt    *ilocal;
2161   const PetscSFNode *iremote;
2162   PetscInt           i, j, nroots, nleaves, lnleaves, *lilocal;
2163   PetscSFNode       *liremote;
2164   PetscSF            lsf;
2165 
2166   PetscFunctionBegin;
2167   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2168   if (sf->ops->CreateLocalSF) PetscUseTypeMethod(sf, CreateLocalSF, out);
2169   else {
2170     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2171     PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
2172     PetscCallMPI(MPI_Comm_rank(comm, &myrank));
2173 
2174     /* Find out local edges and build a local SF */
2175     PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
2176     for (i = lnleaves = 0; i < nleaves; i++) {
2177       if (iremote[i].rank == (PetscInt)myrank) lnleaves++;
2178     }
2179     PetscCall(PetscMalloc1(lnleaves, &lilocal));
2180     PetscCall(PetscMalloc1(lnleaves, &liremote));
2181 
2182     for (i = j = 0; i < nleaves; i++) {
2183       if (iremote[i].rank == (PetscInt)myrank) {
2184         lilocal[j]        = ilocal ? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2185         liremote[j].rank  = 0;                      /* rank in PETSC_COMM_SELF */
2186         liremote[j].index = iremote[i].index;
2187         j++;
2188       }
2189     }
2190     PetscCall(PetscSFCreate(PETSC_COMM_SELF, &lsf));
2191     PetscCall(PetscSFSetFromOptions(lsf));
2192     PetscCall(PetscSFSetGraph(lsf, nroots, lnleaves, lilocal, PETSC_OWN_POINTER, liremote, PETSC_OWN_POINTER));
2193     PetscCall(PetscSFSetUp(lsf));
2194     *out = lsf;
2195   }
2196   PetscFunctionReturn(0);
2197 }
2198 
/* Similar to PetscSFBcast, but only Bcast to leaves on rank 0.

   Logged under the regular PETSCSF_BcastBegin event; the Begin/End pair is fused here since
   there is no separate BcastToZeroEnd. */
PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata)
{
  PetscMemType rootmtype, leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
  /* Detect whether the user buffers live on the host or a device */
  PetscCall(PetscGetMemType(rootdata, &rootmtype));
  PetscCall(PetscGetMemType(leafdata, &leafmtype));
  PetscUseTypeMethod(sf, BcastToZero, unit, rootmtype, rootdata, leafmtype, leafdata);
  PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
2214 
2215 /*@
2216   PetscSFConcatenate - concatenate multiple `PetscSF` into one
2217 
2218   Input Parameters:
2219 + comm - the communicator
2220 . nsfs - the number of input `PetscSF`
2221 . sfs  - the array of input `PetscSF`
2222 . shareRoots - the flag whether roots of input `PetscSF` are taken as shared (`PETSC_TRUE`), or separate and concatenated (`PETSC_FALSE`)
2223 - leafOffsets - the array of local leaf offsets, one for each input `PetscSF`, or NULL for contiguous storage
2224 
  Output Parameter:
. newsf - The resulting `PetscSF`
2227 
2228   Level: developer
2229 
2230   Notes:
2231   The communicator of all SFs in sfs must be comm.
2232 
2233   The offsets in leafOffsets are added to the original leaf indices.
2234 
2235   If all input SFs use contiguous leaf storage (ilocal = NULL), leafOffsets can be passed as NULL as well.
2236   In this case, NULL is also passed as ilocal to the resulting SF.
2237 
  If any input SF has non-null ilocal, leafOffsets is needed to distinguish leaves from different input SFs.
  In this case, the user is responsible for providing correct offsets so that the resulting leaves are unique (otherwise an error occurs).
2240 
2241 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`
2242 @*/
PetscErrorCode PetscSFConcatenate(MPI_Comm comm, PetscInt nsfs, PetscSF sfs[], PetscBool shareRoots, PetscInt leafOffsets[], PetscSF *newsf)
{
  PetscInt     i, s, nLeaves, nRoots;
  PetscInt    *leafArrayOffsets; /* prefix sums of per-SF leaf counts: where each SF's leaves start in the concatenated arrays */
  PetscInt    *ilocal_new;       /* concatenated leaf indices for the new SF; stays NULL when all inputs use contiguous storage */
  PetscSFNode *iremote_new;      /* concatenated (rank, root index) pairs for the new SF */
  PetscInt    *rootOffsets;      /* prefix sums of per-SF root counts; all zero when shareRoots (calloc'd below) */
  PetscBool    all_ilocal_null = PETSC_FALSE;
  PetscMPIInt  rank;

  PetscFunctionBegin;
  {
    PetscSF dummy; /* just to have a PetscObject on comm for input validation */

    PetscCall(PetscSFCreate(comm, &dummy));
    PetscValidLogicalCollectiveInt(dummy, nsfs, 2);
    PetscValidPointer(sfs, 3);
    for (i = 0; i < nsfs; i++) {
      PetscValidHeaderSpecific(sfs[i], PETSCSF_CLASSID, 3);
      PetscCheckSameComm(dummy, 1, sfs[i], 3);
    }
    PetscValidLogicalCollectiveBool(dummy, shareRoots, 4);
    if (leafOffsets) PetscValidIntPointer(leafOffsets, 5);
    PetscValidPointer(newsf, 6);
    PetscCall(PetscSFDestroy(&dummy));
  }
  /* Trivial case: no input SFs yields an empty SF (returns before any allocation, so no cleanup needed) */
  if (!nsfs) {
    PetscCall(PetscSFCreate(comm, newsf));
    PetscCall(PetscSFSetGraph(*newsf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  /* Determine the root space of the result:
     - shareRoots: all inputs address the same roots, so rootOffsets stays all-zero (calloc)
     - otherwise: roots are concatenated, rootOffsets[s] is the shift added to sfs[s]'s root indices */
  PetscCall(PetscCalloc1(nsfs + 1, &rootOffsets));
  if (shareRoots) {
    PetscCall(PetscSFGetGraph(sfs[0], &nRoots, NULL, NULL, NULL));
    if (PetscDefined(USE_DEBUG)) {
      /* In debug mode, verify that every input really has the same number of roots as sfs[0] */
      for (s = 1; s < nsfs; s++) {
        PetscInt nr;

        PetscCall(PetscSFGetGraph(sfs[s], &nr, NULL, NULL, NULL));
        PetscCheck(nr == nRoots, comm, PETSC_ERR_ARG_SIZ, "shareRoots = PETSC_TRUE but sfs[%" PetscInt_FMT "] has a different number of roots (%" PetscInt_FMT ") than sfs[0] (%" PetscInt_FMT ")", s, nr, nRoots);
      }
    }
  } else {
    for (s = 0; s < nsfs; s++) {
      PetscInt nr;

      PetscCall(PetscSFGetGraph(sfs[s], &nr, NULL, NULL, NULL));
      rootOffsets[s + 1] = rootOffsets[s] + nr;
    }
    nRoots = rootOffsets[nsfs];
  }

  /* Calculate leaf array offsets and automatic root offsets */
  PetscCall(PetscMalloc1(nsfs + 1, &leafArrayOffsets));
  leafArrayOffsets[0] = 0;
  for (s = 0; s < nsfs; s++) {
    PetscInt nl;

    PetscCall(PetscSFGetGraph(sfs[s], NULL, &nl, NULL, NULL));
    leafArrayOffsets[s + 1] = leafArrayOffsets[s] + nl;
  }
  nLeaves = leafArrayOffsets[nsfs];

  /* NULL leafOffsets is only legal when every input uses contiguous leaf storage (ilocal = NULL);
     in that case the result also uses contiguous storage and no renumbering is needed */
  if (!leafOffsets) {
    all_ilocal_null = PETSC_TRUE;
    for (s = 0; s < nsfs; s++) {
      const PetscInt *ilocal;

      PetscCall(PetscSFGetGraph(sfs[s], NULL, NULL, &ilocal, NULL));
      if (ilocal) {
        all_ilocal_null = PETSC_FALSE;
        break;
      }
    }
    PetscCheck(all_ilocal_null, PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "leafOffsets can be passed as NULL only if all SFs have ilocal = NULL");
  }

  /* Renumber and concatenate local leaves */
  ilocal_new = NULL;
  if (!all_ilocal_null) {
    PetscCall(PetscMalloc1(nLeaves, &ilocal_new));
    /* Pre-fill with -1 so an unset entry is recognizable while debugging */
    for (i = 0; i < nLeaves; i++) ilocal_new[i] = -1;
    for (s = 0; s < nsfs; s++) {
      const PetscInt *ilocal;
      PetscInt       *ilocal_l = &ilocal_new[leafArrayOffsets[s]]; /* this SF's slice of the concatenated array */
      PetscInt        i, nleaves_l;

      PetscCall(PetscSFGetGraph(sfs[s], NULL, &nleaves_l, &ilocal, NULL));
      /* ilocal = NULL means contiguous leaves 0..nleaves_l-1; user-given leafOffsets[s] shifts them into the new leaf space */
      for (i = 0; i < nleaves_l; i++) ilocal_l[i] = (ilocal ? ilocal[i] : i) + leafOffsets[s];
    }
  }

  /* Renumber and concatenate remote roots.
     Instead of renumbering locally, each input SF broadcasts its (possibly shifted) root numbering
     to its leaves: roots send (rank, index + rootOffsets[s]) pairs, which land directly in the
     corresponding slice of iremote_new. A PetscSFNode is two PetscInts, hence the MPIU_2INT unit. */
  PetscCall(PetscMalloc1(nLeaves, &iremote_new));
  /* Pre-fill with (-1,-1) so an unset entry is recognizable while debugging */
  for (i = 0; i < nLeaves; i++) {
    iremote_new[i].rank  = -1;
    iremote_new[i].index = -1;
  }
  for (s = 0; s < nsfs; s++) {
    PetscInt           i, nl, nr;
    PetscSF            tmp_sf;
    const PetscSFNode *iremote;
    PetscSFNode       *tmp_rootdata;
    PetscSFNode       *tmp_leafdata = &iremote_new[leafArrayOffsets[s]]; /* receive straight into the output slice */

    PetscCall(PetscSFGetGraph(sfs[s], &nr, &nl, NULL, &iremote));
    PetscCall(PetscSFCreate(comm, &tmp_sf));
    /* create helper SF with contiguous leaves */
    PetscCall(PetscSFSetGraph(tmp_sf, nr, nl, NULL, PETSC_USE_POINTER, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
    PetscCall(PetscSFSetUp(tmp_sf));
    PetscCall(PetscMalloc1(nr, &tmp_rootdata));
    /* Each root advertises its new global index and owning rank */
    for (i = 0; i < nr; i++) {
      tmp_rootdata[i].index = i + rootOffsets[s];
      tmp_rootdata[i].rank  = (PetscInt)rank;
    }
    PetscCall(PetscSFBcastBegin(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
    PetscCall(PetscSFDestroy(&tmp_sf));
    PetscCall(PetscFree(tmp_rootdata));
  }

  /* Build the new SF; ilocal_new/iremote_new ownership transfers to it (PETSC_OWN_POINTER) */
  PetscCall(PetscSFCreate(comm, newsf));
  PetscCall(PetscSFSetGraph(*newsf, nRoots, nLeaves, ilocal_new, PETSC_OWN_POINTER, iremote_new, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetUp(*newsf));
  PetscCall(PetscFree(rootOffsets));
  PetscCall(PetscFree(leafArrayOffsets));
  PetscFunctionReturn(0);
}
2374