xref: /petsc/src/vec/is/sf/interface/sf.c (revision c1730cc1cd3b6fc6205d70c430a9ca69e7268eec)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petsc/private/viewerimpl.h>
4 #include <petscctable.h>
5 
6 #if defined(PETSC_HAVE_CUDA)
7   #include <cuda_runtime.h>
8 #endif
9 
10 #if defined(PETSC_HAVE_HIP)
11   #include <hip/hip_runtime.h>
12 #endif
13 
#if defined(PETSC_CLANG_STATIC_ANALYZER)
void PetscSFCheckGraphSet(PetscSF, int);
#else
  #if defined(PETSC_USE_DEBUG)
    /* No trailing semicolon: the caller supplies it, so the macro expands to exactly one
       statement and remains safe in braceless if/else contexts. (The previous definition
       ended with ';', producing an extra empty statement at every call site.) */
    #define PetscSFCheckGraphSet(sf, arg) PetscCheck((sf)->graphset, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %d \"%s\" before %s()", (arg), #sf, PETSC_FUNCTION_NAME)
  #else
    /* Optimized builds: expand to a single no-op statement */
    #define PetscSFCheckGraphSet(sf, arg) \
      do { \
      } while (0)
  #endif
#endif
25 
/* String forms of the corresponding enums; the trailing entries (enum type name, value
   prefix, NULL) follow the standard PETSc enum-string-array convention and the leading
   entries must stay in sync with the enum value order declared in the public headers. */
const char *const PetscSFDuplicateOptions[]     = {"CONFONLY", "RANKS", "GRAPH", "PetscSFDuplicateOption", "PETSCSF_DUPLICATE_", NULL};
const char *const PetscSFConcatenateRootModes[] = {"local", "shared", "global", "PetscSFConcatenateRootMode", "PETSCSF_CONCATENATE_ROOTMODE_", NULL};
28 
29 /*@
30    PetscSFCreate - create a star forest communication context
31 
32    Collective
33 
34    Input Parameter:
35 .  comm - communicator on which the star forest will operate
36 
37    Output Parameter:
38 .  sf - new star forest context
39 
40    Options Database Keys:
41 .  -sf_type type - value of type may be
42 .vb
43     basic     -Use MPI persistent Isend/Irecv for communication (Default)
44     window    -Use MPI-3 one-sided window for communication
45     neighbor  -Use MPI-3 neighborhood collectives for communication
46 .ve
47 
48    Level: intermediate
49 
50    Note:
51    When one knows the communication graph is one of the predefined graph, such as `MPI_Alltoall()`, `MPI_Allgatherv()`,
52    `MPI_Gatherv()`, one can create a `PetscSF` and then set its graph with `PetscSFSetGraphWithPattern()`. These special
53    `SF`s are optimized and they have better performance than general `SF`s.
54 
55 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFSetGraphWithPattern()`, `PetscSFDestroy()`
56 @*/
57 PetscErrorCode PetscSFCreate(MPI_Comm comm, PetscSF *sf)
58 {
59   PetscSF b;
60 
61   PetscFunctionBegin;
62   PetscValidPointer(sf, 2);
63   PetscCall(PetscSFInitializePackage());
64 
65   PetscCall(PetscHeaderCreate(b, PETSCSF_CLASSID, "PetscSF", "Star Forest", "PetscSF", comm, PetscSFDestroy, PetscSFView));
66 
67   b->nroots    = -1;
68   b->nleaves   = -1;
69   b->minleaf   = PETSC_MAX_INT;
70   b->maxleaf   = PETSC_MIN_INT;
71   b->nranks    = -1;
72   b->rankorder = PETSC_TRUE;
73   b->ingroup   = MPI_GROUP_NULL;
74   b->outgroup  = MPI_GROUP_NULL;
75   b->graphset  = PETSC_FALSE;
76 #if defined(PETSC_HAVE_DEVICE)
77   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
78   b->use_stream_aware_mpi = PETSC_FALSE;
79   b->unknown_input_stream = PETSC_FALSE;
80   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
81   b->backend = PETSCSF_BACKEND_KOKKOS;
82   #elif defined(PETSC_HAVE_CUDA)
83   b->backend = PETSCSF_BACKEND_CUDA;
84   #elif defined(PETSC_HAVE_HIP)
85   b->backend = PETSCSF_BACKEND_HIP;
86   #endif
87 
88   #if defined(PETSC_HAVE_NVSHMEM)
89   b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
90   b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
91   PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem", &b->use_nvshmem, NULL));
92   PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem_get", &b->use_nvshmem_get, NULL));
93   #endif
94 #endif
95   b->vscat.from_n = -1;
96   b->vscat.to_n   = -1;
97   b->vscat.unit   = MPIU_SCALAR;
98   *sf             = b;
99   PetscFunctionReturn(0);
100 }
101 
/*@
   PetscSFReset - Reset a star forest so that different sizes or neighbors can be used

   Collective

   Input Parameter:
.  sf - star forest

   Level: advanced

.seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetGraph()`, `PetscSFDestroy()`
@*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscTryTypeMethod(sf, Reset); /* let the implementation release its own resources first */
  /* Return graph-describing fields to the freshly-created state (see PetscSFCreate()) */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  PetscCall(PetscFree(sf->mine_alloc));
  PetscCall(PetscFree(sf->remote_alloc));
  sf->nranks = -1;
  PetscCall(PetscFree4(sf->ranks, sf->roffset, sf->rmine, sf->rremote));
  sf->degreeknown = PETSC_FALSE;
  PetscCall(PetscFree(sf->degree));
  if (sf->ingroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->ingroup));
  if (sf->outgroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->outgroup));
  /* Break the multi-SF's back-reference before destroying it, avoiding recursive teardown */
  if (sf->multi) sf->multi->multi = NULL;
  PetscCall(PetscSFDestroy(&sf->multi));
  PetscCall(PetscLayoutDestroy(&sf->map));

#if defined(PETSC_HAVE_DEVICE)
  /* Release the device-side mirrors of the rmine arrays */
  for (PetscInt i = 0; i < 2; i++) PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, sf->rmine_d[i]));
#endif

  sf->setupcalled = PETSC_FALSE; /* a subsequent use will trigger PetscSFSetUp() again */
  PetscFunctionReturn(0);
}
145 
146 /*@C
147    PetscSFSetType - Set the `PetscSF` communication implementation
148 
149    Collective on sf
150 
151    Input Parameters:
152 +  sf - the `PetscSF` context
153 -  type - a known method
154 .vb
155     PETSCSFWINDOW - MPI-2/3 one-sided
156     PETSCSFBASIC - basic implementation using MPI-1 two-sided
157 .ve
158 
159    Options Database Key:
160 .  -sf_type <type> - Sets the method; for example basic or window use -help for a list of available methods (for instance, window, basic, neighbor)
161 
162   Level: intermediate
163 
164    Notes:
165    See "include/petscsf.h" for available methods (for instance)
166 
167 .seealso: `PetscSFType`, `PetscSFCreate()`
168 @*/
169 PetscErrorCode PetscSFSetType(PetscSF sf, PetscSFType type)
170 {
171   PetscBool match;
172   PetscErrorCode (*r)(PetscSF);
173 
174   PetscFunctionBegin;
175   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
176   PetscValidCharPointer(type, 2);
177 
178   PetscCall(PetscObjectTypeCompare((PetscObject)sf, type, &match));
179   if (match) PetscFunctionReturn(0);
180 
181   PetscCall(PetscFunctionListFind(PetscSFList, type, &r));
182   PetscCheck(r, PETSC_COMM_SELF, PETSC_ERR_ARG_UNKNOWN_TYPE, "Unable to find requested PetscSF type %s", type);
183   /* Destroy the previous PetscSF implementation context */
184   PetscTryTypeMethod(sf, Destroy);
185   PetscCall(PetscMemzero(sf->ops, sizeof(*sf->ops)));
186   PetscCall(PetscObjectChangeTypeName((PetscObject)sf, type));
187   PetscCall((*r)(sf));
188   PetscFunctionReturn(0);
189 }
190 
/*@C
  PetscSFGetType - Get the `PetscSF` communication implementation

  Not Collective

  Input Parameter:
. sf  - the `PetscSF` context

  Output Parameter:
. type - the `PetscSF` type name

  Level: intermediate

.seealso: `PetscSFType`, `PetscSFSetType()`, `PetscSFCreate()`
@*/
PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidPointer(type, 2);
  /* type_name may still be NULL if PetscSFSetType() has not been called yet */
  *type = ((PetscObject)sf)->type_name;
  PetscFunctionReturn(0);
}
214 
/*@C
   PetscSFDestroy - destroy star forest

   Collective

   Input Parameter:
.  sf - address of star forest

   Level: intermediate

.seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFReset()`
@*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0);
  PetscValidHeaderSpecific((*sf), PETSCSF_CLASSID, 1);
  /* Only the final reference actually tears the object down */
  if (--((PetscObject)(*sf))->refct > 0) {
    *sf = NULL;
    PetscFunctionReturn(0);
  }
  PetscCall(PetscSFReset(*sf));
  PetscTryTypeMethod((*sf), Destroy);
  PetscCall(PetscSFDestroy(&(*sf)->vscat.lsf));
  /* bs > 1 means vscat.unit was created with MPI_Type_dup (see PetscSFDuplicate()), so free it here */
  if ((*sf)->vscat.bs > 1) PetscCallMPI(MPI_Type_free(&(*sf)->vscat.unit));
  PetscCall(PetscHeaderDestroy(sf));
  PetscFunctionReturn(0);
}
243 
244 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
245 {
246   PetscInt           i, nleaves;
247   PetscMPIInt        size;
248   const PetscInt    *ilocal;
249   const PetscSFNode *iremote;
250 
251   PetscFunctionBegin;
252   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
253   PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, &iremote));
254   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
255   for (i = 0; i < nleaves; i++) {
256     const PetscInt rank   = iremote[i].rank;
257     const PetscInt remote = iremote[i].index;
258     const PetscInt leaf   = ilocal ? ilocal[i] : i;
259     PetscCheck(rank >= 0 && rank < size, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided rank (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be in [0, %d)", rank, i, size);
260     PetscCheck(remote >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided index (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be >= 0", remote, i);
261     PetscCheck(leaf >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided location (%" PetscInt_FMT ") for leaf %" PetscInt_FMT " is invalid, should be >= 0", leaf, i);
262   }
263   PetscFunctionReturn(0);
264 }
265 
266 /*@
267    PetscSFSetUp - set up communication structures
268 
269    Collective
270 
271    Input Parameter:
272 .  sf - star forest communication object
273 
274    Level: beginner
275 
276 .seealso: `PetscSFType`, `PetscSFSetFromOptions()`, `PetscSFSetType()`
277 @*/
278 PetscErrorCode PetscSFSetUp(PetscSF sf)
279 {
280   PetscFunctionBegin;
281   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
282   PetscSFCheckGraphSet(sf, 1);
283   if (sf->setupcalled) PetscFunctionReturn(0);
284   PetscCall(PetscLogEventBegin(PETSCSF_SetUp, sf, 0, 0, 0));
285   PetscCall(PetscSFCheckGraphValid_Private(sf));
286   if (!((PetscObject)sf)->type_name) PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); /* Zero all sf->ops */
287   PetscTryTypeMethod(sf, SetUp);
288 #if defined(PETSC_HAVE_CUDA)
289   if (sf->backend == PETSCSF_BACKEND_CUDA) {
290     sf->ops->Malloc = PetscSFMalloc_CUDA;
291     sf->ops->Free   = PetscSFFree_CUDA;
292   }
293 #endif
294 #if defined(PETSC_HAVE_HIP)
295   if (sf->backend == PETSCSF_BACKEND_HIP) {
296     sf->ops->Malloc = PetscSFMalloc_HIP;
297     sf->ops->Free   = PetscSFFree_HIP;
298   }
299 #endif
300 
301 #
302 #if defined(PETSC_HAVE_KOKKOS)
303   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
304     sf->ops->Malloc = PetscSFMalloc_Kokkos;
305     sf->ops->Free   = PetscSFFree_Kokkos;
306   }
307 #endif
308   PetscCall(PetscLogEventEnd(PETSCSF_SetUp, sf, 0, 0, 0));
309   sf->setupcalled = PETSC_TRUE;
310   PetscFunctionReturn(0);
311 }
312 
/*@
   PetscSFSetFromOptions - set `PetscSF` options using the options database

   Logically Collective

   Input Parameter:
.  sf - star forest

   Options Database Keys:
+  -sf_type               - implementation type, see PetscSFSetType()
.  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
.  -sf_use_default_stream - Assume callers of SF computed the input root/leafdata with the default cuda stream. SF will also
                            use the default stream to process data. Therefore, no stream synchronization is needed between SF and its caller (default: true).
                            If true, this option only works with -use_gpu_aware_mpi 1.
.  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
                               If true, this option only works with -use_gpu_aware_mpi 1.

-  -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently SF has these backends: cuda, hip and Kokkos.
                              On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
                              the only available is kokkos.

   Level: intermediate

.seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetType()`
@*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType deft;
  char        type[256];
  PetscBool   flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscObjectOptionsBegin((PetscObject)sf);
  /* Default to the current type if one is already set, otherwise PETSCSFBASIC */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  PetscCall(PetscOptionsFList("-sf_type", "PetscSF implementation type", "PetscSFSetType", PetscSFList, deft, type, sizeof(type), &flg));
  PetscCall(PetscSFSetType(sf, flg ? type : deft));
  PetscCall(PetscOptionsBool("-sf_rank_order", "sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise", "PetscSFSetRankOrder", sf->rankorder, &sf->rankorder, NULL));
#if defined(PETSC_HAVE_DEVICE)
  {
    char      backendstr[32] = {0};
    PetscBool isCuda = PETSC_FALSE, isHip = PETSC_FALSE, isKokkos = PETSC_FALSE, set;
    /* Change the defaults set in PetscSFCreate() with command line options */
    PetscCall(PetscOptionsBool("-sf_unknown_input_stream", "SF root/leafdata is computed on arbitrary streams unknown to SF", "PetscSFSetFromOptions", sf->unknown_input_stream, &sf->unknown_input_stream, NULL));
    PetscCall(PetscOptionsBool("-sf_use_stream_aware_mpi", "Assume the underlying MPI is cuda-stream aware", "PetscSFSetFromOptions", sf->use_stream_aware_mpi, &sf->use_stream_aware_mpi, NULL));
    PetscCall(PetscOptionsString("-sf_backend", "Select the device backend SF uses", "PetscSFSetFromOptions", NULL, backendstr, sizeof(backendstr), &set));
    /* Case-insensitive match of the requested backend name */
    PetscCall(PetscStrcasecmp("cuda", backendstr, &isCuda));
    PetscCall(PetscStrcasecmp("kokkos", backendstr, &isKokkos));
    PetscCall(PetscStrcasecmp("hip", backendstr, &isHip));
  #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
    if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
    else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
    else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
    else PetscCheck(!set, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
  #elif defined(PETSC_HAVE_KOKKOS)
    /* Kokkos-only builds: any explicit backend request other than kokkos is an error */
    PetscCheck(!set || isKokkos, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You can only choose kokkos", backendstr);
  #endif
  }
#endif
  PetscTryTypeMethod(sf, SetFromOptions, PetscOptionsObject);
  PetscOptionsEnd();
  PetscFunctionReturn(0);
}
376 
/*@
   PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order

   Logically Collective

   Input Parameters:
+  sf - star forest
-  flg - `PETSC_TRUE` to sort, `PETSC_FALSE` to skip sorting (lower setup cost, but non-deterministic)

   Level: advanced

.seealso: `PetscSFType`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`
@*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf, PetscBool flg)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveBool(sf, flg, 2);
  /* The multi-SF is built lazily; once it exists the ordering choice is baked in */
  PetscCheck(!sf->multi, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(0);
}
399 
/*@C
   PetscSFSetGraph - Set a parallel star forest

   Collective

   Input Parameters:
+  sf - star forest
.  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
.  nleaves - number of leaf vertices on the current process, each of these references a root on any process
.  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
during setup in debug mode)
.  localmode - copy mode for ilocal
.  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
during setup in debug mode)
-  remotemode - copy mode for iremote

   Level: intermediate

   Notes:
   Leaf indices in ilocal must be unique, otherwise an error occurs.

   Input arrays ilocal and iremote follow the PetscCopyMode semantics.
   In particular, if localmode/remotemode is `PETSC_OWN_POINTER` or `PETSC_USE_POINTER`,
   PETSc might modify the respective array;
   if `PETSC_USE_POINTER`, the user must delete the array after PetscSFDestroy().
   Only if `PETSC_COPY_VALUES` is used, the respective array is guaranteed to stay intact and a const array can be passed (but a cast to non-const is needed).

   Fortran Note:
   In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode.

   Developer Note:
   We sort leaves to check for duplicates and contiguousness and to find minleaf/maxleaf.
   This also allows to compare leaf sets of two SFs easily.

.seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
@*/
PetscErrorCode PetscSFSetGraph(PetscSF sf, PetscInt nroots, PetscInt nleaves, PetscInt *ilocal, PetscCopyMode localmode, PetscSFNode *iremote, PetscCopyMode remotemode)
{
  PetscBool unique, contiguous;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal, 4);
  if (nleaves > 0) PetscValidPointer(iremote, 6);
  PetscCheck(nroots >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nroots %" PetscInt_FMT ", cannot be negative", nroots);
  PetscCheck(nleaves >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nleaves %" PetscInt_FMT ", cannot be negative", nleaves);
  /* enums may be handled as unsigned by some compilers, NVHPC for example, the int cast
   * below is to prevent NVHPC from warning about meaningless comparison of unsigned with zero */
  PetscCheck((int)localmode >= PETSC_COPY_VALUES && localmode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong localmode %d", localmode);
  PetscCheck((int)remotemode >= PETSC_COPY_VALUES && remotemode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong remotemode %d", remotemode);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    PetscCall(PetscSFReset(sf));
  }

  PetscCall(PetscLogEventBegin(PETSCSF_SetGraph, sf, 0, 0, 0));

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  /* With PETSC_COPY_VALUES we take private copies so the caller's arrays are never touched */
  if (localmode == PETSC_COPY_VALUES && ilocal) {
    PetscInt *tlocal = NULL;

    PetscCall(PetscMalloc1(nleaves, &tlocal));
    PetscCall(PetscArraycpy(tlocal, ilocal, nleaves));
    ilocal = tlocal;
  }
  if (remotemode == PETSC_COPY_VALUES) {
    PetscSFNode *tremote = NULL;

    PetscCall(PetscMalloc1(nleaves, &tremote));
    PetscCall(PetscArraycpy(tremote, iremote, nleaves));
    iremote = tremote;
  }

  if (nleaves && ilocal) {
    PetscSFNode work;

    /* Sort leaves (dragging iremote along) to detect duplicates/contiguity and find min/max leaf;
       this is why PETSC_OWN_POINTER / PETSC_USE_POINTER arrays may be modified */
    PetscCall(PetscSortIntWithDataArray(nleaves, ilocal, iremote, sizeof(PetscSFNode), &work));
    PetscCall(PetscSortedCheckDupsInt(nleaves, ilocal, &unique));
    unique = PetscNot(unique); /* PetscSortedCheckDupsInt reports "has dups"; invert to "is unique" */
    PetscCheck(sf->allow_multi_leaves || unique, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Input ilocal has duplicate entries which is not allowed for this PetscSF");
    sf->minleaf = ilocal[0];
    sf->maxleaf = ilocal[nleaves - 1];
    contiguous  = (PetscBool)(unique && ilocal[0] == 0 && ilocal[nleaves - 1] == nleaves - 1);
  } else {
    /* NULL ilocal already means contiguous leaves [0, nleaves) */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
    unique      = PETSC_TRUE;
    contiguous  = PETSC_TRUE;
  }

  /* A contiguous ilocal is redundant; drop it and store NULL instead */
  if (contiguous) {
    if (localmode == PETSC_USE_POINTER) {
      ilocal = NULL;
    } else {
      PetscCall(PetscFree(ilocal));
    }
  }
  sf->mine = ilocal;
  /* *_alloc records ownership: NULL means the caller keeps ownership (PETSC_USE_POINTER) */
  if (localmode == PETSC_USE_POINTER) {
    sf->mine_alloc = NULL;
  } else {
    sf->mine_alloc = ilocal;
  }
  sf->remote = iremote;
  if (remotemode == PETSC_USE_POINTER) {
    sf->remote_alloc = NULL;
  } else {
    sf->remote_alloc = iremote;
  }
  PetscCall(PetscLogEventEnd(PETSCSF_SetGraph, sf, 0, 0, 0));
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
515 
/*@
  PetscSFSetGraphWithPattern - Sets the graph of a `PetscSF` with a specific pattern

  Collective

  Input Parameters:
+ sf      - The `PetscSF`
. map     - Layout of roots over all processes (insignificant when pattern is `PETSCSF_PATTERN_ALLTOALL`)
- pattern - One of `PETSCSF_PATTERN_ALLGATHER`, `PETSCSF_PATTERN_GATHER`, `PETSCSF_PATTERN_ALLTOALL`

  Level: intermediate

  Notes:
  It is easier to explain `PetscSFPattern` using vectors. Suppose we have an MPI vector x and its layout is map.
  n and N are local and global sizes of x respectively.

  With `PETSCSF_PATTERN_ALLGATHER`, the routine creates a graph that if one does Bcast on it, it will copy x to
  sequential vectors y on all ranks.

  With `PETSCSF_PATTERN_GATHER`, the routine creates a graph that if one does Bcast on it, it will copy x to a
  sequential vector y on rank 0.

  In above cases, entries of x are roots and entries of y are leaves.

  With `PETSCSF_PATTERN_ALLTOALL`, map is insignificant. Suppose NP is size of sf's communicator. The routine
  creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
  of rank j. Here 0 <=i,j<NP. It is a kind of `MPI_Alltoall()` with sendcount/recvcount being 1. Note that it does
  not mean one can not send multiple items. One just needs to create a new MPI datatype for the mulptiple data
  items with `MPI_Type_contiguous` and use that as the <unit> argument in SF routines.

  In this case, roots and leaves are symmetric.

.seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
 @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf, PetscLayout map, PetscSFPattern pattern)
{
  MPI_Comm    comm;
  PetscInt    n, N, res[2];
  PetscMPIInt rank, size;
  PetscSFType type;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  if (pattern != PETSCSF_PATTERN_ALLTOALL) PetscValidPointer(map, 2);
  PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
  PetscCheck(pattern >= PETSCSF_PATTERN_ALLGATHER && pattern <= PETSCSF_PATTERN_ALLTOALL, comm, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported PetscSFPattern %d", pattern);
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* ALLTOALL ignores the caller's map; build a size-per-rank layout internally */
    type = PETSCSFALLTOALL;
    PetscCall(PetscLayoutCreate(comm, &sf->map));
    PetscCall(PetscLayoutSetLocalSize(sf->map, size));
    PetscCall(PetscLayoutSetSize(sf->map, ((PetscInt)size) * size));
    PetscCall(PetscLayoutSetUp(sf->map));
  } else {
    PetscCall(PetscLayoutGetLocalSize(map, &n));
    PetscCall(PetscLayoutGetSize(map, &N));
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    PetscCall(MPIU_Allreduce(MPI_IN_PLACE, res, 2, MPIU_INT, MPI_MAX, comm));
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER : PETSCSFGATHER;
    } else {
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    PetscCall(PetscLayoutReference(map, &sf->map));
  }
  PetscCall(PetscSFSetType(sf, type));

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    sf->nleaves = N;
    sf->nroots  = n;
    sf->nranks  = size;
    sf->minleaf = 0;
    sf->maxleaf = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves; all other ranks contribute roots only */
    sf->nleaves = rank ? 0 : N;
    sf->nroots  = n;
    sf->nranks  = rank ? 0 : size;
    sf->minleaf = 0;
    sf->maxleaf = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    sf->nleaves = size;
    sf->nroots  = size;
    sf->nranks  = size;
    sf->minleaf = 0;
    sf->maxleaf = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
616 
617 /*@
618    PetscSFCreateInverseSF - given a `PetscSF` in which all vertices have degree 1, creates the inverse map
619 
620    Collective
621 
622    Input Parameter:
623 .  sf - star forest to invert
624 
625    Output Parameter:
626 .  isf - inverse of sf
627 
628    Level: advanced
629 
630    Notes:
631    All roots must have degree 1.
632 
633    The local space may be a permutation, but cannot be sparse.
634 
635 .seealso: `PetscSFType`, `PetscSFSetGraph()`
636 @*/
637 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf, PetscSF *isf)
638 {
639   PetscMPIInt     rank;
640   PetscInt        i, nroots, nleaves, maxlocal, count, *newilocal;
641   const PetscInt *ilocal;
642   PetscSFNode    *roots, *leaves;
643 
644   PetscFunctionBegin;
645   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
646   PetscSFCheckGraphSet(sf, 1);
647   PetscValidPointer(isf, 2);
648 
649   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, NULL));
650   maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
651 
652   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
653   PetscCall(PetscMalloc2(nroots, &roots, maxlocal, &leaves));
654   for (i = 0; i < maxlocal; i++) {
655     leaves[i].rank  = rank;
656     leaves[i].index = i;
657   }
658   for (i = 0; i < nroots; i++) {
659     roots[i].rank  = -1;
660     roots[i].index = -1;
661   }
662   PetscCall(PetscSFReduceBegin(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
663   PetscCall(PetscSFReduceEnd(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
664 
665   /* Check whether our leaves are sparse */
666   for (i = 0, count = 0; i < nroots; i++)
667     if (roots[i].rank >= 0) count++;
668   if (count == nroots) newilocal = NULL;
669   else { /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */ PetscCall(PetscMalloc1(count, &newilocal));
670     for (i = 0, count = 0; i < nroots; i++) {
671       if (roots[i].rank >= 0) {
672         newilocal[count]   = i;
673         roots[count].rank  = roots[i].rank;
674         roots[count].index = roots[i].index;
675         count++;
676       }
677     }
678   }
679 
680   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, isf));
681   PetscCall(PetscSFSetGraph(*isf, maxlocal, count, newilocal, PETSC_OWN_POINTER, roots, PETSC_COPY_VALUES));
682   PetscCall(PetscFree2(roots, leaves));
683   PetscFunctionReturn(0);
684 }
685 
/*@
   PetscSFDuplicate - duplicate a `PetscSF`, optionally preserving rank connectivity and graph

   Collective

   Input Parameters:
+  sf - communication object to duplicate
-  opt - `PETSCSF_DUPLICATE_CONFONLY`, `PETSCSF_DUPLICATE_RANKS`, or `PETSCSF_DUPLICATE_GRAPH` (see `PetscSFDuplicateOption`)

   Output Parameter:
.  newsf - new communication object

   Level: beginner

.seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFSetType()`, `PetscSFSetGraph()`
@*/
PetscErrorCode PetscSFDuplicate(PetscSF sf, PetscSFDuplicateOption opt, PetscSF *newsf)
{
  PetscSFType  type;
  MPI_Datatype dtype = MPIU_SCALAR;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveEnum(sf, opt, 2);
  PetscValidPointer(newsf, 3);
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sf), newsf));
  PetscCall(PetscSFGetType(sf, &type));
  if (type) PetscCall(PetscSFSetType(*newsf, type));
  (*newsf)->allow_multi_leaves = sf->allow_multi_leaves; /* Dup this flag earlier since PetscSFSetGraph() below checks on this flag */
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf, 1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graphs are deep-copied via Get/SetGraph */
      PetscInt           nroots, nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
      PetscCall(PetscSFSetGraph(*newsf, nroots, nleaves, (PetscInt *)ilocal, PETSC_COPY_VALUES, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
    } else {
      /* Patterned graphs are reconstructed from the stored layout and pattern */
      PetscCall(PetscSFSetGraphWithPattern(*newsf, sf->map, sf->pattern));
    }
  }
  /* Since oldtype is committed, so is newtype, according to MPI */
  if (sf->vscat.bs > 1) PetscCallMPI(MPI_Type_dup(sf->vscat.unit, &dtype));
  (*newsf)->vscat.bs     = sf->vscat.bs;
  (*newsf)->vscat.unit   = dtype;
  (*newsf)->vscat.to_n   = sf->vscat.to_n;
  (*newsf)->vscat.from_n = sf->vscat.from_n;
  /* Do not copy lsf. Build it on demand since it is rarely used */

#if defined(PETSC_HAVE_DEVICE)
  (*newsf)->backend              = sf->backend;
  (*newsf)->unknown_input_stream = sf->unknown_input_stream;
  (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
  (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
#endif
  PetscTryTypeMethod(sf, Duplicate, opt, *newsf);
  /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
  PetscFunctionReturn(0);
}
745 
746 /*@C
747    PetscSFGetGraph - Get the graph specifying a parallel star forest
748 
749    Not Collective
750 
751    Input Parameter:
752 .  sf - star forest
753 
754    Output Parameters:
755 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
756 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
757 .  ilocal - locations of leaves in leafdata buffers (if returned value is NULL, it means leaves are in contiguous storage)
758 -  iremote - remote locations of root vertices for each leaf on the current process
759 
760    Level: intermediate
761 
762    Notes:
763      We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
764 
765      The returned ilocal/iremote might contain values in different order than the input ones in `PetscSFSetGraph()`,
766      see its manpage for details.
767 
768    Fortran Notes:
769      The returned iremote array is a copy and must be deallocated after use. Consequently, if you
770      want to update the graph, you must call `PetscSFSetGraph()` after modifying the iremote array.
771 
772      To check for a NULL ilocal use
773 $      if (loc(ilocal) == loc(PETSC_NULL_INTEGER)) then
774 
775 .seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`
776 @*/
777 PetscErrorCode PetscSFGetGraph(PetscSF sf, PetscInt *nroots, PetscInt *nleaves, const PetscInt **ilocal, const PetscSFNode **iremote)
778 {
779   PetscFunctionBegin;
780   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
781   if (sf->ops->GetGraph) {
782     PetscCall((sf->ops->GetGraph)(sf, nroots, nleaves, ilocal, iremote));
783   } else {
784     if (nroots) *nroots = sf->nroots;
785     if (nleaves) *nleaves = sf->nleaves;
786     if (ilocal) *ilocal = sf->mine;
787     if (iremote) *iremote = sf->remote;
788   }
789   PetscFunctionReturn(0);
790 }
791 
792 /*@
793    PetscSFGetLeafRange - Get the active leaf ranges
794 
795    Not Collective
796 
797    Input Parameter:
798 .  sf - star forest
799 
800    Output Parameters:
801 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
802 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
803 
804    Level: developer
805 
806 .seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
807 @*/
808 PetscErrorCode PetscSFGetLeafRange(PetscSF sf, PetscInt *minleaf, PetscInt *maxleaf)
809 {
810   PetscFunctionBegin;
811   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
812   PetscSFCheckGraphSet(sf, 1);
813   if (minleaf) *minleaf = sf->minleaf;
814   if (maxleaf) *maxleaf = sf->maxleaf;
815   PetscFunctionReturn(0);
816 }
817 
818 /*@C
819    PetscSFViewFromOptions - View a `PetscSF` based on arguments in the options database
820 
821    Collective on A
822 
823    Input Parameters:
824 +  A - the star forest
825 .  obj - Optional object that provides the prefix for the option names
826 -  name - command line option
827 
828    Level: intermediate
829 
830 .seealso: `PetscSF`, `PetscSFView`, `PetscObjectViewFromOptions()`, `PetscSFCreate()`
831 @*/
832 PetscErrorCode PetscSFViewFromOptions(PetscSF A, PetscObject obj, const char name[])
833 {
834   PetscFunctionBegin;
835   PetscValidHeaderSpecific(A, PETSCSF_CLASSID, 1);
836   PetscCall(PetscObjectViewFromOptions((PetscObject)A, obj, name));
837   PetscFunctionReturn(0);
838 }
839 
840 /*@C
841    PetscSFView - view a star forest
842 
843    Collective
844 
845    Input Parameters:
846 +  sf - star forest
847 -  viewer - viewer to display graph, for example `PETSC_VIEWER_STDOUT_WORLD`
848 
849    Level: beginner
850 
851 .seealso: `PetscSF`, `PetscViewer`, `PetscSFCreate()`, `PetscSFSetGraph()`
852 @*/
PetscErrorCode PetscSFView(PetscSF sf, PetscViewer viewer)
{
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* Default to the stdout viewer on the SF's communicator when none is supplied */
  if (!viewer) PetscCall(PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf), &viewer));
  PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
  PetscCheckSameComm(sf, 1, viewer, 2);
  /* Setting up builds the per-rank tables (ranks/roffset/rmine/rremote) printed in the detailed view below */
  if (sf->graphset) PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
  if (iascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) {
    PetscMPIInt rank;
    PetscInt    ii, i, j;

    PetscCall(PetscObjectPrintClassNamePrefixType((PetscObject)sf, viewer));
    PetscCall(PetscViewerASCIIPushTab(viewer));
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* Nothing to print for the graph itself if it was never set; bail out after balancing the tab push */
      if (!sf->graphset) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "PetscSFSetGraph() has not been called yet\n"));
        PetscCall(PetscViewerASCIIPopTab(viewer));
        PetscFunctionReturn(0);
      }
      PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
      /* Synchronized printing keeps per-rank output grouped and in rank order */
      PetscCall(PetscViewerASCIIPushSynchronized(viewer));
      PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Number of roots=%" PetscInt_FMT ", leaves=%" PetscInt_FMT ", remote ranks=%" PetscInt_FMT "\n", rank, sf->nroots, sf->nleaves, sf->nranks));
      /* One line per leaf: local index <- (remote rank, remote index); leaf numbering is implicit when sf->mine is NULL */
      for (i = 0; i < sf->nleaves; i++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %" PetscInt_FMT " <- (%" PetscInt_FMT ",%" PetscInt_FMT ")\n", rank, sf->mine ? sf->mine[i] : i, sf->remote[i].rank, sf->remote[i].index));
      PetscCall(PetscViewerFlush(viewer));
      PetscCall(PetscViewerGetFormat(viewer, &format));
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks, *perm;
        /* Sort a copy of the rank list (via a permutation) so edges are listed in ascending rank order */
        PetscCall(PetscMalloc2(sf->nranks, &tmpranks, sf->nranks, &perm));
        PetscCall(PetscArraycpy(tmpranks, sf->ranks, sf->nranks));
        for (i = 0; i < sf->nranks; i++) perm[i] = i;
        PetscCall(PetscSortMPIIntWithArray(sf->nranks, tmpranks, perm));
        PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Roots referenced by my leaves, by rank\n", rank));
        for (ii = 0; ii < sf->nranks; ii++) {
          i = perm[ii];
          /* roffset[i]..roffset[i+1] delimits this rank's slice of rmine/rremote */
          PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %d: %" PetscInt_FMT " edges\n", rank, sf->ranks[i], sf->roffset[i + 1] - sf->roffset[i]));
          for (j = sf->roffset[i]; j < sf->roffset[i + 1]; j++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d]    %" PetscInt_FMT " <- %" PetscInt_FMT "\n", rank, sf->rmine[j], sf->rremote[j]));
        }
        PetscCall(PetscFree2(tmpranks, perm));
      }
      PetscCall(PetscViewerFlush(viewer));
      PetscCall(PetscViewerASCIIPopSynchronized(viewer));
    }
    PetscCall(PetscViewerASCIIPopTab(viewer));
  }
  /* Let the implementation (basic/window/neighbor) append its own details */
  PetscTryTypeMethod(sf, View, viewer);
  PetscFunctionReturn(0);
}
905 
906 /*@C
907    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
908 
909    Not Collective
910 
911    Input Parameter:
912 .  sf - star forest
913 
914    Output Parameters:
915 +  nranks - number of ranks referenced by local part
916 .  ranks - array of ranks
917 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
918 .  rmine - concatenated array holding local indices referencing each remote rank
919 -  rremote - concatenated array holding remote indices referenced for each remote rank
920 
921    Level: developer
922 
923 .seealso: `PetscSF`, `PetscSFGetLeafRanks()`
924 @*/
925 PetscErrorCode PetscSFGetRootRanks(PetscSF sf, PetscInt *nranks, const PetscMPIInt **ranks, const PetscInt **roffset, const PetscInt **rmine, const PetscInt **rremote)
926 {
927   PetscFunctionBegin;
928   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
929   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
930   if (sf->ops->GetRootRanks) {
931     PetscCall((sf->ops->GetRootRanks)(sf, nranks, ranks, roffset, rmine, rremote));
932   } else {
933     /* The generic implementation */
934     if (nranks) *nranks = sf->nranks;
935     if (ranks) *ranks = sf->ranks;
936     if (roffset) *roffset = sf->roffset;
937     if (rmine) *rmine = sf->rmine;
938     if (rremote) *rremote = sf->rremote;
939   }
940   PetscFunctionReturn(0);
941 }
942 
943 /*@C
944    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
945 
946    Not Collective
947 
948    Input Parameter:
949 .  sf - star forest
950 
951    Output Parameters:
952 +  niranks - number of leaf ranks referencing roots on this process
953 .  iranks - array of ranks
954 .  ioffset - offset in irootloc for each rank (length niranks+1)
955 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
956 
957    Level: developer
958 
959 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
960 @*/
961 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf, PetscInt *niranks, const PetscMPIInt **iranks, const PetscInt **ioffset, const PetscInt **irootloc)
962 {
963   PetscFunctionBegin;
964   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
965   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
966   if (sf->ops->GetLeafRanks) {
967     PetscCall((sf->ops->GetLeafRanks)(sf, niranks, iranks, ioffset, irootloc));
968   } else {
969     PetscSFType type;
970     PetscCall(PetscSFGetType(sf, &type));
971     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
972   }
973   PetscFunctionReturn(0);
974 }
975 
976 static PetscBool InList(PetscMPIInt needle, PetscMPIInt n, const PetscMPIInt *list)
977 {
978   PetscInt i;
979   for (i = 0; i < n; i++) {
980     if (needle == list[i]) return PETSC_TRUE;
981   }
982   return PETSC_FALSE;
983 }
984 
985 /*@C
986    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by `PetscSF` implementations.
987 
988    Collective
989 
990    Input Parameters:
991 +  sf - `PetscSF` to set up; `PetscSFSetGraph()` must have been called
992 -  dgroup - `MPI_Group` of ranks to be distinguished (e.g., for self or shared memory exchange)
993 
994    Level: developer
995 
996 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
997 @*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf, MPI_Group dgroup)
{
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size, groupsize, *groupranks;
  PetscInt          *rcount, *ranks;
  PetscInt           i, irank = -1, orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
  /* Count, per remote rank, how many of our leaves reference a root on that rank */
  PetscCall(PetscTableCreate(10, size, &table));
  for (i = 0; i < sf->nleaves; i++) {
    /* Log 1-based rank (PetscTable cannot store key 0) */
    PetscCall(PetscTableAdd(table, sf->remote[i].rank + 1, 1, ADD_VALUES));
  }
  PetscCall(PetscTableGetCount(table, &sf->nranks));
  PetscCall(PetscMalloc4(sf->nranks, &sf->ranks, sf->nranks + 1, &sf->roffset, sf->nleaves, &sf->rmine, sf->nleaves, &sf->rremote));
  PetscCall(PetscMalloc2(sf->nranks, &rcount, sf->nranks, &ranks));
  PetscCall(PetscTableGetHeadPosition(table, &pos));
  /* Flatten the table into parallel arrays: ranks[] (which ranks) and rcount[] (how many edges each) */
  for (i = 0; i < sf->nranks; i++) {
    PetscCall(PetscTableGetNext(table, &pos, &ranks[i], &rcount[i]));
    ranks[i]--; /* Convert back to 0-based */
  }
  PetscCall(PetscTableDestroy(&table));

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group    group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate the distinguished group's members into rank numbers of the SF communicator */
    PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
    PetscCallMPI(MPI_Group_size(dgroup, &groupsize));
    PetscCall(PetscMalloc1(groupsize, &dgroupranks));
    PetscCall(PetscMalloc1(groupsize, &groupranks));
    for (i = 0; i < groupsize; i++) dgroupranks[i] = i;
    if (groupsize) PetscCallMPI(MPI_Group_translate_ranks(dgroup, groupsize, dgroupranks, group, groupranks));
    PetscCallMPI(MPI_Group_free(&group));
    PetscCall(PetscFree(dgroupranks));
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  /* Two-pointer swap scheme: sf->ndranks advances from the front, i retreats from the back */
  for (sf->ndranks = 0, i = sf->nranks; sf->ndranks < i;) {
    for (i--; sf->ndranks < i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i], groupsize, groupranks)) break;
    }
    for (; sf->ndranks <= i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks], groupsize, groupranks)) break;
    }
    if (sf->ndranks < i) { /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank, tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  PetscCall(PetscFree(groupranks));
  /* Sort each partition by rank so lookups below can use binary search (PetscFindMPIInt) */
  PetscCall(PetscSortIntWithArray(sf->ndranks, ranks, rcount));
  PetscCall(PetscSortIntWithArray(sf->nranks - sf->ndranks, ranks + sf->ndranks, rcount + sf->ndranks));
  /* Build CSR-style offsets into rmine/rremote; rcount[] is reset to serve as per-rank insertion cursor */
  sf->roffset[0] = 0;
  for (i = 0; i < sf->nranks; i++) {
    PetscCall(PetscMPIIntCast(ranks[i], sf->ranks + i));
    sf->roffset[i + 1] = sf->roffset[i] + rcount[i];
    rcount[i]          = 0;
  }
  /* Bucket each leaf edge into its owning rank's slice of rmine (local index) and rremote (root index) */
  for (i = 0, irank = -1, orank = -1; i < sf->nleaves; i++) {
    /* short circuit: consecutive leaves often target the same rank, so reuse the last lookup */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks (distinguished partition first) */
      PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->ndranks, sf->ranks, &irank));
      if (irank < 0) {
        PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->nranks - sf->ndranks, sf->ranks + sf->ndranks, &irank));
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    PetscCheck(irank >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Could not find rank %" PetscInt_FMT " in array", sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  PetscCall(PetscFree2(rcount, ranks));
  PetscFunctionReturn(0);
}
1087 
1088 /*@C
1089    PetscSFGetGroups - gets incoming and outgoing process groups
1090 
1091    Collective
1092 
1093    Input Parameter:
1094 .  sf - star forest
1095 
1096    Output Parameters:
1097 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1098 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1099 
1100    Level: developer
1101 
1102 .seealso: `PetscSF`, `PetscSFGetWindow()`, `PetscSFRestoreWindow()`
1103 @*/
PetscErrorCode PetscSFGetGroups(PetscSF sf, MPI_Group *incoming, MPI_Group *outgoing)
{
  MPI_Group group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  PetscCheck(sf->nranks >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUpRanks() before obtaining groups");
  /* Groups are computed lazily and cached on the SF; ownership stays with the SF */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt        i;
    const PetscInt *indegree;
    PetscMPIInt     rank, *outranks, *inranks;
    PetscSFNode    *remote;
    PetscSF         bgcount;

    /* Compute the number of incoming ranks */
    /* Build an auxiliary SF with one root and one leaf per referenced rank; its root degree
       is then the number of ranks whose leaves reference roots here */
    PetscCall(PetscMalloc1(sf->nranks, &remote));
    for (i = 0; i < sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, &bgcount));
    PetscCall(PetscSFSetGraph(bgcount, 1, sf->nranks, NULL, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
    PetscCall(PetscSFComputeDegreeBegin(bgcount, &indegree));
    PetscCall(PetscSFComputeDegreeEnd(bgcount, &indegree));
    /* Enumerate the incoming ranks: each referencing rank gathers its own rank number to us */
    PetscCall(PetscMalloc2(indegree[0], &inranks, sf->nranks, &outranks));
    PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
    for (i = 0; i < sf->nranks; i++) outranks[i] = rank;
    PetscCall(PetscSFGatherBegin(bgcount, MPI_INT, outranks, inranks));
    PetscCall(PetscSFGatherEnd(bgcount, MPI_INT, outranks, inranks));
    PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
    PetscCallMPI(MPI_Group_incl(group, indegree[0], inranks, &sf->ingroup));
    PetscCallMPI(MPI_Group_free(&group));
    PetscCall(PetscFree2(inranks, outranks));
    PetscCall(PetscSFDestroy(&bgcount));
  }
  *incoming = sf->ingroup;

  /* The outgoing group is simply the (already known) list of root ranks we reference */
  if (sf->outgroup == MPI_GROUP_NULL) {
    PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
    PetscCallMPI(MPI_Group_incl(group, sf->nranks, sf->ranks, &sf->outgroup));
    PetscCallMPI(MPI_Group_free(&group));
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1149 
1150 /*@
1151    PetscSFGetMultiSF - gets the inner `PetscSF` implementing gathers and scatters
1152 
1153    Collective
1154 
1155    Input Parameter:
1156 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1157 
1158    Output Parameter:
1159 .  multi - star forest with split roots, such that each root has degree exactly 1
1160 
1161    Level: developer
1162 
1163    Note:
1164    In most cases, users should use `PetscSFGatherBegin()` and `PetscSFScatterBegin()` instead of manipulating multi
1165    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1166    edge, it is a candidate for future optimization that might involve its removal.
1167 
1168 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`, `PetscSFComputeMultiRootOriginalNumbering()`
1169 @*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf, PetscSF *multi)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidPointer(multi, 2);
  if (sf->nroots < 0) { /* Graph has not been set yet; why do we need this? */
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
    *multi           = sf->multi;
    sf->multi->multi = sf->multi; /* A multi-SF is its own multi-SF */
    PetscFunctionReturn(0);
  }
  /* The multi-SF is built lazily and cached; ownership stays with sf */
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt        i, *inoffset, *outones, *outoffset, maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
    PetscCall(PetscSFComputeDegreeBegin(sf, &indegree));
    PetscCall(PetscSFComputeDegreeEnd(sf, &indegree));
    PetscCall(PetscMalloc3(sf->nroots + 1, &inoffset, maxlocal, &outones, maxlocal, &outoffset));
    /* inoffset[] is the prefix sum of root degrees: each root's slots in the split (multi) root space */
    inoffset[0] = 0;
    for (i = 0; i < sf->nroots; i++) inoffset[i + 1] = inoffset[i] + indegree[i];
    for (i = 0; i < maxlocal; i++) outones[i] = 1;
    /* Fetch-and-add of 1 hands each leaf a distinct slot index within its root's range */
    PetscCall(PetscSFFetchAndOpBegin(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
    PetscCall(PetscSFFetchAndOpEnd(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
    for (i = 0; i < sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
    if (PetscDefined(USE_DEBUG)) {                               /* Check that the expected number of increments occurred */
      for (i = 0; i < sf->nroots; i++) PetscCheck(inoffset[i] + indegree[i] == inoffset[i + 1], PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incorrect result after PetscSFFetchAndOp");
    }
    /* Same leaves as sf, but each leaf now points at its private slot in the split root space */
    PetscCall(PetscMalloc1(sf->nleaves, &remote));
    for (i = 0; i < sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
    sf->multi->multi = sf->multi;
    PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
    if (sf->rankorder) { /* Sort the ranks */
      PetscMPIInt  rank;
      PetscInt    *inranks, *newoffset, *outranks, *newoutoffset, *tmpoffset, maxdegree;
      PetscSFNode *newremote;
      PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
      for (i = 0, maxdegree = 0; i < sf->nroots; i++) maxdegree = PetscMax(maxdegree, indegree[i]);
      PetscCall(PetscMalloc5(sf->multi->nroots, &inranks, sf->multi->nroots, &newoffset, maxlocal, &outranks, maxlocal, &newoutoffset, maxdegree, &tmpoffset));
      /* Collect at each multi-root the rank of the leaf that owns it */
      for (i = 0; i < maxlocal; i++) outranks[i] = rank;
      PetscCall(PetscSFReduceBegin(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i = 0; i < sf->nroots; i++) {
        PetscInt j;
        for (j = 0; j < indegree[i]; j++) tmpoffset[j] = j;
        PetscCall(PetscSortIntWithArray(indegree[i], inranks + inoffset[i], tmpoffset));
        for (j = 0; j < indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Tell each leaf its rank-sorted slot index, then rebuild the graph with those slots */
      PetscCall(PetscSFBcastBegin(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
      PetscCall(PetscMalloc1(sf->nleaves, &newremote));
      for (i = 0; i < sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, newremote, PETSC_OWN_POINTER));
      PetscCall(PetscFree5(inranks, newoffset, outranks, newoutoffset, tmpoffset));
    }
    PetscCall(PetscFree3(inoffset, outones, outoffset));
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1238 
1239 /*@C
1240    PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots, does not remap indices
1241 
1242    Collective
1243 
1244    Input Parameters:
1245 +  sf - original star forest
1246 .  nselected  - number of selected roots on this process
1247 -  selected   - indices of the selected roots on this process
1248 
1249    Output Parameter:
1250 .  esf - new star forest
1251 
1252    Level: advanced
1253 
1254    Note:
1255    To use the new `PetscSF`, it may be necessary to know the indices of the leaves that are still participating. This can
1256    be done by calling PetscSFGetGraph().
1257 
1258 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1259 @*/
1260 PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *esf)
1261 {
1262   PetscInt           i, j, n, nroots, nleaves, esf_nleaves, *new_ilocal, minleaf, maxleaf, maxlocal;
1263   const PetscInt    *ilocal;
1264   signed char       *rootdata, *leafdata, *leafmem;
1265   const PetscSFNode *iremote;
1266   PetscSFNode       *new_iremote;
1267   MPI_Comm           comm;
1268 
1269   PetscFunctionBegin;
1270   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1271   PetscSFCheckGraphSet(sf, 1);
1272   if (nselected) PetscValidIntPointer(selected, 3);
1273   PetscValidPointer(esf, 4);
1274 
1275   PetscCall(PetscSFSetUp(sf));
1276   PetscCall(PetscLogEventBegin(PETSCSF_EmbedSF, sf, 0, 0, 0));
1277   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1278   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
1279 
1280   if (PetscDefined(USE_DEBUG)) { /* Error out if selected[] has dups or  out of range indices */
1281     PetscBool dups;
1282     PetscCall(PetscCheckDupsInt(nselected, selected, &dups));
1283     PetscCheck(!dups, comm, PETSC_ERR_ARG_WRONG, "selected[] has dups");
1284     for (i = 0; i < nselected; i++) PetscCheck(selected[i] >= 0 && selected[i] < nroots, comm, PETSC_ERR_ARG_OUTOFRANGE, "selected root indice %" PetscInt_FMT " is out of [0,%" PetscInt_FMT ")", selected[i], nroots);
1285   }
1286 
1287   if (sf->ops->CreateEmbeddedRootSF) PetscUseTypeMethod(sf, CreateEmbeddedRootSF, nselected, selected, esf);
1288   else {
1289     /* A generic version of creating embedded sf */
1290     PetscCall(PetscSFGetLeafRange(sf, &minleaf, &maxleaf));
1291     maxlocal = maxleaf - minleaf + 1;
1292     PetscCall(PetscCalloc2(nroots, &rootdata, maxlocal, &leafmem));
1293     leafdata = leafmem - minleaf;
1294     /* Tag selected roots and bcast to leaves */
1295     for (i = 0; i < nselected; i++) rootdata[selected[i]] = 1;
1296     PetscCall(PetscSFBcastBegin(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
1297     PetscCall(PetscSFBcastEnd(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
1298 
1299     /* Build esf with leaves that are still connected */
1300     esf_nleaves = 0;
1301     for (i = 0; i < nleaves; i++) {
1302       j = ilocal ? ilocal[i] : i;
1303       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1304          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1305       */
1306       esf_nleaves += (leafdata[j] ? 1 : 0);
1307     }
1308     PetscCall(PetscMalloc1(esf_nleaves, &new_ilocal));
1309     PetscCall(PetscMalloc1(esf_nleaves, &new_iremote));
1310     for (i = n = 0; i < nleaves; i++) {
1311       j = ilocal ? ilocal[i] : i;
1312       if (leafdata[j]) {
1313         new_ilocal[n]        = j;
1314         new_iremote[n].rank  = iremote[i].rank;
1315         new_iremote[n].index = iremote[i].index;
1316         ++n;
1317       }
1318     }
1319     PetscCall(PetscSFCreate(comm, esf));
1320     PetscCall(PetscSFSetFromOptions(*esf));
1321     PetscCall(PetscSFSetGraph(*esf, nroots, esf_nleaves, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1322     PetscCall(PetscFree2(rootdata, leafmem));
1323   }
1324   PetscCall(PetscLogEventEnd(PETSCSF_EmbedSF, sf, 0, 0, 0));
1325   PetscFunctionReturn(0);
1326 }
1327 
1328 /*@C
1329   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1330 
1331   Collective
1332 
1333   Input Parameters:
1334 + sf - original star forest
1335 . nselected  - number of selected leaves on this process
1336 - selected   - indices of the selected leaves on this process
1337 
1338   Output Parameter:
1339 .  newsf - new star forest
1340 
1341   Level: advanced
1342 
1343 .seealso: `PetscSF`, `PetscSFCreateEmbeddedRootSF()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1344 @*/
1345 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *newsf)
1346 {
1347   const PetscSFNode *iremote;
1348   PetscSFNode       *new_iremote;
1349   const PetscInt    *ilocal;
1350   PetscInt           i, nroots, *leaves, *new_ilocal;
1351   MPI_Comm           comm;
1352 
1353   PetscFunctionBegin;
1354   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1355   PetscSFCheckGraphSet(sf, 1);
1356   if (nselected) PetscValidIntPointer(selected, 3);
1357   PetscValidPointer(newsf, 4);
1358 
1359   /* Uniq selected[] and put results in leaves[] */
1360   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1361   PetscCall(PetscMalloc1(nselected, &leaves));
1362   PetscCall(PetscArraycpy(leaves, selected, nselected));
1363   PetscCall(PetscSortedRemoveDupsInt(&nselected, leaves));
1364   PetscCheck(!nselected || !(leaves[0] < 0 || leaves[nselected - 1] >= sf->nleaves), comm, PETSC_ERR_ARG_OUTOFRANGE, "Min/Max leaf indices %" PetscInt_FMT "/%" PetscInt_FMT " are not in [0,%" PetscInt_FMT ")", leaves[0], leaves[nselected - 1], sf->nleaves);
1365 
1366   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1367   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) PetscUseTypeMethod(sf, CreateEmbeddedLeafSF, nselected, leaves, newsf);
1368   else {
1369     PetscCall(PetscSFGetGraph(sf, &nroots, NULL, &ilocal, &iremote));
1370     PetscCall(PetscMalloc1(nselected, &new_ilocal));
1371     PetscCall(PetscMalloc1(nselected, &new_iremote));
1372     for (i = 0; i < nselected; ++i) {
1373       const PetscInt l     = leaves[i];
1374       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1375       new_iremote[i].rank  = iremote[l].rank;
1376       new_iremote[i].index = iremote[l].index;
1377     }
1378     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, newsf));
1379     PetscCall(PetscSFSetGraph(*newsf, nroots, nselected, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1380   }
1381   PetscCall(PetscFree(leaves));
1382   PetscFunctionReturn(0);
1383 }
1384 
1385 /*@C
1386    PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to `PetscSFBcastEnd()`
1387 
1388    Collective on sf
1389 
1390    Input Parameters:
1391 +  sf - star forest on which to communicate
1392 .  unit - data type associated with each node
1393 .  rootdata - buffer to broadcast
1394 -  op - operation to use for reduction
1395 
1396    Output Parameter:
1397 .  leafdata - buffer to be reduced with values from each leaf's respective root
1398 
1399    Level: intermediate
1400 
1401    Notes:
1402     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1403     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1404     use `PetscSFBcastWithMemTypeBegin()` instead.
1405 
1406 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastWithMemTypeBegin()`
1407 @*/
1408 PetscErrorCode PetscSFBcastBegin(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
1409 {
1410   PetscMemType rootmtype, leafmtype;
1411 
1412   PetscFunctionBegin;
1413   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1414   PetscCall(PetscSFSetUp(sf));
1415   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
1416   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1417   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1418   PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
1419   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
1420   PetscFunctionReturn(0);
1421 }
1422 
1423 /*@C
1424    PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call to `PetscSFBcastEnd()`
1425 
1426    Collective on sf
1427 
1428    Input Parameters:
1429 +  sf - star forest on which to communicate
1430 .  unit - data type associated with each node
1431 .  rootmtype - memory type of rootdata
1432 .  rootdata - buffer to broadcast
1433 .  leafmtype - memory type of leafdata
1434 -  op - operation to use for reduction
1435 
1436    Output Parameter:
1437 .  leafdata - buffer to be reduced with values from each leaf's respective root
1438 
1439    Level: intermediate
1440 
1441 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastBegin()`
1442 @*/
PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
  /* Caller supplies the memory types, skipping the (possibly costly) PetscGetMemType() detection of PetscSFBcastBegin() */
  PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
  if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1453 
1454 /*@C
1455    PetscSFBcastEnd - end a broadcast & reduce operation started with `PetscSFBcastBegin()`
1456 
1457    Collective
1458 
1459    Input Parameters:
1460 +  sf - star forest
1461 .  unit - data type
1462 .  rootdata - buffer to broadcast
1463 -  op - operation to use for reduction
1464 
1465    Output Parameter:
1466 .  leafdata - buffer to be reduced with values from each leaf's respective root
1467 
1468    Level: intermediate
1469 
1470 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFReduceEnd()`
1471 @*/
PetscErrorCode PetscSFBcastEnd(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastEnd, sf, 0, 0, 0));
  /* Completes the communication initiated by PetscSFBcastBegin()/PetscSFBcastWithMemTypeBegin(); arguments must match that call */
  PetscUseTypeMethod(sf, BcastEnd, unit, rootdata, leafdata, op);
  if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastEnd, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1481 
1482 /*@C
1483    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to `PetscSFReduceEnd()`
1484 
1485    Collective
1486 
1487    Input Parameters:
1488 +  sf - star forest
1489 .  unit - data type
1490 .  leafdata - values to reduce
1491 -  op - reduction operation
1492 
1493    Output Parameter:
1494 .  rootdata - result of reduction of values from all leaves of each root
1495 
1496    Level: intermediate
1497 
1498    Notes:
1499     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1500     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1501     use `PetscSFReduceWithMemTypeBegin()` instead.
1502 
1503 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceWithMemTypeBegin()`
1504 @*/
1505 PetscErrorCode PetscSFReduceBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1506 {
1507   PetscMemType rootmtype, leafmtype;
1508 
1509   PetscFunctionBegin;
1510   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1511   PetscCall(PetscSFSetUp(sf));
1512   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1513   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1514   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1515   PetscCall((sf->ops->ReduceBegin)(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1516   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1517   PetscFunctionReturn(0);
1518 }
1519 
1520 /*@C
1521    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to `PetscSFReduceEnd()`
1522 
1523    Collective
1524 
1525    Input Parameters:
1526 +  sf - star forest
1527 .  unit - data type
1528 .  leafmtype - memory type of leafdata
1529 .  leafdata - values to reduce
1530 .  rootmtype - memory type of rootdata
1531 -  op - reduction operation
1532 
1533    Output Parameter:
1534 .  rootdata - result of reduction of values from all leaves of each root
1535 
1536    Level: intermediate
1537 
1538 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceBegin()`
1539 @*/
1540 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
1541 {
1542   PetscFunctionBegin;
1543   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1544   PetscCall(PetscSFSetUp(sf));
1545   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1546   PetscCall((sf->ops->ReduceBegin)(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1547   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1548   PetscFunctionReturn(0);
1549 }
1550 
/*@C
   PetscSFReduceEnd - end a reduction operation started with `PetscSFReduceBegin()`

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
.  leafdata - values to reduce
-  op - reduction operation

   Output Parameter:
.  rootdata - result of reduction of values from all leaves of each root

   Level: intermediate

.seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFBcastEnd()`
@*/
PetscErrorCode PetscSFReduceEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* Event logging is suppressed when sf->vscat.logging is set -- presumably because a
     VecScatter-level event already covers this call; confirm against the vscat code */
  if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceEnd, sf, 0, 0, 0));
  /* Complete the communication started by the matching PetscSFReduceBegin() */
  PetscUseTypeMethod(sf, ReduceEnd, unit, leafdata, rootdata, op);
  if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceEnd, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1578 
1579 /*@C
1580    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value,
1581    to be completed with `PetscSFFetchAndOpEnd()`
1582 
1583    Collective
1584 
1585    Input Parameters:
1586 +  sf - star forest
1587 .  unit - data type
1588 .  leafdata - leaf values to use in reduction
1589 -  op - operation to use for reduction
1590 
1591    Output Parameters:
1592 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1593 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1594 
1595    Level: advanced
1596 
1597    Note:
1598    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1599    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1600    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1601    integers.
1602 
1603 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`
1604 @*/
1605 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1606 {
1607   PetscMemType rootmtype, leafmtype, leafupdatemtype;
1608 
1609   PetscFunctionBegin;
1610   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1611   PetscCall(PetscSFSetUp(sf));
1612   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1613   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1614   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1615   PetscCall(PetscGetMemType(leafupdate, &leafupdatemtype));
1616   PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
1617   PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
1618   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1619   PetscFunctionReturn(0);
1620 }
1621 
/*@C
   PetscSFFetchAndOpWithMemTypeBegin - begin operation with explicit memory types that fetches values from root and updates atomically by
   applying operation using my leaf value, to be completed with `PetscSFFetchAndOpEnd()`

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
.  rootmtype - memory type of rootdata
.  leafmtype - memory type of leafdata
.  leafdata - leaf values to use in reduction
.  leafupdatemtype - memory type of leafupdate
-  op - operation to use for reduction

   Output Parameters:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

   Note:
   See `PetscSFFetchAndOpBegin()` for more details.

.seealso: `PetscSF`, `PetscSFFetchAndOpBegin()`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`
@*/
PetscErrorCode PetscSFFetchAndOpWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, PetscMemType leafupdatemtype, void *leafupdate, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
  /* The implementations only support leafdata and leafupdate residing in the same memory space */
  PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
  /* Caller-supplied memory types are used directly; no PetscGetMemType() detection here */
  PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
  PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1659 
/*@C
   PetscSFFetchAndOpEnd - end operation started in matching call to `PetscSFFetchAndOpBegin()` to fetch values from roots and update atomically by applying operation using my leaf value

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Parameters:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

.seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFReduceEnd()`, `PetscSFSetGraph()`
@*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
  /* Complete the communication started by the matching PetscSFFetchAndOpBegin() */
  PetscUseTypeMethod(sf, FetchAndOpEnd, unit, rootdata, leafdata, leafupdate, op);
  PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
  PetscFunctionReturn(0);
}
1688 
1689 /*@C
1690    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with `PetscSFComputeDegreeEnd()`
1691 
1692    Collective
1693 
1694    Input Parameter:
1695 .  sf - star forest
1696 
1697    Output Parameter:
1698 .  degree - degree of each root vertex
1699 
1700    Level: advanced
1701 
1702    Note:
1703    The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence no need to call `PetscFree()` on it.
1704 
1705 .seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeEnd()`
1706 @*/
1707 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf, const PetscInt **degree)
1708 {
1709   PetscFunctionBegin;
1710   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1711   PetscSFCheckGraphSet(sf, 1);
1712   PetscValidPointer(degree, 2);
1713   if (!sf->degreeknown) {
1714     PetscInt i, nroots = sf->nroots, maxlocal;
1715     PetscCheck(!sf->degree, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1716     maxlocal = sf->maxleaf - sf->minleaf + 1;
1717     PetscCall(PetscMalloc1(nroots, &sf->degree));
1718     PetscCall(PetscMalloc1(PetscMax(maxlocal, 1), &sf->degreetmp)); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1719     for (i = 0; i < nroots; i++) sf->degree[i] = 0;
1720     for (i = 0; i < maxlocal; i++) sf->degreetmp[i] = 1;
1721     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
1722   }
1723   *degree = NULL;
1724   PetscFunctionReturn(0);
1725 }
1726 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with `PetscSFComputeDegreeBegin()`

   Collective

   Input Parameter:
.  sf - star forest

   Output Parameter:
.  degree - degree of each root vertex

   Level: developer

   Note:
   The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence no need to call `PetscFree()` on it.

.seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeBegin()`
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf, const PetscInt **degree)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  PetscValidPointer(degree, 2);
  if (!sf->degreeknown) {
    PetscCheck(sf->degreetmp, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    /* Finish the MPI_SUM reduction of per-leaf ones into sf->degree */
    PetscCall(PetscSFReduceEnd(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
    /* The temporary leaf buffer is only needed for the reduction; the result is cached */
    PetscCall(PetscFree(sf->degreetmp));
    sf->degreeknown = PETSC_TRUE;
  }
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1760 
1761 /*@C
1762    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by `PetscSFGetMultiSF()`).
1763    Each multi-root is assigned index of the corresponding original root.
1764 
1765    Collective
1766 
1767    Input Parameters:
1768 +  sf - star forest
1769 -  degree - degree of each root vertex, computed with `PetscSFComputeDegreeBegin()`/`PetscSFComputeDegreeEnd()`
1770 
1771    Output Parameters:
1772 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1773 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1774 
1775    Level: developer
1776 
1777    Note:
1778    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with `PetscFree()` when no longer needed.
1779 
1780 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFComputeDegreeEnd()`, `PetscSFGetMultiSF()`
1781 @*/
1782 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1783 {
1784   PetscSF  msf;
1785   PetscInt i, j, k, nroots, nmroots;
1786 
1787   PetscFunctionBegin;
1788   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1789   PetscCall(PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL));
1790   if (nroots) PetscValidIntPointer(degree, 2);
1791   if (nMultiRoots) PetscValidIntPointer(nMultiRoots, 3);
1792   PetscValidPointer(multiRootsOrigNumbering, 4);
1793   PetscCall(PetscSFGetMultiSF(sf, &msf));
1794   PetscCall(PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL));
1795   PetscCall(PetscMalloc1(nmroots, multiRootsOrigNumbering));
1796   for (i = 0, j = 0, k = 0; i < nroots; i++) {
1797     if (!degree[i]) continue;
1798     for (j = 0; j < degree[i]; j++, k++) (*multiRootsOrigNumbering)[k] = i;
1799   }
1800   PetscCheck(k == nmroots, PETSC_COMM_SELF, PETSC_ERR_PLIB, "sanity check fail");
1801   if (nMultiRoots) *nMultiRoots = nmroots;
1802   PetscFunctionReturn(0);
1803 }
1804 
/*@C
   PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with `PetscSFGatherEnd()`

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
-  leafdata - leaf data to gather to roots

   Output Parameter:
.  multirootdata - root buffer to gather into, amount of space per root is equal to its degree

   Level: intermediate

.seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterBegin()`
@*/
PetscErrorCode PetscSFGatherBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
{
  PetscSF multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  /* Gather is a reduce with MPI_REPLACE over the multi-SF: each multi-root has exactly
     one leaf, so every leaf value lands in its own slot without being combined */
  PetscCall(PetscSFGetMultiSF(sf, &multi));
  PetscCall(PetscSFReduceBegin(multi, unit, leafdata, multirootdata, MPI_REPLACE));
  PetscFunctionReturn(0);
}
1833 
/*@C
   PetscSFGatherEnd - ends pointwise gather operation that was started with `PetscSFGatherBegin()`

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
-  leafdata - leaf data to gather to roots

   Output Parameter:
.  multirootdata - root buffer to gather into, amount of space per root is equal to its degree

   Level: intermediate

.seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterEnd()`
@*/
PetscErrorCode PetscSFGatherEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
{
  PetscSF multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* Complete the MPI_REPLACE reduce over the multi-SF started in PetscSFGatherBegin() */
  PetscCall(PetscSFGetMultiSF(sf, &multi));
  PetscCall(PetscSFReduceEnd(multi, unit, leafdata, multirootdata, MPI_REPLACE));
  PetscFunctionReturn(0);
}
1861 
/*@C
   PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with `PetscSFScatterEnd()`

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Parameter:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterEnd()`
@*/
PetscErrorCode PetscSFScatterBegin(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
{
  PetscSF multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  /* Scatter is a broadcast with MPI_REPLACE over the multi-SF, so each leaf receives
     its own personal slot of the multi-root buffer */
  PetscCall(PetscSFGetMultiSF(sf, &multi));
  PetscCall(PetscSFBcastBegin(multi, unit, multirootdata, leafdata, MPI_REPLACE));
  PetscFunctionReturn(0);
}
1890 
/*@C
   PetscSFScatterEnd - ends pointwise scatter operation that was started with `PetscSFScatterBegin()`

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Parameter:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterBegin()`
@*/
PetscErrorCode PetscSFScatterEnd(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
{
  PetscSF multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* Complete the MPI_REPLACE broadcast over the multi-SF started in PetscSFScatterBegin() */
  PetscCall(PetscSFGetMultiSF(sf, &multi));
  PetscCall(PetscSFBcastEnd(multi, unit, multirootdata, leafdata, MPI_REPLACE));
  PetscFunctionReturn(0);
}
1918 
1919 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1920 {
1921   PetscInt        i, n, nleaves;
1922   const PetscInt *ilocal = NULL;
1923   PetscHSetI      seen;
1924 
1925   PetscFunctionBegin;
1926   if (PetscDefined(USE_DEBUG)) {
1927     PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, NULL));
1928     PetscCall(PetscHSetICreate(&seen));
1929     for (i = 0; i < nleaves; i++) {
1930       const PetscInt leaf = ilocal ? ilocal[i] : i;
1931       PetscCall(PetscHSetIAdd(seen, leaf));
1932     }
1933     PetscCall(PetscHSetIGetSize(seen, &n));
1934     PetscCheck(n == nleaves, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided leaves have repeated values: all leaves must be unique");
1935     PetscCall(PetscHSetIDestroy(&seen));
1936   }
1937   PetscFunctionReturn(0);
1938 }
1939 
1940 /*@
1941   PetscSFCompose - Compose a new `PetscSF` by putting the second `PetscSF` under the first one in a top (roots) down (leaves) view
1942 
1943   Input Parameters:
1944 + sfA - The first `PetscSF`
1945 - sfB - The second `PetscSF`
1946 
1947   Output Parameters:
1948 . sfBA - The composite `PetscSF`
1949 
1950   Level: developer
1951 
1952   Notes:
1953   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
1954   forests, i.e. the same leaf is not connected with different roots.
1955 
1956   sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
1957   a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
1958   nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
1959   Bcast on sfA, then a Bcast on sfB, on connected nodes.
1960 
1961 .seealso: `PetscSF`, `PetscSFComposeInverse()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`
1962 @*/
1963 PetscErrorCode PetscSFCompose(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
1964 {
1965   const PetscSFNode *remotePointsA, *remotePointsB;
1966   PetscSFNode       *remotePointsBA = NULL, *reorderedRemotePointsA = NULL, *leafdataB;
1967   const PetscInt    *localPointsA, *localPointsB;
1968   PetscInt          *localPointsBA;
1969   PetscInt           i, numRootsA, numLeavesA, numRootsB, numLeavesB, minleaf, maxleaf, numLeavesBA;
1970   PetscBool          denseB;
1971 
1972   PetscFunctionBegin;
1973   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
1974   PetscSFCheckGraphSet(sfA, 1);
1975   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
1976   PetscSFCheckGraphSet(sfB, 2);
1977   PetscCheckSameComm(sfA, 1, sfB, 2);
1978   PetscValidPointer(sfBA, 3);
1979   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
1980   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
1981 
1982   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
1983   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
1984   /* Make sure that PetscSFBcast{Begin, End}(sfB, ...) works with root data of size       */
1985   /* numRootsB; otherwise, garbage will be broadcasted.                                   */
1986   /* Example (comm size = 1):                                                             */
1987   /* sfA: 0 <- (0, 0)                                                                     */
1988   /* sfB: 100 <- (0, 0)                                                                   */
1989   /*      101 <- (0, 1)                                                                   */
1990   /* Here, we have remotePointsA = [(0, 0)], but for remotePointsA to be a valid tartget  */
1991   /* of sfB, it has to be recasted as [(0, 0), (-1, -1)] so that points 100 and 101 would */
1992   /* receive (0, 0) and (-1, -1), respectively, when PetscSFBcast(sfB, ...) is called on  */
1993   /* remotePointsA; if not recasted, point 101 would receive a garbage value.             */
1994   PetscCall(PetscMalloc1(numRootsB, &reorderedRemotePointsA));
1995   for (i = 0; i < numRootsB; i++) {
1996     reorderedRemotePointsA[i].rank  = -1;
1997     reorderedRemotePointsA[i].index = -1;
1998   }
1999   for (i = 0; i < numLeavesA; i++) {
2000     PetscInt localp = localPointsA ? localPointsA[i] : i;
2001 
2002     if (localp >= numRootsB) continue;
2003     reorderedRemotePointsA[localp] = remotePointsA[i];
2004   }
2005   remotePointsA = reorderedRemotePointsA;
2006   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2007   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &leafdataB));
2008   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2009     leafdataB[i].rank  = -1;
2010     leafdataB[i].index = -1;
2011   }
2012   PetscCall(PetscSFBcastBegin(sfB, MPIU_2INT, remotePointsA, leafdataB - minleaf, MPI_REPLACE));
2013   PetscCall(PetscSFBcastEnd(sfB, MPIU_2INT, remotePointsA, leafdataB - minleaf, MPI_REPLACE));
2014   PetscCall(PetscFree(reorderedRemotePointsA));
2015 
2016   denseB = (PetscBool)!localPointsB;
2017   for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2018     if (leafdataB[localPointsB ? localPointsB[i] - minleaf : i].rank == -1) denseB = PETSC_FALSE;
2019     else numLeavesBA++;
2020   }
2021   if (denseB) {
2022     localPointsBA  = NULL;
2023     remotePointsBA = leafdataB;
2024   } else {
2025     PetscCall(PetscMalloc1(numLeavesBA, &localPointsBA));
2026     PetscCall(PetscMalloc1(numLeavesBA, &remotePointsBA));
2027     for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2028       const PetscInt l = localPointsB ? localPointsB[i] : i;
2029 
2030       if (leafdataB[l - minleaf].rank == -1) continue;
2031       remotePointsBA[numLeavesBA] = leafdataB[l - minleaf];
2032       localPointsBA[numLeavesBA]  = l;
2033       numLeavesBA++;
2034     }
2035     PetscCall(PetscFree(leafdataB));
2036   }
2037   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2038   PetscCall(PetscSFSetFromOptions(*sfBA));
2039   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2040   PetscFunctionReturn(0);
2041 }
2042 
2043 /*@
2044   PetscSFComposeInverse - Compose a new `PetscSF` by putting the inverse of the second `PetscSF` under the first one
2045 
2046   Input Parameters:
2047 + sfA - The first `PetscSF`
2048 - sfB - The second `PetscSF`
2049 
2050   Output Parameters:
2051 . sfBA - The composite `PetscSF`.
2052 
2053   Level: developer
2054 
2055   Notes:
2056   Currently, the two SFs must be defined on congruent communicators and they must be true star
2057   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
2058   second SF must have a degree of 1, i.e., no roots have more than one leaf connected.
2059 
2060   sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
2061   a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
2062   roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
2063   a Reduce on sfB, on connected roots.
2064 
2065 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFCreateInverseSF()`
2066 @*/
2067 PetscErrorCode PetscSFComposeInverse(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
2068 {
2069   const PetscSFNode *remotePointsA, *remotePointsB;
2070   PetscSFNode       *remotePointsBA;
2071   const PetscInt    *localPointsA, *localPointsB;
2072   PetscSFNode       *reorderedRemotePointsA = NULL;
2073   PetscInt           i, numRootsA, numLeavesA, numLeavesBA, numRootsB, numLeavesB, minleaf, maxleaf, *localPointsBA;
2074   MPI_Op             op;
2075 #if defined(PETSC_USE_64BIT_INDICES)
2076   PetscBool iswin;
2077 #endif
2078 
2079   PetscFunctionBegin;
2080   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
2081   PetscSFCheckGraphSet(sfA, 1);
2082   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
2083   PetscSFCheckGraphSet(sfB, 2);
2084   PetscCheckSameComm(sfA, 1, sfB, 2);
2085   PetscValidPointer(sfBA, 3);
2086   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
2087   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
2088 
2089   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
2090   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
2091 
2092   /* TODO: Check roots of sfB have degree of 1 */
2093   /* Once we implement it, we can replace the MPI_MAXLOC
2094      with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
2095      We use MPI_MAXLOC only to have a deterministic output from this routine if
2096      the root condition is not meet.
2097    */
2098   op = MPI_MAXLOC;
2099 #if defined(PETSC_USE_64BIT_INDICES)
2100   /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
2101   PetscCall(PetscObjectTypeCompare((PetscObject)sfB, PETSCSFWINDOW, &iswin));
2102   if (iswin) op = MPI_REPLACE;
2103 #endif
2104 
2105   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2106   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &reorderedRemotePointsA));
2107   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2108     reorderedRemotePointsA[i].rank  = -1;
2109     reorderedRemotePointsA[i].index = -1;
2110   }
2111   if (localPointsA) {
2112     for (i = 0; i < numLeavesA; i++) {
2113       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
2114       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
2115     }
2116   } else {
2117     for (i = 0; i < numLeavesA; i++) {
2118       if (i > maxleaf || i < minleaf) continue;
2119       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
2120     }
2121   }
2122 
2123   PetscCall(PetscMalloc1(numRootsB, &localPointsBA));
2124   PetscCall(PetscMalloc1(numRootsB, &remotePointsBA));
2125   for (i = 0; i < numRootsB; i++) {
2126     remotePointsBA[i].rank  = -1;
2127     remotePointsBA[i].index = -1;
2128   }
2129 
2130   PetscCall(PetscSFReduceBegin(sfB, MPIU_2INT, reorderedRemotePointsA - minleaf, remotePointsBA, op));
2131   PetscCall(PetscSFReduceEnd(sfB, MPIU_2INT, reorderedRemotePointsA - minleaf, remotePointsBA, op));
2132   PetscCall(PetscFree(reorderedRemotePointsA));
2133   for (i = 0, numLeavesBA = 0; i < numRootsB; i++) {
2134     if (remotePointsBA[i].rank == -1) continue;
2135     remotePointsBA[numLeavesBA].rank  = remotePointsBA[i].rank;
2136     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
2137     localPointsBA[numLeavesBA]        = i;
2138     numLeavesBA++;
2139   }
2140   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2141   PetscCall(PetscSFSetFromOptions(*sfBA));
2142   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2143   PetscFunctionReturn(0);
2144 }
2145 
2146 /*
2147   PetscSFCreateLocalSF_Private - Creates a local `PetscSF` that only has intra-process edges of the global `PetscSF`
2148 
2149   Input Parameters:
2150 . sf - The global `PetscSF`
2151 
2152   Output Parameters:
2153 . out - The local `PetscSF`
2154 
2155 .seealso: `PetscSF`, `PetscSFCreate()`
2156  */
2157 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf, PetscSF *out)
2158 {
2159   MPI_Comm           comm;
2160   PetscMPIInt        myrank;
2161   const PetscInt    *ilocal;
2162   const PetscSFNode *iremote;
2163   PetscInt           i, j, nroots, nleaves, lnleaves, *lilocal;
2164   PetscSFNode       *liremote;
2165   PetscSF            lsf;
2166 
2167   PetscFunctionBegin;
2168   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2169   if (sf->ops->CreateLocalSF) PetscUseTypeMethod(sf, CreateLocalSF, out);
2170   else {
2171     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2172     PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
2173     PetscCallMPI(MPI_Comm_rank(comm, &myrank));
2174 
2175     /* Find out local edges and build a local SF */
2176     PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
2177     for (i = lnleaves = 0; i < nleaves; i++) {
2178       if (iremote[i].rank == (PetscInt)myrank) lnleaves++;
2179     }
2180     PetscCall(PetscMalloc1(lnleaves, &lilocal));
2181     PetscCall(PetscMalloc1(lnleaves, &liremote));
2182 
2183     for (i = j = 0; i < nleaves; i++) {
2184       if (iremote[i].rank == (PetscInt)myrank) {
2185         lilocal[j]        = ilocal ? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2186         liremote[j].rank  = 0;                      /* rank in PETSC_COMM_SELF */
2187         liremote[j].index = iremote[i].index;
2188         j++;
2189       }
2190     }
2191     PetscCall(PetscSFCreate(PETSC_COMM_SELF, &lsf));
2192     PetscCall(PetscSFSetFromOptions(lsf));
2193     PetscCall(PetscSFSetGraph(lsf, nroots, lnleaves, lilocal, PETSC_OWN_POINTER, liremote, PETSC_OWN_POINTER));
2194     PetscCall(PetscSFSetUp(lsf));
2195     *out = lsf;
2196   }
2197   PetscFunctionReturn(0);
2198 }
2199 
2200 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2201 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata)
2202 {
2203   PetscMemType rootmtype, leafmtype;
2204 
2205   PetscFunctionBegin;
2206   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2207   PetscCall(PetscSFSetUp(sf));
2208   PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
2209   PetscCall(PetscGetMemType(rootdata, &rootmtype));
2210   PetscCall(PetscGetMemType(leafdata, &leafmtype));
2211   PetscUseTypeMethod(sf, BcastToZero, unit, rootmtype, rootdata, leafmtype, leafdata);
2212   PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
2213   PetscFunctionReturn(0);
2214 }
2215 
2216 /*@
2217   PetscSFConcatenate - concatenate multiple `PetscSF` into one
2218 
2219   Input Parameters:
2220 + comm - the communicator
2221 . nsfs - the number of input `PetscSF`
2222 . sfs  - the array of input `PetscSF`
2223 . rootMode - the root mode specifying how roots are handled
2224 - leafOffsets - the array of local leaf offsets, one for each input `PetscSF`, or NULL for contiguous storage
2225 
  Output Parameter:
. newsf - The resulting `PetscSF`
2228 
2229   Level: advanced
2230 
2231   Notes:
2232   The communicator of all SFs in sfs must be comm.
2233 
2234   Leaves are always concatenated locally, keeping them ordered by the input SF index and original local order.
2235   The offsets in leafOffsets are added to the original leaf indices.
2236   If all input SFs use contiguous leaf storage (ilocal = NULL), leafOffsets can be passed as NULL as well.
2237   In this case, NULL is also passed as ilocal to the resulting SF.
2238   If any input SF has non-null ilocal, leafOffsets is needed to distinguish leaves from different input SFs.
2239   In this case, user is responsible to provide correct offsets so that the resulting leaves are unique (otherwise an error occurs).
2240 
2241   All root modes retain the essential connectivity condition:
2242   If two leaves of the same input SF are connected (sharing the same root), they are also connected in the output SF.
2243   Parameter rootMode controls how the input root spaces are combined.
2244   For `PETSCSF_CONCATENATE_ROOTMODE_SHARED`, the root space is considered the same for each input SF (checked in debug mode) and is also the same in the output SF.
2245   For `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, the input root spaces are taken as separate and joined.
2246   `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` joins the root spaces locally;
2247   roots of sfs[0], sfs[1], sfs[2], ... are joined on each rank separately, ordered by input SF and original local index, and renumbered contiguously.
2248   `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL` joins the root spaces globally;
  roots of sfs[0], sfs[1], sfs[2], ... are joined globally, ordered by input SF index and original global index, and renumbered contiguously;
2250   the original root ranks are ignored.
2251   For both `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`,
2252   the output SF's root layout is such that the local number of roots is a sum of the input SF's local numbers of roots on each rank to keep the load balancing.
2253   However, for `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, that means roots can move to different ranks.
2254 
2255    Example:
2256    We can use src/vec/is/sf/tests/ex18.c to compare the root modes. By running
2257 $ make -C $PETSC_DIR/src/vec/is/sf/tests ex18
2258 $ for m in {local,global,shared}; do
2259 $   mpirun -n 2 $PETSC_DIR/src/vec/is/sf/tests/ex18 -nsfs 2 -n 2 -root_mode $m -sf_view
2260 $ done
2261    we generate two identical SFs sf_0 and sf_1,
2262 $ PetscSF Object: sf_0 2 MPI processes
2263 $   type: basic
2264 $   rank #leaves #roots
2265 $   [ 0]       4      2
2266 $   [ 1]       4      2
2267 $   leaves      roots       roots in global numbering
2268 $   ( 0,  0) <- ( 0,  0)  =   0
2269 $   ( 0,  1) <- ( 0,  1)  =   1
2270 $   ( 0,  2) <- ( 1,  0)  =   2
2271 $   ( 0,  3) <- ( 1,  1)  =   3
2272 $   ( 1,  0) <- ( 0,  0)  =   0
2273 $   ( 1,  1) <- ( 0,  1)  =   1
2274 $   ( 1,  2) <- ( 1,  0)  =   2
2275 $   ( 1,  3) <- ( 1,  1)  =   3
2276    and pass them to `PetscSFConcatenate()` along with different choices of rootMode, yielding different result_sf:
2277 $ rootMode = local:
2278 $ PetscSF Object: result_sf 2 MPI processes
2279 $   type: basic
2280 $   rank #leaves #roots
2281 $   [ 0]       8      4
2282 $   [ 1]       8      4
2283 $   leaves      roots       roots in global numbering
2284 $   ( 0,  0) <- ( 0,  0)  =   0
2285 $   ( 0,  1) <- ( 0,  1)  =   1
2286 $   ( 0,  2) <- ( 1,  0)  =   4
2287 $   ( 0,  3) <- ( 1,  1)  =   5
2288 $   ( 0,  4) <- ( 0,  2)  =   2
2289 $   ( 0,  5) <- ( 0,  3)  =   3
2290 $   ( 0,  6) <- ( 1,  2)  =   6
2291 $   ( 0,  7) <- ( 1,  3)  =   7
2292 $   ( 1,  0) <- ( 0,  0)  =   0
2293 $   ( 1,  1) <- ( 0,  1)  =   1
2294 $   ( 1,  2) <- ( 1,  0)  =   4
2295 $   ( 1,  3) <- ( 1,  1)  =   5
2296 $   ( 1,  4) <- ( 0,  2)  =   2
2297 $   ( 1,  5) <- ( 0,  3)  =   3
2298 $   ( 1,  6) <- ( 1,  2)  =   6
2299 $   ( 1,  7) <- ( 1,  3)  =   7
2300 $
2301 $ rootMode = global:
2302 $ PetscSF Object: result_sf 2 MPI processes
2303 $   type: basic
2304 $   rank #leaves #roots
2305 $   [ 0]       8      4
2306 $   [ 1]       8      4
2307 $   leaves      roots       roots in global numbering
2308 $   ( 0,  0) <- ( 0,  0)  =   0
2309 $   ( 0,  1) <- ( 0,  1)  =   1
2310 $   ( 0,  2) <- ( 0,  2)  =   2
2311 $   ( 0,  3) <- ( 0,  3)  =   3
2312 $   ( 0,  4) <- ( 1,  0)  =   4
2313 $   ( 0,  5) <- ( 1,  1)  =   5
2314 $   ( 0,  6) <- ( 1,  2)  =   6
2315 $   ( 0,  7) <- ( 1,  3)  =   7
2316 $   ( 1,  0) <- ( 0,  0)  =   0
2317 $   ( 1,  1) <- ( 0,  1)  =   1
2318 $   ( 1,  2) <- ( 0,  2)  =   2
2319 $   ( 1,  3) <- ( 0,  3)  =   3
2320 $   ( 1,  4) <- ( 1,  0)  =   4
2321 $   ( 1,  5) <- ( 1,  1)  =   5
2322 $   ( 1,  6) <- ( 1,  2)  =   6
2323 $   ( 1,  7) <- ( 1,  3)  =   7
2324 $
2325 $ rootMode = shared:
2326 $ PetscSF Object: result_sf 2 MPI processes
2327 $   type: basic
2328 $   rank #leaves #roots
2329 $   [ 0]       8      2
2330 $   [ 1]       8      2
2331 $   leaves      roots       roots in global numbering
2332 $   ( 0,  0) <- ( 0,  0)  =   0
2333 $   ( 0,  1) <- ( 0,  1)  =   1
2334 $   ( 0,  2) <- ( 1,  0)  =   2
2335 $   ( 0,  3) <- ( 1,  1)  =   3
2336 $   ( 0,  4) <- ( 0,  0)  =   0
2337 $   ( 0,  5) <- ( 0,  1)  =   1
2338 $   ( 0,  6) <- ( 1,  0)  =   2
2339 $   ( 0,  7) <- ( 1,  1)  =   3
2340 $   ( 1,  0) <- ( 0,  0)  =   0
2341 $   ( 1,  1) <- ( 0,  1)  =   1
2342 $   ( 1,  2) <- ( 1,  0)  =   2
2343 $   ( 1,  3) <- ( 1,  1)  =   3
2344 $   ( 1,  4) <- ( 0,  0)  =   0
2345 $   ( 1,  5) <- ( 0,  1)  =   1
2346 $   ( 1,  6) <- ( 1,  0)  =   2
2347 $   ( 1,  7) <- ( 1,  1)  =   3
2348 
2349 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFConcatenateRootMode`
2350 @*/
PetscErrorCode PetscSFConcatenate(MPI_Comm comm, PetscInt nsfs, PetscSF sfs[], PetscSFConcatenateRootMode rootMode, PetscInt leafOffsets[], PetscSF *newsf)
{
  PetscInt     i, s, nLeaves, nRoots;
  PetscInt    *leafArrayOffsets; /* prefix sums of input SFs' local leaf counts; leaves of sfs[s] occupy [leafArrayOffsets[s], leafArrayOffsets[s+1]) in the concatenated arrays */
  PetscInt    *ilocal_new;       /* concatenated leaf indices for the output SF (stays NULL for contiguous storage) */
  PetscSFNode *iremote_new;      /* concatenated remote roots for the output SF (allocated only for LOCAL/SHARED root modes) */
  PetscBool    all_ilocal_null = PETSC_FALSE;
  PetscLayout  glayout         = NULL; /* output root layout (GLOBAL root mode only) */
  PetscInt    *gremote         = NULL; /* concatenated roots in global numbering (GLOBAL root mode only) */
  PetscMPIInt  rank, size;

  PetscFunctionBegin;
  {
    PetscSF dummy; /* just to have a PetscObject on comm for input validation */

    PetscCall(PetscSFCreate(comm, &dummy));
    PetscValidLogicalCollectiveInt(dummy, nsfs, 2);
    PetscValidPointer(sfs, 3);
    for (i = 0; i < nsfs; i++) {
      PetscValidHeaderSpecific(sfs[i], PETSCSF_CLASSID, 3);
      PetscCheckSameComm(dummy, 1, sfs[i], 3);
    }
    PetscValidLogicalCollectiveEnum(dummy, rootMode, 4);
    if (leafOffsets) PetscValidIntPointer(leafOffsets, 5);
    PetscValidPointer(newsf, 6);
    PetscCall(PetscSFDestroy(&dummy));
  }
  /* Concatenating zero SFs yields an empty SF on comm */
  if (!nsfs) {
    PetscCall(PetscSFCreate(comm, newsf));
    PetscCall(PetscSFSetGraph(*newsf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));

  /* Calculate leaf array offsets */
  PetscCall(PetscMalloc1(nsfs + 1, &leafArrayOffsets));
  leafArrayOffsets[0] = 0;
  for (s = 0; s < nsfs; s++) {
    PetscInt nl;

    PetscCall(PetscSFGetGraph(sfs[s], NULL, &nl, NULL, NULL));
    leafArrayOffsets[s + 1] = leafArrayOffsets[s] + nl;
  }
  nLeaves = leafArrayOffsets[nsfs];

  /* Calculate number of roots */
  switch (rootMode) {
  case PETSCSF_CONCATENATE_ROOTMODE_SHARED: {
    /* All inputs share one root space, so the output root count is just sfs[0]'s;
       in debug mode verify all inputs agree on it */
    PetscCall(PetscSFGetGraph(sfs[0], &nRoots, NULL, NULL, NULL));
    if (PetscDefined(USE_DEBUG)) {
      for (s = 1; s < nsfs; s++) {
        PetscInt nr;

        PetscCall(PetscSFGetGraph(sfs[s], &nr, NULL, NULL, NULL));
        PetscCheck(nr == nRoots, comm, PETSC_ERR_ARG_SIZ, "rootMode = %s but sfs[%" PetscInt_FMT "] has a different number of roots (%" PetscInt_FMT ") than sfs[0] (%" PetscInt_FMT ")", PetscSFConcatenateRootModes[rootMode], s, nr, nRoots);
      }
    }
  } break;
  case PETSCSF_CONCATENATE_ROOTMODE_GLOBAL: {
    /* Calculate also global layout in this case */
    PetscInt    *nls;
    PetscLayout *lts;
    PetscInt   **inds;
    PetscInt     j;
    PetscInt     rootOffset = 0;

    PetscCall(PetscCalloc3(nsfs, &lts, nsfs, &nls, nsfs, &inds));
    PetscCall(PetscLayoutCreate(comm, &glayout));
    glayout->bs = 1;
    glayout->n  = 0;
    glayout->N  = 0;
    /* The output layout sums the input layouts rank-wise, preserving load balance */
    for (s = 0; s < nsfs; s++) {
      PetscCall(PetscSFGetGraphLayout(sfs[s], &lts[s], &nls[s], NULL, &inds[s]));
      glayout->n += lts[s]->n;
      glayout->N += lts[s]->N;
    }
    PetscCall(PetscLayoutSetUp(glayout));
    PetscCall(PetscMalloc1(nLeaves, &gremote));
    /* Shift each input SF's global root indices by the global sizes of the preceding SFs */
    for (s = 0, j = 0; s < nsfs; s++) {
      for (i = 0; i < nls[s]; i++, j++) gremote[j] = inds[s][i] + rootOffset;
      rootOffset += lts[s]->N;
      PetscCall(PetscLayoutDestroy(&lts[s]));
      PetscCall(PetscFree(inds[s]));
    }
    PetscCall(PetscFree3(lts, nls, inds));
    nRoots = glayout->N;
  } break;
  case PETSCSF_CONCATENATE_ROOTMODE_LOCAL:
    /* nRoots calculated later in this case */
    break;
  default:
    SETERRQ(comm, PETSC_ERR_ARG_WRONG, "Invalid PetscSFConcatenateRootMode %d", rootMode);
  }

  /* leafOffsets may be omitted only when every input SF uses contiguous leaf storage (ilocal = NULL) */
  if (!leafOffsets) {
    all_ilocal_null = PETSC_TRUE;
    for (s = 0; s < nsfs; s++) {
      const PetscInt *ilocal;

      PetscCall(PetscSFGetGraph(sfs[s], NULL, NULL, &ilocal, NULL));
      if (ilocal) {
        all_ilocal_null = PETSC_FALSE;
        break;
      }
    }
    PetscCheck(all_ilocal_null, PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "leafOffsets can be passed as NULL only if all SFs have ilocal = NULL");
  }

  /* Renumber and concatenate local leaves */
  ilocal_new = NULL;
  if (!all_ilocal_null) {
    /* leafOffsets is guaranteed non-NULL here: all_ilocal_null can only be TRUE when leafOffsets was NULL */
    PetscCall(PetscMalloc1(nLeaves, &ilocal_new));
    for (i = 0; i < nLeaves; i++) ilocal_new[i] = -1; /* sentinel; every entry is overwritten below */
    for (s = 0; s < nsfs; s++) {
      const PetscInt *ilocal;
      PetscInt       *ilocal_l = &ilocal_new[leafArrayOffsets[s]];
      PetscInt        i, nleaves_l;

      PetscCall(PetscSFGetGraph(sfs[s], NULL, &nleaves_l, &ilocal, NULL));
      for (i = 0; i < nleaves_l; i++) ilocal_l[i] = (ilocal ? ilocal[i] : i) + leafOffsets[s];
    }
  }

  /* Renumber and concatenate remote roots */
  if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL || rootMode == PETSCSF_CONCATENATE_ROOTMODE_SHARED) {
    PetscInt rootOffset = 0;

    PetscCall(PetscMalloc1(nLeaves, &iremote_new));
    for (i = 0; i < nLeaves; i++) {
      iremote_new[i].rank  = -1;
      iremote_new[i].index = -1;
    }
    for (s = 0; s < nsfs; s++) {
      PetscInt           i, nl, nr;
      PetscSF            tmp_sf;
      const PetscSFNode *iremote;
      PetscSFNode       *tmp_rootdata;
      PetscSFNode       *tmp_leafdata = &iremote_new[leafArrayOffsets[s]];

      PetscCall(PetscSFGetGraph(sfs[s], &nr, &nl, NULL, &iremote));
      PetscCall(PetscSFCreate(comm, &tmp_sf));
      /* create helper SF with contiguous leaves */
      PetscCall(PetscSFSetGraph(tmp_sf, nr, nl, NULL, PETSC_USE_POINTER, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
      PetscCall(PetscSFSetUp(tmp_sf));
      /* Each root broadcasts its new (rank, index) identity to its leaves,
         which lands directly in the right slice of iremote_new */
      PetscCall(PetscMalloc1(nr, &tmp_rootdata));
      if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) {
        /* local mode: roots of each SF are renumbered after the preceding SFs' roots on this rank */
        for (i = 0; i < nr; i++) {
          tmp_rootdata[i].index = i + rootOffset;
          tmp_rootdata[i].rank  = (PetscInt)rank;
        }
        rootOffset += nr;
      } else {
        /* shared mode: roots keep their original indices */
        for (i = 0; i < nr; i++) {
          tmp_rootdata[i].index = i;
          tmp_rootdata[i].rank  = (PetscInt)rank;
        }
      }
      PetscCall(PetscSFBcastBegin(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
      PetscCall(PetscSFDestroy(&tmp_sf));
      PetscCall(PetscFree(tmp_rootdata));
    }
    if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) { nRoots = rootOffset; } // else nRoots already calculated above

    /* Build the new SF */
    PetscCall(PetscSFCreate(comm, newsf));
    PetscCall(PetscSFSetGraph(*newsf, nRoots, nLeaves, ilocal_new, PETSC_OWN_POINTER, iremote_new, PETSC_OWN_POINTER));
  } else {
    /* Build the new SF */
    PetscCall(PetscSFCreate(comm, newsf));
    PetscCall(PetscSFSetGraphLayout(*newsf, glayout, nLeaves, ilocal_new, PETSC_OWN_POINTER, gremote));
  }
  PetscCall(PetscSFSetUp(*newsf));
  PetscCall(PetscSFViewFromOptions(*newsf, NULL, "-sf_concat_view"));
  PetscCall(PetscLayoutDestroy(&glayout));
  PetscCall(PetscFree(gremote));
  PetscCall(PetscFree(leafArrayOffsets));
  PetscFunctionReturn(0);
}
2531