1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petsc/private/viewerimpl.h>
4 #include <petsc/private/hashmapi.h>
5 
6 #if defined(PETSC_HAVE_CUDA)
7   #include <cuda_runtime.h>
8 #endif
9 
10 #if defined(PETSC_HAVE_HIP)
11   #include <hip/hip_runtime.h>
12 #endif
13 
14 #if defined(PETSC_CLANG_STATIC_ANALYZER)
15 void PetscSFCheckGraphSet(PetscSF, int);
16 #else
17   #if defined(PETSC_USE_DEBUG)
18     #define PetscSFCheckGraphSet(sf, arg) PetscCheck((sf)->graphset, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %d \"%s\" before %s()", (arg), #sf, PETSC_FUNCTION_NAME)
19   #else
20     #define PetscSFCheckGraphSet(sf, arg) \
21       do { \
22       } while (0)
23   #endif
24 #endif
25 
26 const char *const PetscSFDuplicateOptions[]     = {"CONFONLY", "RANKS", "GRAPH", "PetscSFDuplicateOption", "PETSCSF_DUPLICATE_", NULL};
27 const char *const PetscSFConcatenateRootModes[] = {"local", "shared", "global", "PetscSFConcatenateRootMode", "PETSCSF_CONCATENATE_ROOTMODE_", NULL};
28 
29 /*@
30    PetscSFCreate - create a star forest communication context
31 
32    Collective
33 
34    Input Parameter:
35 .  comm - communicator on which the star forest will operate
36 
37    Output Parameter:
38 .  sf - new star forest context
39 
40    Options Database Keys:
41 .  -sf_type type - value of type may be
42 .vb
43     basic     -Use MPI persistent Isend/Irecv for communication (Default)
44     window    -Use MPI-3 one-sided window for communication
45     neighbor  -Use MPI-3 neighborhood collectives for communication
46 .ve
47 
48    Level: intermediate
49 
50    Note:
51    When one knows the communication graph is one of the predefined patterns, such as `MPI_Alltoall()`, `MPI_Allgatherv()`, or
52    `MPI_Gatherv()`, one can create a `PetscSF` and then set its graph with `PetscSFSetGraphWithPattern()`. These special
53    `SF`s are optimized and have better performance than general `SF`s.
54 
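   Example Usage:
   The following is only a minimal sketch of the typical call sequence; the graph used here (one root per process and one leaf per
   process, each leaf referencing root 0 on rank 0) is purely illustrative.
.vb
   PetscSF     sf;
   PetscSFNode remote;

   PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf));
   remote.rank  = 0; /* this process's only leaf references root 0 owned by rank 0 */
   remote.index = 0;
   PetscCall(PetscSFSetGraph(sf, 1, 1, NULL, PETSC_COPY_VALUES, &remote, PETSC_COPY_VALUES));
   PetscCall(PetscSFSetFromOptions(sf));
   PetscCall(PetscSFSetUp(sf));
   PetscCall(PetscSFDestroy(&sf));
.ve
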
55 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFSetGraphWithPattern()`, `PetscSFDestroy()`
56 @*/
57 PetscErrorCode PetscSFCreate(MPI_Comm comm, PetscSF *sf)
58 {
59   PetscSF b;
60 
61   PetscFunctionBegin;
62   PetscValidPointer(sf, 2);
63   PetscCall(PetscSFInitializePackage());
64 
65   PetscCall(PetscHeaderCreate(b, PETSCSF_CLASSID, "PetscSF", "Star Forest", "PetscSF", comm, PetscSFDestroy, PetscSFView));
66 
67   b->nroots    = -1;
68   b->nleaves   = -1;
69   b->minleaf   = PETSC_MAX_INT;
70   b->maxleaf   = PETSC_MIN_INT;
71   b->nranks    = -1;
72   b->rankorder = PETSC_TRUE;
73   b->ingroup   = MPI_GROUP_NULL;
74   b->outgroup  = MPI_GROUP_NULL;
75   b->graphset  = PETSC_FALSE;
76 #if defined(PETSC_HAVE_DEVICE)
77   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
78   b->use_stream_aware_mpi = PETSC_FALSE;
79   b->unknown_input_stream = PETSC_FALSE;
80   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
81   b->backend = PETSCSF_BACKEND_KOKKOS;
82   #elif defined(PETSC_HAVE_CUDA)
83   b->backend = PETSCSF_BACKEND_CUDA;
84   #elif defined(PETSC_HAVE_HIP)
85   b->backend = PETSCSF_BACKEND_HIP;
86   #endif
87 
88   #if defined(PETSC_HAVE_NVSHMEM)
89   b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
90   b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
91   PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem", &b->use_nvshmem, NULL));
92   PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem_get", &b->use_nvshmem_get, NULL));
93   #endif
94 #endif
95   b->vscat.from_n = -1;
96   b->vscat.to_n   = -1;
97   b->vscat.unit   = MPIU_SCALAR;
98   *sf             = b;
99   PetscFunctionReturn(0);
100 }
101 
102 /*@
103    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
104 
105    Collective
106 
107    Input Parameter:
108 .  sf - star forest
109 
110    Level: advanced
111 
112 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetGraph()`, `PetscSFDestroy()`
113 @*/
114 PetscErrorCode PetscSFReset(PetscSF sf)
115 {
116   PetscFunctionBegin;
117   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
118   PetscTryTypeMethod(sf, Reset);
119   sf->nroots   = -1;
120   sf->nleaves  = -1;
121   sf->minleaf  = PETSC_MAX_INT;
122   sf->maxleaf  = PETSC_MIN_INT;
123   sf->mine     = NULL;
124   sf->remote   = NULL;
125   sf->graphset = PETSC_FALSE;
126   PetscCall(PetscFree(sf->mine_alloc));
127   PetscCall(PetscFree(sf->remote_alloc));
128   sf->nranks = -1;
129   PetscCall(PetscFree4(sf->ranks, sf->roffset, sf->rmine, sf->rremote));
130   sf->degreeknown = PETSC_FALSE;
131   PetscCall(PetscFree(sf->degree));
132   if (sf->ingroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->ingroup));
133   if (sf->outgroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->outgroup));
134   if (sf->multi) sf->multi->multi = NULL;
135   PetscCall(PetscSFDestroy(&sf->multi));
136   PetscCall(PetscLayoutDestroy(&sf->map));
137 
138 #if defined(PETSC_HAVE_DEVICE)
139   for (PetscInt i = 0; i < 2; i++) PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, sf->rmine_d[i]));
140 #endif
141 
142   sf->setupcalled = PETSC_FALSE;
143   PetscFunctionReturn(0);
144 }
145 
146 /*@C
147    PetscSFSetType - Set the `PetscSF` communication implementation
148 
149    Collective
150 
151    Input Parameters:
152 +  sf - the `PetscSF` context
153 -  type - a known method
154 .vb
155     PETSCSFWINDOW - MPI-2/3 one-sided
156     PETSCSFBASIC - basic implementation using MPI-1 two-sided
157 .ve
158 
159    Options Database Key:
160 .  -sf_type <type> - Sets the method; use -help for a list of available methods (for instance, basic, window, neighbor)
161 
162   Level: intermediate
163 
164    Notes:
165    See "include/petscsf.h" for the available methods (for instance, `PETSCSFBASIC`, `PETSCSFWINDOW`, `PETSCSFNEIGHBOR`)
166 
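   Example Usage:
   A small sketch, assuming sf is a `PetscSF` created earlier with `PetscSFCreate()`:
.vb
   PetscCall(PetscSFSetType(sf, PETSCSFWINDOW)); /* switch to the MPI one-sided implementation */
.ve
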
167 .seealso: `PetscSFType`, `PetscSFCreate()`
168 @*/
169 PetscErrorCode PetscSFSetType(PetscSF sf, PetscSFType type)
170 {
171   PetscBool match;
172   PetscErrorCode (*r)(PetscSF);
173 
174   PetscFunctionBegin;
175   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
176   PetscValidCharPointer(type, 2);
177 
178   PetscCall(PetscObjectTypeCompare((PetscObject)sf, type, &match));
179   if (match) PetscFunctionReturn(0);
180 
181   PetscCall(PetscFunctionListFind(PetscSFList, type, &r));
182   PetscCheck(r, PETSC_COMM_SELF, PETSC_ERR_ARG_UNKNOWN_TYPE, "Unable to find requested PetscSF type %s", type);
183   /* Destroy the previous PetscSF implementation context */
184   PetscTryTypeMethod(sf, Destroy);
185   PetscCall(PetscMemzero(sf->ops, sizeof(*sf->ops)));
186   PetscCall(PetscObjectChangeTypeName((PetscObject)sf, type));
187   PetscCall((*r)(sf));
188   PetscFunctionReturn(0);
189 }
190 
191 /*@C
192   PetscSFGetType - Get the `PetscSF` communication implementation
193 
194   Not Collective
195 
196   Input Parameter:
197 . sf  - the `PetscSF` context
198 
199   Output Parameter:
200 . type - the `PetscSF` type name
201 
202   Level: intermediate
203 
204 .seealso: `PetscSFType`, `PetscSFSetType()`, `PetscSFCreate()`
205 @*/
206 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
207 {
208   PetscFunctionBegin;
209   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
210   PetscValidPointer(type, 2);
211   *type = ((PetscObject)sf)->type_name;
212   PetscFunctionReturn(0);
213 }
214 
215 /*@C
216    PetscSFDestroy - destroy star forest
217 
218    Collective
219 
220    Input Parameter:
221 .  sf - address of star forest
222 
223    Level: intermediate
224 
225 .seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFReset()`
226 @*/
227 PetscErrorCode PetscSFDestroy(PetscSF *sf)
228 {
229   PetscFunctionBegin;
230   if (!*sf) PetscFunctionReturn(0);
231   PetscValidHeaderSpecific((*sf), PETSCSF_CLASSID, 1);
232   if (--((PetscObject)(*sf))->refct > 0) {
233     *sf = NULL;
234     PetscFunctionReturn(0);
235   }
236   PetscCall(PetscSFReset(*sf));
237   PetscTryTypeMethod((*sf), Destroy);
238   PetscCall(PetscSFDestroy(&(*sf)->vscat.lsf));
239   if ((*sf)->vscat.bs > 1) PetscCallMPI(MPI_Type_free(&(*sf)->vscat.unit));
240   PetscCall(PetscHeaderDestroy(sf));
241   PetscFunctionReturn(0);
242 }
243 
244 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
245 {
246   PetscInt           i, nleaves;
247   PetscMPIInt        size;
248   const PetscInt    *ilocal;
249   const PetscSFNode *iremote;
250 
251   PetscFunctionBegin;
252   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
253   PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, &iremote));
254   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
255   for (i = 0; i < nleaves; i++) {
256     const PetscInt rank   = iremote[i].rank;
257     const PetscInt remote = iremote[i].index;
258     const PetscInt leaf   = ilocal ? ilocal[i] : i;
259     PetscCheck(rank >= 0 && rank < size, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided rank (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be in [0, %d)", rank, i, size);
260     PetscCheck(remote >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided index (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be >= 0", remote, i);
261     PetscCheck(leaf >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided location (%" PetscInt_FMT ") for leaf %" PetscInt_FMT " is invalid, should be >= 0", leaf, i);
262   }
263   PetscFunctionReturn(0);
264 }
265 
266 /*@
267    PetscSFSetUp - set up communication structures
268 
269    Collective
270 
271    Input Parameter:
272 .  sf - star forest communication object
273 
274    Level: beginner
275 
276 .seealso: `PetscSFType`, `PetscSFSetFromOptions()`, `PetscSFSetType()`
277 @*/
278 PetscErrorCode PetscSFSetUp(PetscSF sf)
279 {
280   PetscFunctionBegin;
281   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
282   PetscSFCheckGraphSet(sf, 1);
283   if (sf->setupcalled) PetscFunctionReturn(0);
284   PetscCall(PetscLogEventBegin(PETSCSF_SetUp, sf, 0, 0, 0));
285   PetscCall(PetscSFCheckGraphValid_Private(sf));
286   if (!((PetscObject)sf)->type_name) PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); /* Zero all sf->ops */
287   PetscTryTypeMethod(sf, SetUp);
288 #if defined(PETSC_HAVE_CUDA)
289   if (sf->backend == PETSCSF_BACKEND_CUDA) {
290     sf->ops->Malloc = PetscSFMalloc_CUDA;
291     sf->ops->Free   = PetscSFFree_CUDA;
292   }
293 #endif
294 #if defined(PETSC_HAVE_HIP)
295   if (sf->backend == PETSCSF_BACKEND_HIP) {
296     sf->ops->Malloc = PetscSFMalloc_HIP;
297     sf->ops->Free   = PetscSFFree_HIP;
298   }
299 #endif
300 
301
302 #if defined(PETSC_HAVE_KOKKOS)
303   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
304     sf->ops->Malloc = PetscSFMalloc_Kokkos;
305     sf->ops->Free   = PetscSFFree_Kokkos;
306   }
307 #endif
308   PetscCall(PetscLogEventEnd(PETSCSF_SetUp, sf, 0, 0, 0));
309   sf->setupcalled = PETSC_TRUE;
310   PetscFunctionReturn(0);
311 }
312 
313 /*@
314    PetscSFSetFromOptions - set `PetscSF` options using the options database
315 
316    Logically Collective
317 
318    Input Parameter:
319 .  sf - star forest
320 
321    Options Database Keys:
322 +  -sf_type                 - implementation type, see PetscSFSetType()
323 .  -sf_rank_order           - sort composite points for gathers and scatters in rank order; gathers are non-deterministic otherwise
324 .  -sf_unknown_input_stream - assume the SF caller computed the input root/leafdata on streams unknown to the SF, so the SF must
325                               synchronize before processing that data (default: false, i.e. the input is assumed to be computed
326                               on the default stream, in which case no stream synchronization is needed between the SF and its caller).
327                               This option only matters together with -use_gpu_aware_mpi 1.
328 .  -sf_use_stream_aware_mpi - assume the underlying MPI is cuda-stream aware so the SF will not synchronize streams for send/recv
329                               buffers passed to MPI (default: false). This option only matters together with -use_gpu_aware_mpi 1.
330 -  -sf_backend cuda | hip | kokkos - select the device backend SF uses. Currently SF has cuda, hip and kokkos backends.
331                               On CUDA (HIP) devices, one can choose cuda (hip) or kokkos, with the default being kokkos. On other
332                               devices, the only available backend is kokkos.
333 
334    Level: intermediate
335 
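   Example Usage:
   A sketch of the usual pattern; sf, nroots, nleaves and iremote are assumed to have been created and filled by the caller, and the
   options mentioned in the comment are only examples.
.vb
   PetscCall(PetscSFSetGraph(sf, nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
   PetscCall(PetscSFSetFromOptions(sf)); /* then run with, e.g., -sf_type window or -sf_backend kokkos */
   PetscCall(PetscSFSetUp(sf));
.ve
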
336 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetType()`
337 @*/
338 PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
339 {
340   PetscSFType deft;
341   char        type[256];
342   PetscBool   flg;
343 
344   PetscFunctionBegin;
345   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
346   PetscObjectOptionsBegin((PetscObject)sf);
347   deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
348   PetscCall(PetscOptionsFList("-sf_type", "PetscSF implementation type", "PetscSFSetType", PetscSFList, deft, type, sizeof(type), &flg));
349   PetscCall(PetscSFSetType(sf, flg ? type : deft));
350   PetscCall(PetscOptionsBool("-sf_rank_order", "sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise", "PetscSFSetRankOrder", sf->rankorder, &sf->rankorder, NULL));
351 #if defined(PETSC_HAVE_DEVICE)
352   {
353     char      backendstr[32] = {0};
354     PetscBool isCuda = PETSC_FALSE, isHip = PETSC_FALSE, isKokkos = PETSC_FALSE, set;
355     /* Change the defaults set in PetscSFCreate() with command line options */
356     PetscCall(PetscOptionsBool("-sf_unknown_input_stream", "SF root/leafdata is computed on arbitrary streams unknown to SF", "PetscSFSetFromOptions", sf->unknown_input_stream, &sf->unknown_input_stream, NULL));
357     PetscCall(PetscOptionsBool("-sf_use_stream_aware_mpi", "Assume the underlying MPI is cuda-stream aware", "PetscSFSetFromOptions", sf->use_stream_aware_mpi, &sf->use_stream_aware_mpi, NULL));
358     PetscCall(PetscOptionsString("-sf_backend", "Select the device backend SF uses", "PetscSFSetFromOptions", NULL, backendstr, sizeof(backendstr), &set));
359     PetscCall(PetscStrcasecmp("cuda", backendstr, &isCuda));
360     PetscCall(PetscStrcasecmp("kokkos", backendstr, &isKokkos));
361     PetscCall(PetscStrcasecmp("hip", backendstr, &isHip));
362   #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
363     if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
364     else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
365     else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
366     else PetscCheck(!set, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
367   #elif defined(PETSC_HAVE_KOKKOS)
368     PetscCheck(!set || isKokkos, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You can only choose kokkos", backendstr);
369   #endif
370   }
371 #endif
372   PetscTryTypeMethod(sf, SetFromOptions, PetscOptionsObject);
373   PetscOptionsEnd();
374   PetscFunctionReturn(0);
375 }
376 
377 /*@
378    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
379 
380    Logically Collective
381 
382    Input Parameters:
383 +  sf - star forest
384 -  flg - `PETSC_TRUE` to sort, `PETSC_FALSE` to skip sorting (lower setup cost, but non-deterministic)
385 
386    Level: advanced
387 
388 .seealso: `PetscSFType`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`
389 @*/
390 PetscErrorCode PetscSFSetRankOrder(PetscSF sf, PetscBool flg)
391 {
392   PetscFunctionBegin;
393   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
394   PetscValidLogicalCollectiveBool(sf, flg, 2);
395   PetscCheck(!sf->multi, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
396   sf->rankorder = flg;
397   PetscFunctionReturn(0);
398 }
399 
400 /*@C
401    PetscSFSetGraph - Set a parallel star forest
402 
403    Collective
404 
405    Input Parameters:
406 +  sf - star forest
407 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
408 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
409 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
410             during setup in debug mode)
411 .  localmode - copy mode for ilocal
412 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
413             during setup in debug mode)
414 -  remotemode - copy mode for iremote
415 
416    Level: intermediate
417 
418    Notes:
419    Leaf indices in ilocal must be unique, otherwise an error occurs.
420 
421    The input arrays ilocal and iremote follow the `PetscCopyMode` semantics.
422    In particular, if localmode/remotemode is `PETSC_OWN_POINTER` or `PETSC_USE_POINTER`,
423    PETSc might modify the respective array;
424    if `PETSC_USE_POINTER`, the user must delete the array after `PetscSFDestroy()`.
425    Only with `PETSC_COPY_VALUES` is the respective array guaranteed to stay intact, in which case a const array can be passed (though a cast to non-const is needed).
426 
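   Example Usage:
   A minimal sketch in which every process owns one root and attaches two contiguous leaves, one referencing root 0 on rank 0 and one
   referencing its own root 0; the sizes and indices are purely illustrative.
.vb
   PetscSF     sf;
   PetscSFNode iremote[2];
   PetscMPIInt rank;

   PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
   PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf));
   iremote[0].rank  = 0;    /* leaf 0 references root 0 on rank 0 */
   iremote[0].index = 0;
   iremote[1].rank  = rank; /* leaf 1 references root 0 on this rank */
   iremote[1].index = 0;
   PetscCall(PetscSFSetGraph(sf, 1, 2, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
   PetscCall(PetscSFSetUp(sf));
.ve
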
427    Fortran Note:
428    In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode.
429 
430    Developer Note:
431    We sort leaves to check for duplicates and contiguousness and to find minleaf/maxleaf.
432    This also makes it easy to compare the leaf sets of two SFs.
433 
434 .seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
435 @*/
436 PetscErrorCode PetscSFSetGraph(PetscSF sf, PetscInt nroots, PetscInt nleaves, PetscInt *ilocal, PetscCopyMode localmode, PetscSFNode *iremote, PetscCopyMode remotemode)
437 {
438   PetscBool unique, contiguous;
439 
440   PetscFunctionBegin;
441   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
442   if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal, 4);
443   if (nleaves > 0) PetscValidPointer(iremote, 6);
444   PetscCheck(nroots >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nroots %" PetscInt_FMT ", cannot be negative", nroots);
445   PetscCheck(nleaves >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nleaves %" PetscInt_FMT ", cannot be negative", nleaves);
446   /* enums may be handled as unsigned by some compilers, NVHPC for example, the int cast
447    * below is to prevent NVHPC from warning about meaningless comparison of unsigned with zero */
448   PetscCheck((int)localmode >= PETSC_COPY_VALUES && localmode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong localmode %d", localmode);
449   PetscCheck((int)remotemode >= PETSC_COPY_VALUES && remotemode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong remotemode %d", remotemode);
450 
451   if (sf->nroots >= 0) { /* Reset only if graph already set */
452     PetscCall(PetscSFReset(sf));
453   }
454 
455   PetscCall(PetscLogEventBegin(PETSCSF_SetGraph, sf, 0, 0, 0));
456 
457   sf->nroots  = nroots;
458   sf->nleaves = nleaves;
459 
460   if (localmode == PETSC_COPY_VALUES && ilocal) {
461     PetscInt *tlocal = NULL;
462 
463     PetscCall(PetscMalloc1(nleaves, &tlocal));
464     PetscCall(PetscArraycpy(tlocal, ilocal, nleaves));
465     ilocal = tlocal;
466   }
467   if (remotemode == PETSC_COPY_VALUES) {
468     PetscSFNode *tremote = NULL;
469 
470     PetscCall(PetscMalloc1(nleaves, &tremote));
471     PetscCall(PetscArraycpy(tremote, iremote, nleaves));
472     iremote = tremote;
473   }
474 
475   if (nleaves && ilocal) {
476     PetscSFNode work;
477 
478     PetscCall(PetscSortIntWithDataArray(nleaves, ilocal, iremote, sizeof(PetscSFNode), &work));
479     PetscCall(PetscSortedCheckDupsInt(nleaves, ilocal, &unique));
480     unique = PetscNot(unique);
481     PetscCheck(sf->allow_multi_leaves || unique, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Input ilocal has duplicate entries which is not allowed for this PetscSF");
482     sf->minleaf = ilocal[0];
483     sf->maxleaf = ilocal[nleaves - 1];
484     contiguous  = (PetscBool)(unique && ilocal[0] == 0 && ilocal[nleaves - 1] == nleaves - 1);
485   } else {
486     sf->minleaf = 0;
487     sf->maxleaf = nleaves - 1;
488     unique      = PETSC_TRUE;
489     contiguous  = PETSC_TRUE;
490   }
491 
492   if (contiguous) {
493     if (localmode == PETSC_USE_POINTER) {
494       ilocal = NULL;
495     } else {
496       PetscCall(PetscFree(ilocal));
497     }
498   }
499   sf->mine = ilocal;
500   if (localmode == PETSC_USE_POINTER) {
501     sf->mine_alloc = NULL;
502   } else {
503     sf->mine_alloc = ilocal;
504   }
505   sf->remote = iremote;
506   if (remotemode == PETSC_USE_POINTER) {
507     sf->remote_alloc = NULL;
508   } else {
509     sf->remote_alloc = iremote;
510   }
511   PetscCall(PetscLogEventEnd(PETSCSF_SetGraph, sf, 0, 0, 0));
512   sf->graphset = PETSC_TRUE;
513   PetscFunctionReturn(0);
514 }
515 
516 /*@
517   PetscSFSetGraphWithPattern - Sets the graph of a `PetscSF` with a specific pattern
518 
519   Collective
520 
521   Input Parameters:
522 + sf      - The `PetscSF`
523 . map     - Layout of roots over all processes (insignificant when pattern is `PETSCSF_PATTERN_ALLTOALL`)
524 - pattern - One of `PETSCSF_PATTERN_ALLGATHER`, `PETSCSF_PATTERN_GATHER`, `PETSCSF_PATTERN_ALLTOALL`
525 
526   Level: intermediate
527 
528   Notes:
529   It is easier to explain `PetscSFPattern` using vectors. Suppose we have an MPI vector x and its layout is map.
530   n and N are local and global sizes of x respectively.
531 
532   With `PETSCSF_PATTERN_ALLGATHER`, the routine creates a graph such that, if one does a Bcast on it, it copies x to
533   sequential vectors y on all ranks.
534 
535   With `PETSCSF_PATTERN_GATHER`, the routine creates a graph such that, if one does a Bcast on it, it copies x to a
536   sequential vector y on rank 0.
537 
538   In the above cases, entries of x are roots and entries of y are leaves.
539 
540   With `PETSCSF_PATTERN_ALLTOALL`, map is insignificant. Suppose NP is the size of sf's communicator. The routine
541   creates a graph in which every rank has NP leaves and NP roots. On rank i, leaf j is connected to root i
542   of rank j, for 0 <= i,j < NP. It is a kind of `MPI_Alltoall()` with sendcount/recvcount being 1. This does not
543   mean one cannot send multiple items; one just needs to create a new MPI datatype for the multiple data
544   items with `MPI_Type_contiguous()` and use that as the <unit> argument in SF routines.
545 
546   In this case, roots and leaves are symmetric.
547 
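  Example Usage:
  A sketch creating an allgather-type SF from the layout of an existing parallel vector; x is assumed to be a Vec provided by the caller.
.vb
  PetscSF     sf;
  PetscLayout map;

  PetscCall(VecGetLayout(x, &map));
  PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf));
  PetscCall(PetscSFSetGraphWithPattern(sf, map, PETSCSF_PATTERN_ALLGATHER));
  PetscCall(PetscSFSetUp(sf));
.ve
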
548 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
549  @*/
550 PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf, PetscLayout map, PetscSFPattern pattern)
551 {
552   MPI_Comm    comm;
553   PetscInt    n, N, res[2];
554   PetscMPIInt rank, size;
555   PetscSFType type;
556 
557   PetscFunctionBegin;
558   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
559   if (pattern != PETSCSF_PATTERN_ALLTOALL) PetscValidPointer(map, 2);
560   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
561   PetscCheck(pattern >= PETSCSF_PATTERN_ALLGATHER && pattern <= PETSCSF_PATTERN_ALLTOALL, comm, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported PetscSFPattern %d", pattern);
562   PetscCallMPI(MPI_Comm_rank(comm, &rank));
563   PetscCallMPI(MPI_Comm_size(comm, &size));
564 
565   if (pattern == PETSCSF_PATTERN_ALLTOALL) {
566     type = PETSCSFALLTOALL;
567     PetscCall(PetscLayoutCreate(comm, &sf->map));
568     PetscCall(PetscLayoutSetLocalSize(sf->map, size));
569     PetscCall(PetscLayoutSetSize(sf->map, ((PetscInt)size) * size));
570     PetscCall(PetscLayoutSetUp(sf->map));
571   } else {
572     PetscCall(PetscLayoutGetLocalSize(map, &n));
573     PetscCall(PetscLayoutGetSize(map, &N));
574     res[0] = n;
575     res[1] = -n;
576     /* Check if n are same over all ranks so that we can optimize it */
577     PetscCall(MPIU_Allreduce(MPI_IN_PLACE, res, 2, MPIU_INT, MPI_MAX, comm));
578     if (res[0] == -res[1]) { /* same n */
579       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER : PETSCSFGATHER;
580     } else {
581       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
582     }
583     PetscCall(PetscLayoutReference(map, &sf->map));
584   }
585   PetscCall(PetscSFSetType(sf, type));
586 
587   sf->pattern = pattern;
588   sf->mine    = NULL; /* Contiguous */
589 
590   /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
591      Also set other easy stuff.
592    */
593   if (pattern == PETSCSF_PATTERN_ALLGATHER) {
594     sf->nleaves = N;
595     sf->nroots  = n;
596     sf->nranks  = size;
597     sf->minleaf = 0;
598     sf->maxleaf = N - 1;
599   } else if (pattern == PETSCSF_PATTERN_GATHER) {
600     sf->nleaves = rank ? 0 : N;
601     sf->nroots  = n;
602     sf->nranks  = rank ? 0 : size;
603     sf->minleaf = 0;
604     sf->maxleaf = rank ? -1 : N - 1;
605   } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
606     sf->nleaves = size;
607     sf->nroots  = size;
608     sf->nranks  = size;
609     sf->minleaf = 0;
610     sf->maxleaf = size - 1;
611   }
612   sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
613   sf->graphset = PETSC_TRUE;
614   PetscFunctionReturn(0);
615 }
616 
617 /*@
618    PetscSFCreateInverseSF - given a `PetscSF` in which all vertices have degree 1, creates the inverse map
619 
620    Collective
621 
622    Input Parameter:
623 .  sf - star forest to invert
624 
625    Output Parameter:
626 .  isf - inverse of sf
627 
628    Level: advanced
629 
630    Notes:
631    All roots must have degree 1.
632 
633    The local space may be a permutation, but cannot be sparse.
634 
635 .seealso: `PetscSFType`, `PetscSFSetGraph()`
636 @*/
637 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf, PetscSF *isf)
638 {
639   PetscMPIInt     rank;
640   PetscInt        i, nroots, nleaves, maxlocal, count, *newilocal;
641   const PetscInt *ilocal;
642   PetscSFNode    *roots, *leaves;
643 
644   PetscFunctionBegin;
645   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
646   PetscSFCheckGraphSet(sf, 1);
647   PetscValidPointer(isf, 2);
648 
649   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, NULL));
650   maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
651 
652   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
653   PetscCall(PetscMalloc2(nroots, &roots, maxlocal, &leaves));
654   for (i = 0; i < maxlocal; i++) {
655     leaves[i].rank  = rank;
656     leaves[i].index = i;
657   }
658   for (i = 0; i < nroots; i++) {
659     roots[i].rank  = -1;
660     roots[i].index = -1;
661   }
662   PetscCall(PetscSFReduceBegin(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
663   PetscCall(PetscSFReduceEnd(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
664 
665   /* Check whether our leaves are sparse */
666   for (i = 0, count = 0; i < nroots; i++)
667     if (roots[i].rank >= 0) count++;
668   if (count == nroots) newilocal = NULL;
669   else { /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */ PetscCall(PetscMalloc1(count, &newilocal));
670     for (i = 0, count = 0; i < nroots; i++) {
671       if (roots[i].rank >= 0) {
672         newilocal[count]   = i;
673         roots[count].rank  = roots[i].rank;
674         roots[count].index = roots[i].index;
675         count++;
676       }
677     }
678   }
679 
680   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, isf));
681   PetscCall(PetscSFSetGraph(*isf, maxlocal, count, newilocal, PETSC_OWN_POINTER, roots, PETSC_COPY_VALUES));
682   PetscCall(PetscFree2(roots, leaves));
683   PetscFunctionReturn(0);
684 }
685 
686 /*@
687    PetscSFDuplicate - duplicate a `PetscSF`, optionally preserving rank connectivity and graph
688 
689    Collective
690 
691    Input Parameters:
692 +  sf - communication object to duplicate
693 -  opt - `PETSCSF_DUPLICATE_CONFONLY`, `PETSCSF_DUPLICATE_RANKS`, or `PETSCSF_DUPLICATE_GRAPH` (see `PetscSFDuplicateOption`)
694 
695    Output Parameter:
696 .  newsf - new communication object
697 
698    Level: beginner
699 
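   Example Usage:
   A short sketch, assuming sf is an existing `PetscSF` whose graph has already been set:
.vb
   PetscSF dupsf;

   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_GRAPH, &dupsf)); /* copy the type, rank information and graph */
   PetscCall(PetscSFSetUp(dupsf));
   PetscCall(PetscSFDestroy(&dupsf));
.ve
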
700 .seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFSetType()`, `PetscSFSetGraph()`
701 @*/
702 PetscErrorCode PetscSFDuplicate(PetscSF sf, PetscSFDuplicateOption opt, PetscSF *newsf)
703 {
704   PetscSFType  type;
705   MPI_Datatype dtype = MPIU_SCALAR;
706 
707   PetscFunctionBegin;
708   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
709   PetscValidLogicalCollectiveEnum(sf, opt, 2);
710   PetscValidPointer(newsf, 3);
711   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sf), newsf));
712   PetscCall(PetscSFGetType(sf, &type));
713   if (type) PetscCall(PetscSFSetType(*newsf, type));
714   (*newsf)->allow_multi_leaves = sf->allow_multi_leaves; /* Dup this flag earlier since PetscSFSetGraph() below checks on this flag */
715   if (opt == PETSCSF_DUPLICATE_GRAPH) {
716     PetscSFCheckGraphSet(sf, 1);
717     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
718       PetscInt           nroots, nleaves;
719       const PetscInt    *ilocal;
720       const PetscSFNode *iremote;
721       PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
722       PetscCall(PetscSFSetGraph(*newsf, nroots, nleaves, (PetscInt *)ilocal, PETSC_COPY_VALUES, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
723     } else {
724       PetscCall(PetscSFSetGraphWithPattern(*newsf, sf->map, sf->pattern));
725     }
726   }
727   /* Since oldtype is committed, so is newtype, according to MPI */
728   if (sf->vscat.bs > 1) PetscCallMPI(MPI_Type_dup(sf->vscat.unit, &dtype));
729   (*newsf)->vscat.bs     = sf->vscat.bs;
730   (*newsf)->vscat.unit   = dtype;
731   (*newsf)->vscat.to_n   = sf->vscat.to_n;
732   (*newsf)->vscat.from_n = sf->vscat.from_n;
733   /* Do not copy lsf. Build it on demand since it is rarely used */
734 
735 #if defined(PETSC_HAVE_DEVICE)
736   (*newsf)->backend              = sf->backend;
737   (*newsf)->unknown_input_stream = sf->unknown_input_stream;
738   (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
739   (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
740 #endif
741   PetscTryTypeMethod(sf, Duplicate, opt, *newsf);
742   /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
743   PetscFunctionReturn(0);
744 }
745 
746 /*@C
747    PetscSFGetGraph - Get the graph specifying a parallel star forest
748 
749    Not Collective
750 
751    Input Parameter:
752 .  sf - star forest
753 
754    Output Parameters:
755 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
756 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
757 .  ilocal - locations of leaves in leafdata buffers (if returned value is NULL, it means leaves are in contiguous storage)
758 -  iremote - remote locations of root vertices for each leaf on the current process
759 
760    Level: intermediate
761 
762    Notes:
763      We are not currently requiring that the graph is set, so nroots is returned as -1 if it has not been set yet
764 
765      The returned ilocal/iremote might contain values in different order than the input ones in `PetscSFSetGraph()`,
766      see its manpage for details.
767 
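   Example Usage:
   A sketch that prints, for each local leaf, the (rank, index) of the root it references; sf is assumed to have its graph set.
.vb
   PetscInt           nroots, nleaves, i;
   const PetscInt    *ilocal;
   const PetscSFNode *iremote;

   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
   for (i = 0; i < nleaves; i++) {
     const PetscInt leaf = ilocal ? ilocal[i] : i; /* NULL ilocal means contiguous leaf storage */
     PetscCall(PetscPrintf(PETSC_COMM_SELF, "leaf %" PetscInt_FMT " <- (%" PetscInt_FMT ", %" PetscInt_FMT ")\n", leaf, iremote[i].rank, iremote[i].index));
   }
.ve
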
768    Fortran Notes:
769      The returned iremote array is a copy and must be deallocated after use. Consequently, if you
770      want to update the graph, you must call `PetscSFSetGraph()` after modifying the iremote array.
771 
772      To check for a NULL ilocal use
773 $      if (loc(ilocal) == loc(PETSC_NULL_INTEGER)) then
774 
775 .seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`
776 @*/
777 PetscErrorCode PetscSFGetGraph(PetscSF sf, PetscInt *nroots, PetscInt *nleaves, const PetscInt **ilocal, const PetscSFNode **iremote)
778 {
779   PetscFunctionBegin;
780   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
781   if (sf->ops->GetGraph) {
782     PetscCall((sf->ops->GetGraph)(sf, nroots, nleaves, ilocal, iremote));
783   } else {
784     if (nroots) *nroots = sf->nroots;
785     if (nleaves) *nleaves = sf->nleaves;
786     if (ilocal) *ilocal = sf->mine;
787     if (iremote) *iremote = sf->remote;
788   }
789   PetscFunctionReturn(0);
790 }
791 
792 /*@
793    PetscSFGetLeafRange - Get the active leaf ranges
794 
795    Not Collective
796 
797    Input Parameter:
798 .  sf - star forest
799 
800    Output Parameters:
801 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
802 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
803 
804    Level: developer
805 
806 .seealso: `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
807 @*/
808 PetscErrorCode PetscSFGetLeafRange(PetscSF sf, PetscInt *minleaf, PetscInt *maxleaf)
809 {
810   PetscFunctionBegin;
811   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
812   PetscSFCheckGraphSet(sf, 1);
813   if (minleaf) *minleaf = sf->minleaf;
814   if (maxleaf) *maxleaf = sf->maxleaf;
815   PetscFunctionReturn(0);
816 }
817 
818 /*@C
819    PetscSFViewFromOptions - View a `PetscSF` based on arguments in the options database
820 
821    Collective on A
822 
823    Input Parameters:
824 +  A - the star forest
825 .  obj - Optional object that provides the prefix for the option names
826 -  name - command line option
827 
828    Level: intermediate
829 
830 .seealso: `PetscSF`, `PetscSFView`, `PetscObjectViewFromOptions()`, `PetscSFCreate()`
831 @*/
832 PetscErrorCode PetscSFViewFromOptions(PetscSF A, PetscObject obj, const char name[])
833 {
834   PetscFunctionBegin;
835   PetscValidHeaderSpecific(A, PETSCSF_CLASSID, 1);
836   PetscCall(PetscObjectViewFromOptions((PetscObject)A, obj, name));
837   PetscFunctionReturn(0);
838 }
839 
840 /*@C
841    PetscSFView - view a star forest
842 
843    Collective
844 
845    Input Parameters:
846 +  sf - star forest
847 -  viewer - viewer to display graph, for example `PETSC_VIEWER_STDOUT_WORLD`
848 
849    Level: beginner
850 
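   Example Usage:
   A one-line sketch, assuming sf has been set up:
.vb
   PetscCall(PetscSFView(sf, PETSC_VIEWER_STDOUT_WORLD));
.ve
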
851 .seealso: `PetscSF`, `PetscViewer`, `PetscSFCreate()`, `PetscSFSetGraph()`
852 @*/
853 PetscErrorCode PetscSFView(PetscSF sf, PetscViewer viewer)
854 {
855   PetscBool         iascii;
856   PetscViewerFormat format;
857 
858   PetscFunctionBegin;
859   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
860   if (!viewer) PetscCall(PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf), &viewer));
861   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
862   PetscCheckSameComm(sf, 1, viewer, 2);
863   if (sf->graphset) PetscCall(PetscSFSetUp(sf));
864   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
865   if (iascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) {
866     PetscMPIInt rank;
867     PetscInt    ii, i, j;
868 
869     PetscCall(PetscObjectPrintClassNamePrefixType((PetscObject)sf, viewer));
870     PetscCall(PetscViewerASCIIPushTab(viewer));
871     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
872       if (!sf->graphset) {
873         PetscCall(PetscViewerASCIIPrintf(viewer, "PetscSFSetGraph() has not been called yet\n"));
874         PetscCall(PetscViewerASCIIPopTab(viewer));
875         PetscFunctionReturn(0);
876       }
877       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
878       PetscCall(PetscViewerASCIIPushSynchronized(viewer));
879       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Number of roots=%" PetscInt_FMT ", leaves=%" PetscInt_FMT ", remote ranks=%" PetscInt_FMT "\n", rank, sf->nroots, sf->nleaves, sf->nranks));
880       for (i = 0; i < sf->nleaves; i++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %" PetscInt_FMT " <- (%" PetscInt_FMT ",%" PetscInt_FMT ")\n", rank, sf->mine ? sf->mine[i] : i, sf->remote[i].rank, sf->remote[i].index));
881       PetscCall(PetscViewerFlush(viewer));
882       PetscCall(PetscViewerGetFormat(viewer, &format));
883       if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
884         PetscMPIInt *tmpranks, *perm;
885         PetscCall(PetscMalloc2(sf->nranks, &tmpranks, sf->nranks, &perm));
886         PetscCall(PetscArraycpy(tmpranks, sf->ranks, sf->nranks));
887         for (i = 0; i < sf->nranks; i++) perm[i] = i;
888         PetscCall(PetscSortMPIIntWithArray(sf->nranks, tmpranks, perm));
889         PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Roots referenced by my leaves, by rank\n", rank));
890         for (ii = 0; ii < sf->nranks; ii++) {
891           i = perm[ii];
892           PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %d: %" PetscInt_FMT " edges\n", rank, sf->ranks[i], sf->roffset[i + 1] - sf->roffset[i]));
893           for (j = sf->roffset[i]; j < sf->roffset[i + 1]; j++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d]    %" PetscInt_FMT " <- %" PetscInt_FMT "\n", rank, sf->rmine[j], sf->rremote[j]));
894         }
895         PetscCall(PetscFree2(tmpranks, perm));
896       }
897       PetscCall(PetscViewerFlush(viewer));
898       PetscCall(PetscViewerASCIIPopSynchronized(viewer));
899     }
900     PetscCall(PetscViewerASCIIPopTab(viewer));
901   }
902   PetscTryTypeMethod(sf, View, viewer);
903   PetscFunctionReturn(0);
904 }
905 
906 /*@C
907    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
908 
909    Not Collective
910 
911    Input Parameter:
912 .  sf - star forest
913 
914    Output Parameters:
915 +  nranks - number of ranks referenced by local part
916 .  ranks - array of ranks
917 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
918 .  rmine - concatenated array holding local indices referencing each remote rank
919 -  rremote - concatenated array holding remote indices referenced for each remote rank
920 
921    Level: developer
922 
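   Example Usage:
   A sketch that walks the per-rank groups of leaf-to-root connections; sf is assumed to have been set up with `PetscSFSetUp()`.
.vb
   PetscInt           nranks, r, j;
   const PetscMPIInt *ranks;
   const PetscInt    *roffset, *rmine, *rremote;

   PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, &roffset, &rmine, &rremote));
   for (r = 0; r < nranks; r++) {
     for (j = roffset[r]; j < roffset[r + 1]; j++) {
       /* local leaf rmine[j] references root rremote[j] owned by rank ranks[r] */
     }
   }
.ve
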
923 .seealso: `PetscSF`, `PetscSFGetLeafRanks()`
924 @*/
925 PetscErrorCode PetscSFGetRootRanks(PetscSF sf, PetscInt *nranks, const PetscMPIInt **ranks, const PetscInt **roffset, const PetscInt **rmine, const PetscInt **rremote)
926 {
927   PetscFunctionBegin;
928   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
929   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
930   if (sf->ops->GetRootRanks) {
931     PetscCall((sf->ops->GetRootRanks)(sf, nranks, ranks, roffset, rmine, rremote));
932   } else {
933     /* The generic implementation */
934     if (nranks) *nranks = sf->nranks;
935     if (ranks) *ranks = sf->ranks;
936     if (roffset) *roffset = sf->roffset;
937     if (rmine) *rmine = sf->rmine;
938     if (rremote) *rremote = sf->rremote;
939   }
940   PetscFunctionReturn(0);
941 }
942 
943 /*@C
944    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
945 
946    Not Collective
947 
948    Input Parameter:
949 .  sf - star forest
950 
951    Output Parameters:
952 +  niranks - number of leaf ranks referencing roots on this process
953 .  iranks - array of ranks
954 .  ioffset - offset in irootloc for each rank (length niranks+1)
955 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
956 
957    Level: developer
958 
959 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
960 @*/
961 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf, PetscInt *niranks, const PetscMPIInt **iranks, const PetscInt **ioffset, const PetscInt **irootloc)
962 {
963   PetscFunctionBegin;
964   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
965   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
966   if (sf->ops->GetLeafRanks) {
967     PetscCall((sf->ops->GetLeafRanks)(sf, niranks, iranks, ioffset, irootloc));
968   } else {
969     PetscSFType type;
970     PetscCall(PetscSFGetType(sf, &type));
971     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
972   }
973   PetscFunctionReturn(0);
974 }
975 
976 static PetscBool InList(PetscMPIInt needle, PetscMPIInt n, const PetscMPIInt *list)
977 {
978   PetscInt i;
979   for (i = 0; i < n; i++) {
980     if (needle == list[i]) return PETSC_TRUE;
981   }
982   return PETSC_FALSE;
983 }
984 
985 /*@C
986    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by `PetscSF` implementations.
987 
988    Collective
989 
990    Input Parameters:
991 +  sf - `PetscSF` to set up; `PetscSFSetGraph()` must have been called
992 -  dgroup - `MPI_Group` of ranks to be distinguished (e.g., for self or shared memory exchange)
993 
994    Level: developer
995 
996 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
997 @*/
998 PetscErrorCode PetscSFSetUpRanks(PetscSF sf, MPI_Group dgroup)
999 {
1000   PetscHMapI    table;
1001   PetscHashIter pos;
1002   PetscMPIInt   size, groupsize, *groupranks;
1003   PetscInt     *rcount, *ranks;
1004   PetscInt      i, irank = -1, orank = -1;
1005 
1006   PetscFunctionBegin;
1007   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1008   PetscSFCheckGraphSet(sf, 1);
1009   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
1010   PetscCall(PetscHMapICreateWithSize(10, &table));
1011   for (i = 0; i < sf->nleaves; i++) {
1012     /* Log 1-based rank */
1013     PetscCall(PetscHMapISetWithMode(table, sf->remote[i].rank + 1, 1, ADD_VALUES));
1014   }
1015   PetscCall(PetscHMapIGetSize(table, &sf->nranks));
1016   PetscCall(PetscMalloc4(sf->nranks, &sf->ranks, sf->nranks + 1, &sf->roffset, sf->nleaves, &sf->rmine, sf->nleaves, &sf->rremote));
1017   PetscCall(PetscMalloc2(sf->nranks, &rcount, sf->nranks, &ranks));
1018   PetscHashIterBegin(table, pos);
1019   for (i = 0; i < sf->nranks; i++) {
1020     PetscHashIterGetKey(table, pos, ranks[i]);
1021     PetscHashIterGetVal(table, pos, rcount[i]);
1022     PetscHashIterNext(table, pos);
1023     ranks[i]--; /* Convert back to 0-based */
1024   }
1025   PetscCall(PetscHMapIDestroy(&table));
1026 
1027   /* We expect that dgroup is reliably "small" while nranks could be large */
1028   {
1029     MPI_Group    group = MPI_GROUP_NULL;
1030     PetscMPIInt *dgroupranks;
1031     PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
1032     PetscCallMPI(MPI_Group_size(dgroup, &groupsize));
1033     PetscCall(PetscMalloc1(groupsize, &dgroupranks));
1034     PetscCall(PetscMalloc1(groupsize, &groupranks));
1035     for (i = 0; i < groupsize; i++) dgroupranks[i] = i;
1036     if (groupsize) PetscCallMPI(MPI_Group_translate_ranks(dgroup, groupsize, dgroupranks, group, groupranks));
1037     PetscCallMPI(MPI_Group_free(&group));
1038     PetscCall(PetscFree(dgroupranks));
1039   }
1040 
1041   /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
1042   for (sf->ndranks = 0, i = sf->nranks; sf->ndranks < i;) {
1043     for (i--; sf->ndranks < i; i--) { /* Scan i backward looking for distinguished rank */
1044       if (InList(ranks[i], groupsize, groupranks)) break;
1045     }
1046     for (; sf->ndranks <= i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
1047       if (!InList(ranks[sf->ndranks], groupsize, groupranks)) break;
1048     }
1049     if (sf->ndranks < i) { /* Swap ranks[sf->ndranks] with ranks[i] */
1050       PetscInt tmprank, tmpcount;
1051 
1052       tmprank             = ranks[i];
1053       tmpcount            = rcount[i];
1054       ranks[i]            = ranks[sf->ndranks];
1055       rcount[i]           = rcount[sf->ndranks];
1056       ranks[sf->ndranks]  = tmprank;
1057       rcount[sf->ndranks] = tmpcount;
1058       sf->ndranks++;
1059     }
1060   }
1061   PetscCall(PetscFree(groupranks));
1062   PetscCall(PetscSortIntWithArray(sf->ndranks, ranks, rcount));
1063   PetscCall(PetscSortIntWithArray(sf->nranks - sf->ndranks, ranks + sf->ndranks, rcount + sf->ndranks));
1064   sf->roffset[0] = 0;
1065   for (i = 0; i < sf->nranks; i++) {
1066     PetscCall(PetscMPIIntCast(ranks[i], sf->ranks + i));
1067     sf->roffset[i + 1] = sf->roffset[i] + rcount[i];
1068     rcount[i]          = 0;
1069   }
1070   for (i = 0, irank = -1, orank = -1; i < sf->nleaves; i++) {
1071     /* short circuit */
1072     if (orank != sf->remote[i].rank) {
1073       /* Search for index of iremote[i].rank in sf->ranks */
1074       PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->ndranks, sf->ranks, &irank));
1075       if (irank < 0) {
1076         PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->nranks - sf->ndranks, sf->ranks + sf->ndranks, &irank));
1077         if (irank >= 0) irank += sf->ndranks;
1078       }
1079       orank = sf->remote[i].rank;
1080     }
1081     PetscCheck(irank >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Could not find rank %" PetscInt_FMT " in array", sf->remote[i].rank);
1082     sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
1083     sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
1084     rcount[irank]++;
1085   }
1086   PetscCall(PetscFree2(rcount, ranks));
1087   PetscFunctionReturn(0);
1088 }
1089 
1090 /*@C
1091    PetscSFGetGroups - gets incoming and outgoing process groups
1092 
1093    Collective
1094 
1095    Input Parameter:
1096 .  sf - star forest
1097 
1098    Output Parameters:
1099 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1100 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1101 
1102    Level: developer
1103 
1104 .seealso: `PetscSF`, `PetscSFGetWindow()`, `PetscSFRestoreWindow()`
1105 @*/
1106 PetscErrorCode PetscSFGetGroups(PetscSF sf, MPI_Group *incoming, MPI_Group *outgoing)
1107 {
1108   MPI_Group group = MPI_GROUP_NULL;
1109 
1110   PetscFunctionBegin;
1111   PetscCheck(sf->nranks >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUpRanks() before obtaining groups");
1112   if (sf->ingroup == MPI_GROUP_NULL) {
1113     PetscInt        i;
1114     const PetscInt *indegree;
1115     PetscMPIInt     rank, *outranks, *inranks;
1116     PetscSFNode    *remote;
1117     PetscSF         bgcount;
1118 
1119     /* Compute the number of incoming ranks */
1120     PetscCall(PetscMalloc1(sf->nranks, &remote));
1121     for (i = 0; i < sf->nranks; i++) {
1122       remote[i].rank  = sf->ranks[i];
1123       remote[i].index = 0;
1124     }
1125     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, &bgcount));
1126     PetscCall(PetscSFSetGraph(bgcount, 1, sf->nranks, NULL, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
1127     PetscCall(PetscSFComputeDegreeBegin(bgcount, &indegree));
1128     PetscCall(PetscSFComputeDegreeEnd(bgcount, &indegree));
1129     /* Enumerate the incoming ranks */
1130     PetscCall(PetscMalloc2(indegree[0], &inranks, sf->nranks, &outranks));
1131     PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
1132     for (i = 0; i < sf->nranks; i++) outranks[i] = rank;
1133     PetscCall(PetscSFGatherBegin(bgcount, MPI_INT, outranks, inranks));
1134     PetscCall(PetscSFGatherEnd(bgcount, MPI_INT, outranks, inranks));
1135     PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
1136     PetscCallMPI(MPI_Group_incl(group, indegree[0], inranks, &sf->ingroup));
1137     PetscCallMPI(MPI_Group_free(&group));
1138     PetscCall(PetscFree2(inranks, outranks));
1139     PetscCall(PetscSFDestroy(&bgcount));
1140   }
1141   *incoming = sf->ingroup;
1142 
1143   if (sf->outgroup == MPI_GROUP_NULL) {
1144     PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
1145     PetscCallMPI(MPI_Group_incl(group, sf->nranks, sf->ranks, &sf->outgroup));
1146     PetscCallMPI(MPI_Group_free(&group));
1147   }
1148   *outgoing = sf->outgroup;
1149   PetscFunctionReturn(0);
1150 }
1151 
1152 /*@
1153    PetscSFGetMultiSF - gets the inner `PetscSF` implementing gathers and scatters
1154 
1155    Collective
1156 
1157    Input Parameter:
1158 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1159 
1160    Output Parameter:
1161 .  multi - star forest with split roots, such that each root has degree exactly 1
1162 
1163    Level: developer
1164 
1165    Note:
1166    In most cases, users should use `PetscSFGatherBegin()` and `PetscSFScatterBegin()` instead of manipulating multi
1167    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1168    edge, it is a candidate for future optimization that might involve its removal.
1169 
1170 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`, `PetscSFComputeMultiRootOriginalNumbering()`
1171 @*/
1172 PetscErrorCode PetscSFGetMultiSF(PetscSF sf, PetscSF *multi)
1173 {
1174   PetscFunctionBegin;
1175   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1176   PetscValidPointer(multi, 2);
1177   if (sf->nroots < 0) { /* Graph has not been set yet; why do we need this? */
1178     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
1179     *multi           = sf->multi;
1180     sf->multi->multi = sf->multi;
1181     PetscFunctionReturn(0);
1182   }
1183   if (!sf->multi) {
1184     const PetscInt *indegree;
1185     PetscInt        i, *inoffset, *outones, *outoffset, maxlocal;
1186     PetscSFNode    *remote;
1187     maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
1188     PetscCall(PetscSFComputeDegreeBegin(sf, &indegree));
1189     PetscCall(PetscSFComputeDegreeEnd(sf, &indegree));
1190     PetscCall(PetscMalloc3(sf->nroots + 1, &inoffset, maxlocal, &outones, maxlocal, &outoffset));
1191     inoffset[0] = 0;
1192     for (i = 0; i < sf->nroots; i++) inoffset[i + 1] = inoffset[i] + indegree[i];
1193     for (i = 0; i < maxlocal; i++) outones[i] = 1;
1194     PetscCall(PetscSFFetchAndOpBegin(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
1195     PetscCall(PetscSFFetchAndOpEnd(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
1196     for (i = 0; i < sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
1197     if (PetscDefined(USE_DEBUG)) {                               /* Check that the expected number of increments occurred */
1198       for (i = 0; i < sf->nroots; i++) PetscCheck(inoffset[i] + indegree[i] == inoffset[i + 1], PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incorrect result after PetscSFFetchAndOp");
1199     }
1200     PetscCall(PetscMalloc1(sf->nleaves, &remote));
1201     for (i = 0; i < sf->nleaves; i++) {
1202       remote[i].rank  = sf->remote[i].rank;
1203       remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
1204     }
1205     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
1206     sf->multi->multi = sf->multi;
1207     PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
1208     if (sf->rankorder) { /* Sort the ranks */
1209       PetscMPIInt  rank;
1210       PetscInt    *inranks, *newoffset, *outranks, *newoutoffset, *tmpoffset, maxdegree;
1211       PetscSFNode *newremote;
1212       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
1213       for (i = 0, maxdegree = 0; i < sf->nroots; i++) maxdegree = PetscMax(maxdegree, indegree[i]);
1214       PetscCall(PetscMalloc5(sf->multi->nroots, &inranks, sf->multi->nroots, &newoffset, maxlocal, &outranks, maxlocal, &newoutoffset, maxdegree, &tmpoffset));
1215       for (i = 0; i < maxlocal; i++) outranks[i] = rank;
1216       PetscCall(PetscSFReduceBegin(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
1217       PetscCall(PetscSFReduceEnd(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
1218       /* Sort the incoming ranks at each vertex, build the inverse map */
1219       for (i = 0; i < sf->nroots; i++) {
1220         PetscInt j;
1221         for (j = 0; j < indegree[i]; j++) tmpoffset[j] = j;
1222         PetscCall(PetscSortIntWithArray(indegree[i], inranks + inoffset[i], tmpoffset));
1223         for (j = 0; j < indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
1224       }
1225       PetscCall(PetscSFBcastBegin(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
1226       PetscCall(PetscSFBcastEnd(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
1227       PetscCall(PetscMalloc1(sf->nleaves, &newremote));
1228       for (i = 0; i < sf->nleaves; i++) {
1229         newremote[i].rank  = sf->remote[i].rank;
1230         newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
1231       }
1232       PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, newremote, PETSC_OWN_POINTER));
1233       PetscCall(PetscFree5(inranks, newoffset, outranks, newoutoffset, tmpoffset));
1234     }
1235     PetscCall(PetscFree3(inoffset, outones, outoffset));
1236   }
1237   *multi = sf->multi;
1238   PetscFunctionReturn(0);
1239 }
1240 
1241 /*@C
1242    PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots, does not remap indices
1243 
1244    Collective
1245 
1246    Input Parameters:
1247 +  sf - original star forest
1248 .  nselected  - number of selected roots on this process
1249 -  selected   - indices of the selected roots on this process
1250 
1251    Output Parameter:
1252 .  esf - new star forest
1253 
1254    Level: advanced
1255 
1256    Note:
1257    To use the new `PetscSF`, it may be necessary to know the indices of the leaves that are still participating. This can
1258    be done by calling PetscSFGetGraph().
1259 
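   Example Usage:
   A sketch keeping only the edges that end at roots 0 and 2 of each process, assuming every process owns at least three roots; the
   selection is purely illustrative.
.vb
   PetscSF        esf;
   const PetscInt selected[] = {0, 2};

   PetscCall(PetscSFCreateEmbeddedRootSF(sf, 2, selected, &esf));
   /* ... use esf ... */
   PetscCall(PetscSFDestroy(&esf));
.ve
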
1260 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1261 @*/
1262 PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *esf)
1263 {
1264   PetscInt           i, j, n, nroots, nleaves, esf_nleaves, *new_ilocal, minleaf, maxleaf, maxlocal;
1265   const PetscInt    *ilocal;
1266   signed char       *rootdata, *leafdata, *leafmem;
1267   const PetscSFNode *iremote;
1268   PetscSFNode       *new_iremote;
1269   MPI_Comm           comm;
1270 
1271   PetscFunctionBegin;
1272   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1273   PetscSFCheckGraphSet(sf, 1);
1274   if (nselected) PetscValidIntPointer(selected, 3);
1275   PetscValidPointer(esf, 4);
1276 
1277   PetscCall(PetscSFSetUp(sf));
1278   PetscCall(PetscLogEventBegin(PETSCSF_EmbedSF, sf, 0, 0, 0));
1279   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1280   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
1281 
1282   if (PetscDefined(USE_DEBUG)) { /* Error out if selected[] has dups or  out of range indices */
1283     PetscBool dups;
1284     PetscCall(PetscCheckDupsInt(nselected, selected, &dups));
1285     PetscCheck(!dups, comm, PETSC_ERR_ARG_WRONG, "selected[] has dups");
1286     for (i = 0; i < nselected; i++) PetscCheck(selected[i] >= 0 && selected[i] < nroots, comm, PETSC_ERR_ARG_OUTOFRANGE, "selected root index %" PetscInt_FMT " is out of [0,%" PetscInt_FMT ")", selected[i], nroots);
1287   }
1288 
1289   if (sf->ops->CreateEmbeddedRootSF) PetscUseTypeMethod(sf, CreateEmbeddedRootSF, nselected, selected, esf);
1290   else {
1291     /* A generic version of creating embedded sf */
1292     PetscCall(PetscSFGetLeafRange(sf, &minleaf, &maxleaf));
1293     maxlocal = maxleaf - minleaf + 1;
1294     PetscCall(PetscCalloc2(nroots, &rootdata, maxlocal, &leafmem));
1295     leafdata = leafmem - minleaf;
1296     /* Tag selected roots and bcast to leaves */
1297     for (i = 0; i < nselected; i++) rootdata[selected[i]] = 1;
1298     PetscCall(PetscSFBcastBegin(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
1299     PetscCall(PetscSFBcastEnd(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
1300 
1301     /* Build esf with leaves that are still connected */
1302     esf_nleaves = 0;
1303     for (i = 0; i < nleaves; i++) {
1304       j = ilocal ? ilocal[i] : i;
1305       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1306          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1307       */
1308       esf_nleaves += (leafdata[j] ? 1 : 0);
1309     }
1310     PetscCall(PetscMalloc1(esf_nleaves, &new_ilocal));
1311     PetscCall(PetscMalloc1(esf_nleaves, &new_iremote));
1312     for (i = n = 0; i < nleaves; i++) {
1313       j = ilocal ? ilocal[i] : i;
1314       if (leafdata[j]) {
1315         new_ilocal[n]        = j;
1316         new_iremote[n].rank  = iremote[i].rank;
1317         new_iremote[n].index = iremote[i].index;
1318         ++n;
1319       }
1320     }
1321     PetscCall(PetscSFCreate(comm, esf));
1322     PetscCall(PetscSFSetFromOptions(*esf));
1323     PetscCall(PetscSFSetGraph(*esf, nroots, esf_nleaves, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1324     PetscCall(PetscFree2(rootdata, leafmem));
1325   }
1326   PetscCall(PetscLogEventEnd(PETSCSF_EmbedSF, sf, 0, 0, 0));
1327   PetscFunctionReturn(0);
1328 }
1329 
1330 /*@C
1331   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1332 
1333   Collective
1334 
1335   Input Parameters:
1336 + sf - original star forest
1337 . nselected  - number of selected leaves on this process
1338 - selected   - indices of the selected leaves on this process
1339 
1340   Output Parameter:
1341 .  newsf - new star forest
1342 
1343   Level: advanced
1344 
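  Example Usage:
  A minimal sketch, assuming nselected and selected[] have been filled by the caller:
.vb
  PetscSF lsf;

  PetscCall(PetscSFCreateEmbeddedLeafSF(sf, nselected, selected, &lsf));
  /* ... communicate on lsf, which keeps only the edges ending at the selected leaves ... */
  PetscCall(PetscSFDestroy(&lsf));
.ve
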
1345 .seealso: `PetscSF`, `PetscSFCreateEmbeddedRootSF()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1346 @*/
1347 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *newsf)
1348 {
1349   const PetscSFNode *iremote;
1350   PetscSFNode       *new_iremote;
1351   const PetscInt    *ilocal;
1352   PetscInt           i, nroots, *leaves, *new_ilocal;
1353   MPI_Comm           comm;
1354 
1355   PetscFunctionBegin;
1356   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1357   PetscSFCheckGraphSet(sf, 1);
1358   if (nselected) PetscValidIntPointer(selected, 3);
1359   PetscValidPointer(newsf, 4);
1360 
1361   /* Sort selected[], remove duplicates, and put the result in leaves[] */
1362   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1363   PetscCall(PetscMalloc1(nselected, &leaves));
1364   PetscCall(PetscArraycpy(leaves, selected, nselected));
1365   PetscCall(PetscSortRemoveDupsInt(&nselected, leaves));
1366   PetscCheck(!nselected || !(leaves[0] < 0 || leaves[nselected - 1] >= sf->nleaves), comm, PETSC_ERR_ARG_OUTOFRANGE, "Min/Max leaf indices %" PetscInt_FMT "/%" PetscInt_FMT " are not in [0,%" PetscInt_FMT ")", leaves[0], leaves[nselected - 1], sf->nleaves);
1367 
1368   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1369   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) PetscUseTypeMethod(sf, CreateEmbeddedLeafSF, nselected, leaves, newsf);
1370   else {
1371     PetscCall(PetscSFGetGraph(sf, &nroots, NULL, &ilocal, &iremote));
1372     PetscCall(PetscMalloc1(nselected, &new_ilocal));
1373     PetscCall(PetscMalloc1(nselected, &new_iremote));
1374     for (i = 0; i < nselected; ++i) {
1375       const PetscInt l     = leaves[i];
1376       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1377       new_iremote[i].rank  = iremote[l].rank;
1378       new_iremote[i].index = iremote[l].index;
1379     }
1380     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, newsf));
1381     PetscCall(PetscSFSetGraph(*newsf, nroots, nselected, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1382   }
1383   PetscCall(PetscFree(leaves));
1384   PetscFunctionReturn(0);
1385 }
1386 
1387 /*@C
1388    PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with a call to `PetscSFBcastEnd()`
1389 
1390    Collective
1391 
1392    Input Parameters:
1393 +  sf - star forest on which to communicate
1394 .  unit - data type associated with each node
1395 .  rootdata - buffer to broadcast
1396 -  op - operation to use for reduction
1397 
1398    Output Parameter:
1399 .  leafdata - buffer to be reduced with values from each leaf's respective root
1400 
1401    Level: intermediate
1402 
1403    Notes:
1404     When PETSc is configured with device support, this routine will determine on its own whether the given data pointers
1405     are host pointers or device pointers, which may incur a noticeable cost. If you already know the memory types, you should
1406     use `PetscSFBcastWithMemTypeBegin()` instead.
1407 
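   Example Usage:
   A minimal sketch of a broadcast from roots to leaves, assuming rootdata and leafdata are caller-allocated PetscScalar arrays sized for the root and leaf spaces:
.vb
   PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));
   /* unrelated local work may be overlapped with the communication here */
   PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));
.ve
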
1408 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastWithMemTypeBegin()`
1409 @*/
1410 PetscErrorCode PetscSFBcastBegin(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
1411 {
1412   PetscMemType rootmtype, leafmtype;
1413 
1414   PetscFunctionBegin;
1415   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1416   PetscCall(PetscSFSetUp(sf));
1417   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
1418   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1419   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1420   PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
1421   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
1422   PetscFunctionReturn(0);
1423 }
1424 
1425 /*@C
1426    PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with a call to `PetscSFBcastEnd()`
1427 
1428    Collective
1429 
1430    Input Parameters:
1431 +  sf - star forest on which to communicate
1432 .  unit - data type associated with each node
1433 .  rootmtype - memory type of rootdata
1434 .  rootdata - buffer to broadcast
1435 .  leafmtype - memory type of leafdata
1436 -  op - operation to use for reduction
1437 
1438    Output Parameter:
1439 .  leafdata - buffer to be reduced with values from each leaf's respective root
1440 
1441    Level: intermediate
1442 
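   Example Usage:
   A sketch assuming rootdata is a host buffer and leafdata is a device buffer, both allocated by the caller:
.vb
   PetscCall(PetscSFBcastWithMemTypeBegin(sf, MPIU_SCALAR, PETSC_MEMTYPE_HOST, rootdata, PETSC_MEMTYPE_DEVICE, leafdata, MPI_REPLACE));
   PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));
.ve
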
1443 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastBegin()`
1444 @*/
1445 PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
1446 {
1447   PetscFunctionBegin;
1448   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1449   PetscCall(PetscSFSetUp(sf));
1450   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
1451   PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
1452   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
1453   PetscFunctionReturn(0);
1454 }
1455 
1456 /*@C
1457    PetscSFBcastEnd - end a broadcast & reduce operation started with `PetscSFBcastBegin()`
1458 
1459    Collective
1460 
1461    Input Parameters:
1462 +  sf - star forest
1463 .  unit - data type
1464 .  rootdata - buffer to broadcast
1465 -  op - operation to use for reduction
1466 
1467    Output Parameter:
1468 .  leafdata - buffer to be reduced with values from each leaf's respective root
1469 
1470    Level: intermediate
1471 
1472 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFReduceEnd()`
1473 @*/
1474 PetscErrorCode PetscSFBcastEnd(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
1475 {
1476   PetscFunctionBegin;
1477   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1478   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastEnd, sf, 0, 0, 0));
1479   PetscUseTypeMethod(sf, BcastEnd, unit, rootdata, leafdata, op);
1480   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastEnd, sf, 0, 0, 0));
1481   PetscFunctionReturn(0);
1482 }
1483 
1484 /*@C
1485    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with a call to `PetscSFReduceEnd()`
1486 
1487    Collective
1488 
1489    Input Parameters:
1490 +  sf - star forest
1491 .  unit - data type
1492 .  leafdata - values to reduce
1493 -  op - reduction operation
1494 
1495    Output Parameter:
1496 .  rootdata - result of reduction of values from all leaves of each root
1497 
1498    Level: intermediate
1499 
1500    Notes:
1501     When PETSc is configured with device support, this routine will determine on its own whether the given data pointers
1502     are host pointers or device pointers, which may incur a noticeable cost. If you already know the memory types, you should
1503     use `PetscSFReduceWithMemTypeBegin()` instead.
1504 
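   Example Usage:
   A minimal sketch summing leaf contributions into their roots, assuming caller-allocated PetscScalar buffers:
.vb
   PetscCall(PetscSFReduceBegin(sf, MPIU_SCALAR, leafdata, rootdata, MPIU_SUM));
   PetscCall(PetscSFReduceEnd(sf, MPIU_SCALAR, leafdata, rootdata, MPIU_SUM));
.ve
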
1505 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceWithMemTypeBegin()`
1506 @*/
1507 PetscErrorCode PetscSFReduceBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1508 {
1509   PetscMemType rootmtype, leafmtype;
1510 
1511   PetscFunctionBegin;
1512   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1513   PetscCall(PetscSFSetUp(sf));
1514   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1515   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1516   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1517   PetscCall((sf->ops->ReduceBegin)(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1518   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1519   PetscFunctionReturn(0);
1520 }
1521 
1522 /*@C
1523    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with a call to `PetscSFReduceEnd()`
1524 
1525    Collective
1526 
1527    Input Parameters:
1528 +  sf - star forest
1529 .  unit - data type
1530 .  leafmtype - memory type of leafdata
1531 .  leafdata - values to reduce
1532 .  rootmtype - memory type of rootdata
1533 -  op - reduction operation
1534 
1535    Output Parameter:
1536 .  rootdata - result of reduction of values from all leaves of each root
1537 
1538    Level: intermediate
1539 
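   Example Usage:
   A sketch assuming leafdata is a device buffer and rootdata is a host buffer, both allocated by the caller:
.vb
   PetscCall(PetscSFReduceWithMemTypeBegin(sf, MPIU_SCALAR, PETSC_MEMTYPE_DEVICE, leafdata, PETSC_MEMTYPE_HOST, rootdata, MPIU_SUM));
   PetscCall(PetscSFReduceEnd(sf, MPIU_SCALAR, leafdata, rootdata, MPIU_SUM));
.ve
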
1540 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceBegin()`
1541 @*/
1542 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
1543 {
1544   PetscFunctionBegin;
1545   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1546   PetscCall(PetscSFSetUp(sf));
1547   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1548   PetscCall((sf->ops->ReduceBegin)(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1549   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1550   PetscFunctionReturn(0);
1551 }
1552 
1553 /*@C
1554    PetscSFReduceEnd - end a reduction operation started with `PetscSFReduceBegin()`
1555 
1556    Collective
1557 
1558    Input Parameters:
1559 +  sf - star forest
1560 .  unit - data type
1561 .  leafdata - values to reduce
1562 -  op - reduction operation
1563 
1564    Output Parameter:
1565 .  rootdata - result of reduction of values from all leaves of each root
1566 
1567    Level: intermediate
1568 
1569 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFBcastEnd()`
1570 @*/
1571 PetscErrorCode PetscSFReduceEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1572 {
1573   PetscFunctionBegin;
1574   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1575   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceEnd, sf, 0, 0, 0));
1576   PetscUseTypeMethod(sf, ReduceEnd, unit, leafdata, rootdata, op);
1577   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceEnd, sf, 0, 0, 0));
1578   PetscFunctionReturn(0);
1579 }
1580 
1581 /*@C
1582    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value,
1583    to be completed with `PetscSFFetchAndOpEnd()`
1584 
1585    Collective
1586 
1587    Input Parameters:
1588 +  sf - star forest
1589 .  unit - data type
1590 .  leafdata - leaf values to use in reduction
1591 -  op - operation to use for reduction
1592 
1593    Output Parameters:
1594 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1595 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1596 
1597    Level: advanced
1598 
1599    Note:
1600    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1601    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1602    not guaranteed across the whole vertex. Therefore, this function is mostly used with primitive types such as
1603    integers.
1604 
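   Example Usage:
   A sketch using integer addition, so each leaf fetches the value of its root before this process's update (a common way to hand out unique offsets); rootdata, leafdata, and leafupdate are caller-allocated PetscInt arrays:
.vb
   PetscCall(PetscSFFetchAndOpBegin(sf, MPIU_INT, rootdata, leafdata, leafupdate, MPI_SUM));
   PetscCall(PetscSFFetchAndOpEnd(sf, MPIU_INT, rootdata, leafdata, leafupdate, MPI_SUM));
   /* leafupdate[] now holds the root values seen immediately before this process's atomic updates */
.ve
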
1605 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`
1606 @*/
1607 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1608 {
1609   PetscMemType rootmtype, leafmtype, leafupdatemtype;
1610 
1611   PetscFunctionBegin;
1612   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1613   PetscCall(PetscSFSetUp(sf));
1614   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1615   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1616   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1617   PetscCall(PetscGetMemType(leafupdate, &leafupdatemtype));
1618   PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
1619   PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
1620   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1621   PetscFunctionReturn(0);
1622 }
1623 
1624 /*@C
1625    PetscSFFetchAndOpWithMemTypeBegin - begin operation with explicit memory types that fetches values from root and updates atomically by
1626    applying operation using my leaf value, to be completed with `PetscSFFetchAndOpEnd()`
1627 
1628    Collective
1629 
1630    Input Parameters:
1631 +  sf - star forest
1632 .  unit - data type
1633 .  rootmtype - memory type of rootdata
1634 .  leafmtype - memory type of leafdata
1635 .  leafdata - leaf values to use in reduction
1636 .  leafupdatemtype - memory type of leafupdate
1637 -  op - operation to use for reduction
1638 
1639    Output Parameters:
1640 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1641 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1642 
1643    Level: advanced
1644 
1645    Note:
1646    See `PetscSFFetchAndOpBegin()` for more details.
1647 
1648 .seealso: `PetscSF`, `PetscSFFetchAndOpBegin()`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`
1649 @*/
1650 PetscErrorCode PetscSFFetchAndOpWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, PetscMemType leafupdatemtype, void *leafupdate, MPI_Op op)
1651 {
1652   PetscFunctionBegin;
1653   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1654   PetscCall(PetscSFSetUp(sf));
1655   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1656   PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
1657   PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
1658   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1659   PetscFunctionReturn(0);
1660 }
1661 
1662 /*@C
1663    PetscSFFetchAndOpEnd - end operation started in a matching call to `PetscSFFetchAndOpBegin()` to fetch values from roots and update them atomically by applying the operation using my leaf value
1664 
1665    Collective
1666 
1667    Input Parameters:
1668 +  sf - star forest
1669 .  unit - data type
1670 .  leafdata - leaf values to use in reduction
1671 -  op - operation to use for reduction
1672 
1673    Output Parameters:
1674 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1675 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1676 
1677    Level: advanced
1678 
1679 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFReduceEnd()`, `PetscSFSetGraph()`
1680 @*/
1681 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1682 {
1683   PetscFunctionBegin;
1684   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1685   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
1686   PetscUseTypeMethod(sf, FetchAndOpEnd, unit, rootdata, leafdata, leafupdate, op);
1687   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
1688   PetscFunctionReturn(0);
1689 }
1690 
1691 /*@C
1692    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with `PetscSFComputeDegreeEnd()`
1693 
1694    Collective
1695 
1696    Input Parameter:
1697 .  sf - star forest
1698 
1699    Output Parameter:
1700 .  degree - degree of each root vertex
1701 
1702    Level: advanced
1703 
1704    Note:
1705    The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence no need to call `PetscFree()` on it.
1706 
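   Example Usage:
   A minimal sketch; the degree array is owned by sf and must not be freed by the caller:
.vb
   const PetscInt *degree;

   PetscCall(PetscSFComputeDegreeBegin(sf, &degree));
   PetscCall(PetscSFComputeDegreeEnd(sf, &degree));
   /* degree[i] is now the number of leaves connected to root i */
.ve
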
1707 .seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeEnd()`
1708 @*/
1709 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf, const PetscInt **degree)
1710 {
1711   PetscFunctionBegin;
1712   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1713   PetscSFCheckGraphSet(sf, 1);
1714   PetscValidPointer(degree, 2);
1715   if (!sf->degreeknown) {
1716     PetscInt i, nroots = sf->nroots, maxlocal;
1717     PetscCheck(!sf->degree, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1718     maxlocal = sf->maxleaf - sf->minleaf + 1;
1719     PetscCall(PetscMalloc1(nroots, &sf->degree));
1720     PetscCall(PetscMalloc1(PetscMax(maxlocal, 1), &sf->degreetmp)); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1721     for (i = 0; i < nroots; i++) sf->degree[i] = 0;
1722     for (i = 0; i < maxlocal; i++) sf->degreetmp[i] = 1;
1723     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
1724   }
1725   *degree = NULL;
1726   PetscFunctionReturn(0);
1727 }
1728 
1729 /*@C
1730    PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with `PetscSFComputeDegreeBegin()`
1731 
1732    Collective
1733 
1734    Input Parameter:
1735 .  sf - star forest
1736 
1737    Output Parameter:
1738 .  degree - degree of each root vertex
1739 
1740    Level: developer
1741 
1742    Note:
1743    The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence no need to call `PetscFree()` on it.
1744 
1745 .seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeBegin()`
1746 @*/
1747 PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf, const PetscInt **degree)
1748 {
1749   PetscFunctionBegin;
1750   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1751   PetscSFCheckGraphSet(sf, 1);
1752   PetscValidPointer(degree, 2);
1753   if (!sf->degreeknown) {
1754     PetscCheck(sf->degreetmp, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
1755     PetscCall(PetscSFReduceEnd(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
1756     PetscCall(PetscFree(sf->degreetmp));
1757     sf->degreeknown = PETSC_TRUE;
1758   }
1759   *degree = sf->degree;
1760   PetscFunctionReturn(0);
1761 }
1762 
1763 /*@C
1764    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by `PetscSFGetMultiSF()`).
1765    Each multi-root is assigned the index of the corresponding original root.
1766 
1767    Collective
1768 
1769    Input Parameters:
1770 +  sf - star forest
1771 -  degree - degree of each root vertex, computed with `PetscSFComputeDegreeBegin()`/`PetscSFComputeDegreeEnd()`
1772 
1773    Output Parameters:
1774 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1775 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1776 
1777    Level: developer
1778 
1779    Note:
1780    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with `PetscFree()` when no longer needed.
1781 
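   Example Usage:
   A sketch chaining this routine with the degree computation; the returned array must be freed by the caller:
.vb
   const PetscInt *degree;
   PetscInt        nmroots, *morig;

   PetscCall(PetscSFComputeDegreeBegin(sf, &degree));
   PetscCall(PetscSFComputeDegreeEnd(sf, &degree));
   PetscCall(PetscSFComputeMultiRootOriginalNumbering(sf, degree, &nmroots, &morig));
   /* morig[m] is the original root index of multi-root m */
   PetscCall(PetscFree(morig));
.ve
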
1782 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFComputeDegreeEnd()`, `PetscSFGetMultiSF()`
1783 @*/
1784 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1785 {
1786   PetscSF  msf;
1787   PetscInt i, j, k, nroots, nmroots;
1788 
1789   PetscFunctionBegin;
1790   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1791   PetscCall(PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL));
1792   if (nroots) PetscValidIntPointer(degree, 2);
1793   if (nMultiRoots) PetscValidIntPointer(nMultiRoots, 3);
1794   PetscValidPointer(multiRootsOrigNumbering, 4);
1795   PetscCall(PetscSFGetMultiSF(sf, &msf));
1796   PetscCall(PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL));
1797   PetscCall(PetscMalloc1(nmroots, multiRootsOrigNumbering));
1798   for (i = 0, j = 0, k = 0; i < nroots; i++) {
1799     if (!degree[i]) continue;
1800     for (j = 0; j < degree[i]; j++, k++) (*multiRootsOrigNumbering)[k] = i;
1801   }
1802   PetscCheck(k == nmroots, PETSC_COMM_SELF, PETSC_ERR_PLIB, "sanity check fail");
1803   if (nMultiRoots) *nMultiRoots = nmroots;
1804   PetscFunctionReturn(0);
1805 }
1806 
1807 /*@C
1808    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with `PetscSFGatherEnd()`
1809 
1810    Collective
1811 
1812    Input Parameters:
1813 +  sf - star forest
1814 .  unit - data type
1815 -  leafdata - leaf data to gather to roots
1816 
1817    Output Parameter:
1818 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1819 
1820    Level: intermediate
1821 
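   Example Usage:
   A minimal sketch, assuming leafdata holds one unit per leaf and multirootdata is a caller-allocated buffer whose total size equals the sum of the root degrees:
.vb
   PetscCall(PetscSFGatherBegin(sf, MPIU_SCALAR, leafdata, multirootdata));
   PetscCall(PetscSFGatherEnd(sf, MPIU_SCALAR, leafdata, multirootdata));
.ve
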
1822 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterBegin()`
1823 @*/
1824 PetscErrorCode PetscSFGatherBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1825 {
1826   PetscSF multi = NULL;
1827 
1828   PetscFunctionBegin;
1829   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1830   PetscCall(PetscSFSetUp(sf));
1831   PetscCall(PetscSFGetMultiSF(sf, &multi));
1832   PetscCall(PetscSFReduceBegin(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1833   PetscFunctionReturn(0);
1834 }
1835 
1836 /*@C
1837    PetscSFGatherEnd - ends pointwise gather operation that was started with `PetscSFGatherBegin()`
1838 
1839    Collective
1840 
1841    Input Parameters:
1842 +  sf - star forest
1843 .  unit - data type
1844 -  leafdata - leaf data to gather to roots
1845 
1846    Output Parameter:
1847 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1848 
1849    Level: intermediate
1850 
1851 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterEnd()`
1852 @*/
1853 PetscErrorCode PetscSFGatherEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1854 {
1855   PetscSF multi = NULL;
1856 
1857   PetscFunctionBegin;
1858   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1859   PetscCall(PetscSFGetMultiSF(sf, &multi));
1860   PetscCall(PetscSFReduceEnd(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1861   PetscFunctionReturn(0);
1862 }
1863 
1864 /*@C
1865    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with `PetscSFScatterEnd()`
1866 
1867    Collective
1868 
1869    Input Parameters:
1870 +  sf - star forest
1871 .  unit - data type
1872 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1873 
1874    Output Parameter:
1875 .  leafdata - leaf data to be updated with personal data from each respective root
1876 
1877    Level: intermediate
1878 
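   Example Usage:
   A minimal sketch, assuming multirootdata holds one unit per multi-root slot (see `PetscSFGetMultiSF()`) and leafdata is a caller-allocated buffer with one unit per leaf:
.vb
   PetscCall(PetscSFScatterBegin(sf, MPIU_SCALAR, multirootdata, leafdata));
   PetscCall(PetscSFScatterEnd(sf, MPIU_SCALAR, multirootdata, leafdata));
.ve
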
1879 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterEnd()`
1880 @*/
1881 PetscErrorCode PetscSFScatterBegin(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1882 {
1883   PetscSF multi = NULL;
1884 
1885   PetscFunctionBegin;
1886   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1887   PetscCall(PetscSFSetUp(sf));
1888   PetscCall(PetscSFGetMultiSF(sf, &multi));
1889   PetscCall(PetscSFBcastBegin(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1890   PetscFunctionReturn(0);
1891 }
1892 
1893 /*@C
1894    PetscSFScatterEnd - ends pointwise scatter operation that was started with `PetscSFScatterBegin()`
1895 
1896    Collective
1897 
1898    Input Parameters:
1899 +  sf - star forest
1900 .  unit - data type
1901 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1902 
1903    Output Parameter:
1904 .  leafdata - leaf data to be updated with personal data from each respective root
1905 
1906    Level: intermediate
1907 
1908 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterBegin()`
1909 @*/
1910 PetscErrorCode PetscSFScatterEnd(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1911 {
1912   PetscSF multi = NULL;
1913 
1914   PetscFunctionBegin;
1915   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1916   PetscCall(PetscSFGetMultiSF(sf, &multi));
1917   PetscCall(PetscSFBcastEnd(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1918   PetscFunctionReturn(0);
1919 }
1920 
1921 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1922 {
1923   PetscInt        i, n, nleaves;
1924   const PetscInt *ilocal = NULL;
1925   PetscHSetI      seen;
1926 
1927   PetscFunctionBegin;
1928   if (PetscDefined(USE_DEBUG)) {
1929     PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, NULL));
1930     PetscCall(PetscHSetICreate(&seen));
1931     for (i = 0; i < nleaves; i++) {
1932       const PetscInt leaf = ilocal ? ilocal[i] : i;
1933       PetscCall(PetscHSetIAdd(seen, leaf));
1934     }
1935     PetscCall(PetscHSetIGetSize(seen, &n));
1936     PetscCheck(n == nleaves, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided leaves have repeated values: all leaves must be unique");
1937     PetscCall(PetscHSetIDestroy(&seen));
1938   }
1939   PetscFunctionReturn(0);
1940 }
1941 
1942 /*@
1943   PetscSFCompose - Compose a new `PetscSF` by putting the second `PetscSF` under the first one in a top (roots) down (leaves) view
1944 
1945   Input Parameters:
1946 + sfA - The first `PetscSF`
1947 - sfB - The second `PetscSF`
1948 
1949   Output Parameter:
1950 . sfBA - The composite `PetscSF`
1951 
1952   Level: developer
1953 
1954   Notes:
1955   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
1956   forests, i.e. the same leaf is not connected with different roots.
1957 
1958   sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
1959   a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
1960   nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
1961   Bcast on sfA, then a Bcast on sfB, on connected nodes.
1962 
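  Example Usage:
  A minimal sketch, assuming sfA and sfB have their graphs set on congruent communicators:
.vb
  PetscSF sfBA;

  PetscCall(PetscSFCompose(sfA, sfB, &sfBA));
  /* a Bcast on sfBA is equivalent to a Bcast on sfA followed by a Bcast on sfB, on connected nodes */
  PetscCall(PetscSFDestroy(&sfBA));
.ve
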
1963 .seealso: `PetscSF`, `PetscSFComposeInverse()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`
1964 @*/
1965 PetscErrorCode PetscSFCompose(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
1966 {
1967   const PetscSFNode *remotePointsA, *remotePointsB;
1968   PetscSFNode       *remotePointsBA = NULL, *reorderedRemotePointsA = NULL, *leafdataB;
1969   const PetscInt    *localPointsA, *localPointsB;
1970   PetscInt          *localPointsBA;
1971   PetscInt           i, numRootsA, numLeavesA, numRootsB, numLeavesB, minleaf, maxleaf, numLeavesBA;
1972   PetscBool          denseB;
1973 
1974   PetscFunctionBegin;
1975   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
1976   PetscSFCheckGraphSet(sfA, 1);
1977   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
1978   PetscSFCheckGraphSet(sfB, 2);
1979   PetscCheckSameComm(sfA, 1, sfB, 2);
1980   PetscValidPointer(sfBA, 3);
1981   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
1982   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
1983 
1984   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
1985   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
1986   /* Make sure that PetscSFBcast{Begin, End}(sfB, ...) works with root data of size       */
1987   /* numRootsB; otherwise, garbage will be broadcasted.                                   */
1988   /* Example (comm size = 1):                                                             */
1989   /* sfA: 0 <- (0, 0)                                                                     */
1990   /* sfB: 100 <- (0, 0)                                                                   */
1991   /*      101 <- (0, 1)                                                                   */
1992   /* Here, we have remotePointsA = [(0, 0)], but for remotePointsA to be a valid target    */
1993   /* of sfB, it has to be recast as [(0, 0), (-1, -1)] so that points 100 and 101 would    */
1994   /* receive (0, 0) and (-1, -1), respectively, when PetscSFBcast(sfB, ...) is called on  */
1995   /* remotePointsA; if not recast, point 101 would receive a garbage value.               */
1996   PetscCall(PetscMalloc1(numRootsB, &reorderedRemotePointsA));
1997   for (i = 0; i < numRootsB; i++) {
1998     reorderedRemotePointsA[i].rank  = -1;
1999     reorderedRemotePointsA[i].index = -1;
2000   }
2001   for (i = 0; i < numLeavesA; i++) {
2002     PetscInt localp = localPointsA ? localPointsA[i] : i;
2003 
2004     if (localp >= numRootsB) continue;
2005     reorderedRemotePointsA[localp] = remotePointsA[i];
2006   }
2007   remotePointsA = reorderedRemotePointsA;
2008   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2009   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &leafdataB));
2010   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2011     leafdataB[i].rank  = -1;
2012     leafdataB[i].index = -1;
2013   }
2014   PetscCall(PetscSFBcastBegin(sfB, MPIU_2INT, remotePointsA, leafdataB - minleaf, MPI_REPLACE));
2015   PetscCall(PetscSFBcastEnd(sfB, MPIU_2INT, remotePointsA, leafdataB - minleaf, MPI_REPLACE));
2016   PetscCall(PetscFree(reorderedRemotePointsA));
2017 
2018   denseB = (PetscBool)!localPointsB;
2019   for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2020     if (leafdataB[localPointsB ? localPointsB[i] - minleaf : i].rank == -1) denseB = PETSC_FALSE;
2021     else numLeavesBA++;
2022   }
2023   if (denseB) {
2024     localPointsBA  = NULL;
2025     remotePointsBA = leafdataB;
2026   } else {
2027     PetscCall(PetscMalloc1(numLeavesBA, &localPointsBA));
2028     PetscCall(PetscMalloc1(numLeavesBA, &remotePointsBA));
2029     for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2030       const PetscInt l = localPointsB ? localPointsB[i] : i;
2031 
2032       if (leafdataB[l - minleaf].rank == -1) continue;
2033       remotePointsBA[numLeavesBA] = leafdataB[l - minleaf];
2034       localPointsBA[numLeavesBA]  = l;
2035       numLeavesBA++;
2036     }
2037     PetscCall(PetscFree(leafdataB));
2038   }
2039   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2040   PetscCall(PetscSFSetFromOptions(*sfBA));
2041   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2042   PetscFunctionReturn(0);
2043 }
2044 
2045 /*@
2046   PetscSFComposeInverse - Compose a new `PetscSF` by putting the inverse of the second `PetscSF` under the first one
2047 
2048   Input Parameters:
2049 + sfA - The first `PetscSF`
2050 - sfB - The second `PetscSF`
2051 
2052   Output Parameter:
2053 . sfBA - The composite `PetscSF`.
2054 
2055   Level: developer
2056 
2057   Notes:
2058   Currently, the two SFs must be defined on congruent communicators and they must be true star
2059   forests, i.e. the same leaf is not connected with different roots. Moreover, all roots of the
2060   second SF must have a degree of 1, i.e., no roots have more than one leaf connected.
2061 
2062   sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
2063   a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
2064   roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
2065   a Reduce on sfB, on connected roots.
2066 
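  Example Usage:
  A minimal sketch, assuming sfA and sfB satisfy the conditions stated in the Notes above:
.vb
  PetscSF sfBA;

  PetscCall(PetscSFComposeInverse(sfA, sfB, &sfBA));
  /* a Bcast on sfBA is equivalent to a Bcast on sfA followed by a Reduce on sfB, on connected roots */
  PetscCall(PetscSFDestroy(&sfBA));
.ve
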
2067 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFCreateInverseSF()`
2068 @*/
2069 PetscErrorCode PetscSFComposeInverse(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
2070 {
2071   const PetscSFNode *remotePointsA, *remotePointsB;
2072   PetscSFNode       *remotePointsBA;
2073   const PetscInt    *localPointsA, *localPointsB;
2074   PetscSFNode       *reorderedRemotePointsA = NULL;
2075   PetscInt           i, numRootsA, numLeavesA, numLeavesBA, numRootsB, numLeavesB, minleaf, maxleaf, *localPointsBA;
2076   MPI_Op             op;
2077 #if defined(PETSC_USE_64BIT_INDICES)
2078   PetscBool iswin;
2079 #endif
2080 
2081   PetscFunctionBegin;
2082   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
2083   PetscSFCheckGraphSet(sfA, 1);
2084   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
2085   PetscSFCheckGraphSet(sfB, 2);
2086   PetscCheckSameComm(sfA, 1, sfB, 2);
2087   PetscValidPointer(sfBA, 3);
2088   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
2089   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
2090 
2091   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
2092   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
2093 
2094   /* TODO: Check roots of sfB have degree of 1 */
2095   /* Once we implement it, we can replace the MPI_MAXLOC
2096      with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
2097      We use MPI_MAXLOC only to have a deterministic output from this routine if
2098      the root condition is not meet.
2099      the root condition is not met.
2100   op = MPI_MAXLOC;
2101 #if defined(PETSC_USE_64BIT_INDICES)
2102   /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
2103   PetscCall(PetscObjectTypeCompare((PetscObject)sfB, PETSCSFWINDOW, &iswin));
2104   if (iswin) op = MPI_REPLACE;
2105 #endif
2106 
2107   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2108   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &reorderedRemotePointsA));
2109   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2110     reorderedRemotePointsA[i].rank  = -1;
2111     reorderedRemotePointsA[i].index = -1;
2112   }
2113   if (localPointsA) {
2114     for (i = 0; i < numLeavesA; i++) {
2115       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
2116       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
2117     }
2118   } else {
2119     for (i = 0; i < numLeavesA; i++) {
2120       if (i > maxleaf || i < minleaf) continue;
2121       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
2122     }
2123   }
2124 
2125   PetscCall(PetscMalloc1(numRootsB, &localPointsBA));
2126   PetscCall(PetscMalloc1(numRootsB, &remotePointsBA));
2127   for (i = 0; i < numRootsB; i++) {
2128     remotePointsBA[i].rank  = -1;
2129     remotePointsBA[i].index = -1;
2130   }
2131 
2132   PetscCall(PetscSFReduceBegin(sfB, MPIU_2INT, reorderedRemotePointsA - minleaf, remotePointsBA, op));
2133   PetscCall(PetscSFReduceEnd(sfB, MPIU_2INT, reorderedRemotePointsA - minleaf, remotePointsBA, op));
2134   PetscCall(PetscFree(reorderedRemotePointsA));
2135   for (i = 0, numLeavesBA = 0; i < numRootsB; i++) {
2136     if (remotePointsBA[i].rank == -1) continue;
2137     remotePointsBA[numLeavesBA].rank  = remotePointsBA[i].rank;
2138     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
2139     localPointsBA[numLeavesBA]        = i;
2140     numLeavesBA++;
2141   }
2142   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2143   PetscCall(PetscSFSetFromOptions(*sfBA));
2144   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2145   PetscFunctionReturn(0);
2146 }
2147 
2148 /*
2149   PetscSFCreateLocalSF_Private - Creates a local `PetscSF` that only has intra-process edges of the global `PetscSF`
2150 
2151   Input Parameters:
2152 . sf - The global `PetscSF`
2153 
2154   Output Parameters:
2155 . out - The local `PetscSF`
2156 
2157 .seealso: `PetscSF`, `PetscSFCreate()`
2158  */
2159 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf, PetscSF *out)
2160 {
2161   MPI_Comm           comm;
2162   PetscMPIInt        myrank;
2163   const PetscInt    *ilocal;
2164   const PetscSFNode *iremote;
2165   PetscInt           i, j, nroots, nleaves, lnleaves, *lilocal;
2166   PetscSFNode       *liremote;
2167   PetscSF            lsf;
2168 
2169   PetscFunctionBegin;
2170   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2171   if (sf->ops->CreateLocalSF) PetscUseTypeMethod(sf, CreateLocalSF, out);
2172   else {
2173     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2174     PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
2175     PetscCallMPI(MPI_Comm_rank(comm, &myrank));
2176 
2177     /* Find out local edges and build a local SF */
2178     PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
2179     for (i = lnleaves = 0; i < nleaves; i++) {
2180       if (iremote[i].rank == (PetscInt)myrank) lnleaves++;
2181     }
2182     PetscCall(PetscMalloc1(lnleaves, &lilocal));
2183     PetscCall(PetscMalloc1(lnleaves, &liremote));
2184 
2185     for (i = j = 0; i < nleaves; i++) {
2186       if (iremote[i].rank == (PetscInt)myrank) {
2187         lilocal[j]        = ilocal ? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2188         liremote[j].rank  = 0;                      /* rank in PETSC_COMM_SELF */
2189         liremote[j].index = iremote[i].index;
2190         j++;
2191       }
2192     }
2193     PetscCall(PetscSFCreate(PETSC_COMM_SELF, &lsf));
2194     PetscCall(PetscSFSetFromOptions(lsf));
2195     PetscCall(PetscSFSetGraph(lsf, nroots, lnleaves, lilocal, PETSC_OWN_POINTER, liremote, PETSC_OWN_POINTER));
2196     PetscCall(PetscSFSetUp(lsf));
2197     *out = lsf;
2198   }
2199   PetscFunctionReturn(0);
2200 }
2201 
2202 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2203 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata)
2204 {
2205   PetscMemType rootmtype, leafmtype;
2206 
2207   PetscFunctionBegin;
2208   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2209   PetscCall(PetscSFSetUp(sf));
2210   PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
2211   PetscCall(PetscGetMemType(rootdata, &rootmtype));
2212   PetscCall(PetscGetMemType(leafdata, &leafmtype));
2213   PetscUseTypeMethod(sf, BcastToZero, unit, rootmtype, rootdata, leafmtype, leafdata);
2214   PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
2215   PetscFunctionReturn(0);
2216 }
2217 
2218 /*@
2219   PetscSFConcatenate - concatenate multiple `PetscSF` into one
2220 
2221   Input Parameters:
2222 + comm - the communicator
2223 . nsfs - the number of input `PetscSF`
2224 . sfs  - the array of input `PetscSF`
2225 . rootMode - the root mode specifying how roots are handled
2226 - leafOffsets - the array of local leaf offsets, one for each input `PetscSF`, or NULL for contiguous storage
2227 
2228   Output Parameter:
2229 . newsf - The resulting `PetscSF`
2230 
2231   Level: advanced
2232 
2233   Notes:
2234   The communicator of all SFs in sfs must be comm.
2235 
2236   Leaves are always concatenated locally, keeping them ordered by the input SF index and original local order.
2237   The offsets in leafOffsets are added to the original leaf indices.
2238   If all input SFs use contiguous leaf storage (ilocal = NULL), leafOffsets can be passed as NULL as well.
2239   In this case, NULL is also passed as ilocal to the resulting SF.
2240   If any input SF has non-null ilocal, leafOffsets is needed to distinguish leaves from different input SFs.
2241   In this case, the user is responsible for providing correct offsets so that the resulting leaves are unique (otherwise an error occurs).
2242 
2243   All root modes retain the essential connectivity condition:
2244   If two leaves of the same input SF are connected (sharing the same root), they are also connected in the output SF.
2245   Parameter rootMode controls how the input root spaces are combined.
2246   For `PETSCSF_CONCATENATE_ROOTMODE_SHARED`, the root space is considered the same for each input SF (checked in debug mode) and is also the same in the output SF.
2247   For `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, the input root spaces are taken as separate and joined.
2248   `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` joins the root spaces locally;
2249   roots of sfs[0], sfs[1], sfs[2], ... are joined on each rank separately, ordered by input SF and original local index, and renumbered contiguously.
2250   `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL` joins the root spaces globally;
2251   roots of sfs[0], sfs[1], sfs[2], ... are joined globally, ordered by input SF index and original global index, and renumbered contiguously;
2252   the original root ranks are ignored.
2253   For both `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`,
2254   the output SF's root layout is such that, on each rank, the local number of roots is the sum of the input SFs' local numbers of roots, preserving the load balance.
2255   However, for `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, that means roots can move to different ranks.
2256 
2257    Example:
2258    We can use src/vec/is/sf/tests/ex18.c to compare the root modes. By running
2259 $ make -C $PETSC_DIR/src/vec/is/sf/tests ex18
2260 $ for m in {local,global,shared}; do
2261 $   mpirun -n 2 $PETSC_DIR/src/vec/is/sf/tests/ex18 -nsfs 2 -n 2 -root_mode $m -sf_view
2262 $ done
2263    we generate two identical SFs sf_0 and sf_1,
2264 $ PetscSF Object: sf_0 2 MPI processes
2265 $   type: basic
2266 $   rank #leaves #roots
2267 $   [ 0]       4      2
2268 $   [ 1]       4      2
2269 $   leaves      roots       roots in global numbering
2270 $   ( 0,  0) <- ( 0,  0)  =   0
2271 $   ( 0,  1) <- ( 0,  1)  =   1
2272 $   ( 0,  2) <- ( 1,  0)  =   2
2273 $   ( 0,  3) <- ( 1,  1)  =   3
2274 $   ( 1,  0) <- ( 0,  0)  =   0
2275 $   ( 1,  1) <- ( 0,  1)  =   1
2276 $   ( 1,  2) <- ( 1,  0)  =   2
2277 $   ( 1,  3) <- ( 1,  1)  =   3
2278    and pass them to `PetscSFConcatenate()` along with different choices of rootMode, yielding different result_sf:
2279 $ rootMode = local:
2280 $ PetscSF Object: result_sf 2 MPI processes
2281 $   type: basic
2282 $   rank #leaves #roots
2283 $   [ 0]       8      4
2284 $   [ 1]       8      4
2285 $   leaves      roots       roots in global numbering
2286 $   ( 0,  0) <- ( 0,  0)  =   0
2287 $   ( 0,  1) <- ( 0,  1)  =   1
2288 $   ( 0,  2) <- ( 1,  0)  =   4
2289 $   ( 0,  3) <- ( 1,  1)  =   5
2290 $   ( 0,  4) <- ( 0,  2)  =   2
2291 $   ( 0,  5) <- ( 0,  3)  =   3
2292 $   ( 0,  6) <- ( 1,  2)  =   6
2293 $   ( 0,  7) <- ( 1,  3)  =   7
2294 $   ( 1,  0) <- ( 0,  0)  =   0
2295 $   ( 1,  1) <- ( 0,  1)  =   1
2296 $   ( 1,  2) <- ( 1,  0)  =   4
2297 $   ( 1,  3) <- ( 1,  1)  =   5
2298 $   ( 1,  4) <- ( 0,  2)  =   2
2299 $   ( 1,  5) <- ( 0,  3)  =   3
2300 $   ( 1,  6) <- ( 1,  2)  =   6
2301 $   ( 1,  7) <- ( 1,  3)  =   7
2302 $
2303 $ rootMode = global:
2304 $ PetscSF Object: result_sf 2 MPI processes
2305 $   type: basic
2306 $   rank #leaves #roots
2307 $   [ 0]       8      4
2308 $   [ 1]       8      4
2309 $   leaves      roots       roots in global numbering
2310 $   ( 0,  0) <- ( 0,  0)  =   0
2311 $   ( 0,  1) <- ( 0,  1)  =   1
2312 $   ( 0,  2) <- ( 0,  2)  =   2
2313 $   ( 0,  3) <- ( 0,  3)  =   3
2314 $   ( 0,  4) <- ( 1,  0)  =   4
2315 $   ( 0,  5) <- ( 1,  1)  =   5
2316 $   ( 0,  6) <- ( 1,  2)  =   6
2317 $   ( 0,  7) <- ( 1,  3)  =   7
2318 $   ( 1,  0) <- ( 0,  0)  =   0
2319 $   ( 1,  1) <- ( 0,  1)  =   1
2320 $   ( 1,  2) <- ( 0,  2)  =   2
2321 $   ( 1,  3) <- ( 0,  3)  =   3
2322 $   ( 1,  4) <- ( 1,  0)  =   4
2323 $   ( 1,  5) <- ( 1,  1)  =   5
2324 $   ( 1,  6) <- ( 1,  2)  =   6
2325 $   ( 1,  7) <- ( 1,  3)  =   7
2326 $
2327 $ rootMode = shared:
2328 $ PetscSF Object: result_sf 2 MPI processes
2329 $   type: basic
2330 $   rank #leaves #roots
2331 $   [ 0]       8      2
2332 $   [ 1]       8      2
2333 $   leaves      roots       roots in global numbering
2334 $   ( 0,  0) <- ( 0,  0)  =   0
2335 $   ( 0,  1) <- ( 0,  1)  =   1
2336 $   ( 0,  2) <- ( 1,  0)  =   2
2337 $   ( 0,  3) <- ( 1,  1)  =   3
2338 $   ( 0,  4) <- ( 0,  0)  =   0
2339 $   ( 0,  5) <- ( 0,  1)  =   1
2340 $   ( 0,  6) <- ( 1,  0)  =   2
2341 $   ( 0,  7) <- ( 1,  1)  =   3
2342 $   ( 1,  0) <- ( 0,  0)  =   0
2343 $   ( 1,  1) <- ( 0,  1)  =   1
2344 $   ( 1,  2) <- ( 1,  0)  =   2
2345 $   ( 1,  3) <- ( 1,  1)  =   3
2346 $   ( 1,  4) <- ( 0,  0)  =   0
2347 $   ( 1,  5) <- ( 0,  1)  =   1
2348 $   ( 1,  6) <- ( 1,  0)  =   2
2349 $   ( 1,  7) <- ( 1,  1)  =   3
2350 
2351 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFConcatenateRootMode`
2352 @*/
2353 PetscErrorCode PetscSFConcatenate(MPI_Comm comm, PetscInt nsfs, PetscSF sfs[], PetscSFConcatenateRootMode rootMode, PetscInt leafOffsets[], PetscSF *newsf)
2354 {
2355   PetscInt     i, s, nLeaves, nRoots;
2356   PetscInt    *leafArrayOffsets;
2357   PetscInt    *ilocal_new;
2358   PetscSFNode *iremote_new;
2359   PetscBool    all_ilocal_null = PETSC_FALSE;
2360   PetscLayout  glayout         = NULL;
2361   PetscInt    *gremote         = NULL;
2362   PetscMPIInt  rank, size;
2363 
2364   PetscFunctionBegin;
2365   if (PetscDefined(USE_DEBUG)) {
2366     PetscSF dummy; /* just to have a PetscObject on comm for input validation */
2367 
2368     PetscCall(PetscSFCreate(comm, &dummy));
2369     PetscValidLogicalCollectiveInt(dummy, nsfs, 2);
2370     PetscValidPointer(sfs, 3);
2371     for (i = 0; i < nsfs; i++) {
2372       PetscValidHeaderSpecific(sfs[i], PETSCSF_CLASSID, 3);
2373       PetscCheckSameComm(dummy, 1, sfs[i], 3);
2374     }
2375     PetscValidLogicalCollectiveEnum(dummy, rootMode, 4);
2376     if (leafOffsets) PetscValidIntPointer(leafOffsets, 5);
2377     PetscValidPointer(newsf, 6);
2378     PetscCall(PetscSFDestroy(&dummy));
2379   }
2380   if (!nsfs) {
2381     PetscCall(PetscSFCreate(comm, newsf));
2382     PetscCall(PetscSFSetGraph(*newsf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
2383     PetscFunctionReturn(0);
2384   }
2385   PetscCallMPI(MPI_Comm_rank(comm, &rank));
2386   PetscCallMPI(MPI_Comm_size(comm, &size));
2387 
2388   /* Calculate leaf array offsets */
2389   PetscCall(PetscMalloc1(nsfs + 1, &leafArrayOffsets));
2390   leafArrayOffsets[0] = 0;
2391   for (s = 0; s < nsfs; s++) {
2392     PetscInt nl;
2393 
2394     PetscCall(PetscSFGetGraph(sfs[s], NULL, &nl, NULL, NULL));
2395     leafArrayOffsets[s + 1] = leafArrayOffsets[s] + nl;
2396   }
2397   nLeaves = leafArrayOffsets[nsfs];
2398 
2399   /* Calculate number of roots */
2400   switch (rootMode) {
2401   case PETSCSF_CONCATENATE_ROOTMODE_SHARED: {
2402     PetscCall(PetscSFGetGraph(sfs[0], &nRoots, NULL, NULL, NULL));
2403     if (PetscDefined(USE_DEBUG)) {
2404       for (s = 1; s < nsfs; s++) {
2405         PetscInt nr;
2406 
2407         PetscCall(PetscSFGetGraph(sfs[s], &nr, NULL, NULL, NULL));
2408         PetscCheck(nr == nRoots, comm, PETSC_ERR_ARG_SIZ, "rootMode = %s but sfs[%" PetscInt_FMT "] has a different number of roots (%" PetscInt_FMT ") than sfs[0] (%" PetscInt_FMT ")", PetscSFConcatenateRootModes[rootMode], s, nr, nRoots);
2409       }
2410     }
2411   } break;
2412   case PETSCSF_CONCATENATE_ROOTMODE_GLOBAL: {
2413     /* Calculate also global layout in this case */
2414     PetscInt    *nls;
2415     PetscLayout *lts;
2416     PetscInt   **inds;
2417     PetscInt     j;
2418     PetscInt     rootOffset = 0;
2419 
2420     PetscCall(PetscCalloc3(nsfs, &lts, nsfs, &nls, nsfs, &inds));
2421     PetscCall(PetscLayoutCreate(comm, &glayout));
2422     glayout->bs = 1;
2423     glayout->n  = 0;
2424     glayout->N  = 0;
2425     for (s = 0; s < nsfs; s++) {
2426       PetscCall(PetscSFGetGraphLayout(sfs[s], &lts[s], &nls[s], NULL, &inds[s]));
2427       glayout->n += lts[s]->n;
2428       glayout->N += lts[s]->N;
2429     }
2430     PetscCall(PetscLayoutSetUp(glayout));
2431     PetscCall(PetscMalloc1(nLeaves, &gremote));
2432     for (s = 0, j = 0; s < nsfs; s++) {
2433       for (i = 0; i < nls[s]; i++, j++) gremote[j] = inds[s][i] + rootOffset;
2434       rootOffset += lts[s]->N;
2435       PetscCall(PetscLayoutDestroy(&lts[s]));
2436       PetscCall(PetscFree(inds[s]));
2437     }
2438     PetscCall(PetscFree3(lts, nls, inds));
2439     nRoots = glayout->N;
2440   } break;
2441   case PETSCSF_CONCATENATE_ROOTMODE_LOCAL:
2442     /* nRoots calculated later in this case */
2443     break;
2444   default:
2445     SETERRQ(comm, PETSC_ERR_ARG_WRONG, "Invalid PetscSFConcatenateRootMode %d", rootMode);
2446   }
2447 
2448   if (!leafOffsets) {
2449     all_ilocal_null = PETSC_TRUE;
2450     for (s = 0; s < nsfs; s++) {
2451       const PetscInt *ilocal;
2452 
2453       PetscCall(PetscSFGetGraph(sfs[s], NULL, NULL, &ilocal, NULL));
2454       if (ilocal) {
2455         all_ilocal_null = PETSC_FALSE;
2456         break;
2457       }
2458     }
2459     PetscCheck(all_ilocal_null, PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "leafOffsets can be passed as NULL only if all SFs have ilocal = NULL");
2460   }
2461 
2462   /* Renumber and concatenate local leaves */
2463   ilocal_new = NULL;
2464   if (!all_ilocal_null) {
2465     PetscCall(PetscMalloc1(nLeaves, &ilocal_new));
2466     for (i = 0; i < nLeaves; i++) ilocal_new[i] = -1;
2467     for (s = 0; s < nsfs; s++) {
2468       const PetscInt *ilocal;
2469       PetscInt       *ilocal_l = &ilocal_new[leafArrayOffsets[s]];
2470       PetscInt        i, nleaves_l;
2471 
2472       PetscCall(PetscSFGetGraph(sfs[s], NULL, &nleaves_l, &ilocal, NULL));
2473       for (i = 0; i < nleaves_l; i++) ilocal_l[i] = (ilocal ? ilocal[i] : i) + leafOffsets[s];
2474     }
2475   }
2476 
2477   /* Renumber and concatenate remote roots */
2478   if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL || rootMode == PETSCSF_CONCATENATE_ROOTMODE_SHARED) {
2479     PetscInt rootOffset = 0;
2480 
2481     PetscCall(PetscMalloc1(nLeaves, &iremote_new));
2482     for (i = 0; i < nLeaves; i++) {
2483       iremote_new[i].rank  = -1;
2484       iremote_new[i].index = -1;
2485     }
2486     for (s = 0; s < nsfs; s++) {
2487       PetscInt           i, nl, nr;
2488       PetscSF            tmp_sf;
2489       const PetscSFNode *iremote;
2490       PetscSFNode       *tmp_rootdata;
2491       PetscSFNode       *tmp_leafdata = &iremote_new[leafArrayOffsets[s]];
2492 
2493       PetscCall(PetscSFGetGraph(sfs[s], &nr, &nl, NULL, &iremote));
2494       PetscCall(PetscSFCreate(comm, &tmp_sf));
2495       /* create helper SF with contiguous leaves */
2496       PetscCall(PetscSFSetGraph(tmp_sf, nr, nl, NULL, PETSC_USE_POINTER, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
2497       PetscCall(PetscSFSetUp(tmp_sf));
2498       PetscCall(PetscMalloc1(nr, &tmp_rootdata));
2499       if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) {
2500         for (i = 0; i < nr; i++) {
2501           tmp_rootdata[i].index = i + rootOffset;
2502           tmp_rootdata[i].rank  = (PetscInt)rank;
2503         }
2504         rootOffset += nr;
2505       } else {
2506         for (i = 0; i < nr; i++) {
2507           tmp_rootdata[i].index = i;
2508           tmp_rootdata[i].rank  = (PetscInt)rank;
2509         }
2510       }
2511       PetscCall(PetscSFBcastBegin(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
2512       PetscCall(PetscSFBcastEnd(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
2513       PetscCall(PetscSFDestroy(&tmp_sf));
2514       PetscCall(PetscFree(tmp_rootdata));
2515     }
2516     if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) { nRoots = rootOffset; } // else nRoots already calculated above
2517 
2518     /* Build the new SF */
2519     PetscCall(PetscSFCreate(comm, newsf));
2520     PetscCall(PetscSFSetGraph(*newsf, nRoots, nLeaves, ilocal_new, PETSC_OWN_POINTER, iremote_new, PETSC_OWN_POINTER));
2521   } else {
2522     /* Build the new SF */
2523     PetscCall(PetscSFCreate(comm, newsf));
2524     PetscCall(PetscSFSetGraphLayout(*newsf, glayout, nLeaves, ilocal_new, PETSC_OWN_POINTER, gremote));
2525   }
2526   PetscCall(PetscSFSetUp(*newsf));
2527   PetscCall(PetscSFViewFromOptions(*newsf, NULL, "-sf_concat_view"));
2528   PetscCall(PetscLayoutDestroy(&glayout));
2529   PetscCall(PetscFree(gremote));
2530   PetscCall(PetscFree(leafArrayOffsets));
2531   PetscFunctionReturn(0);
2532 }
2533