xref: /petsc/src/vec/is/sf/interface/sf.c (revision 9a6bacb05cc5a54be4cd68af72f4fbfbc48b3cb3)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petsc/private/viewerimpl.h>
4 #include <petsc/private/hashmapi.h>
5 
6 #if defined(PETSC_HAVE_CUDA)
7   #include <cuda_runtime.h>
8   #include <petscdevice_cuda.h>
9 #endif
10 
11 #if defined(PETSC_HAVE_HIP)
12   #include <hip/hip_runtime.h>
13 #endif
14 
15 #if defined(PETSC_CLANG_STATIC_ANALYZER)
16 extern void PetscSFCheckGraphSet(PetscSF, int);
17 #else
18   #if defined(PETSC_USE_DEBUG)
19     #define PetscSFCheckGraphSet(sf, arg) PetscCheck((sf)->graphset, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %d \"%s\" before %s()", (arg), #sf, PETSC_FUNCTION_NAME)
20   #else
21     #define PetscSFCheckGraphSet(sf, arg) \
22       do { \
23       } while (0)
24   #endif
25 #endif
26 
27 const char *const PetscSFDuplicateOptions[]     = {"CONFONLY", "RANKS", "GRAPH", "PetscSFDuplicateOption", "PETSCSF_DUPLICATE_", NULL};
28 const char *const PetscSFConcatenateRootModes[] = {"local", "shared", "global", "PetscSFConcatenateRootMode", "PETSCSF_CONCATENATE_ROOTMODE_", NULL};
29 
30 /*@
31   PetscSFCreate - create a star forest communication context
32 
33   Collective
34 
35   Input Parameter:
36 . comm - communicator on which the star forest will operate
37 
38   Output Parameter:
39 . sf - new star forest context
40 
41   Options Database Key:
42 + -sf_type basic                 - Use MPI persistent Isend/Irecv for communication (Default)
43 . -sf_type window                - Use MPI-3 one-sided window for communication
44 . -sf_type neighbor              - Use MPI-3 neighborhood collectives for communication
45 - -sf_neighbor_persistent <bool> - If true, use MPI-4 persistent neighborhood collectives for communication (used along with -sf_type neighbor)
46 
47   Level: intermediate
48 
49   Note:
50   When one knows the communication graph is one of the predefined graph, such as `MPI_Alltoall()`, `MPI_Allgatherv()`,
51   `MPI_Gatherv()`, one can create a `PetscSF` and then set its graph with `PetscSFSetGraphWithPattern()`. These special
52   `SF`s are optimized and they have better performance than the general `SF`s.
53 
54 .seealso: `PetscSF`, `PetscSFSetType`, `PetscSFSetGraph()`, `PetscSFSetGraphWithPattern()`, `PetscSFDestroy()`
55 @*/
56 PetscErrorCode PetscSFCreate(MPI_Comm comm, PetscSF *sf)
57 {
58   PetscSF b;
59 
60   PetscFunctionBegin;
61   PetscAssertPointer(sf, 2);
62   PetscCall(PetscSFInitializePackage());
63 
64   PetscCall(PetscHeaderCreate(b, PETSCSF_CLASSID, "PetscSF", "Star Forest", "PetscSF", comm, PetscSFDestroy, PetscSFView));
65   b->nroots    = -1;
66   b->nleaves   = -1;
67   b->minleaf   = PETSC_MAX_INT;
68   b->maxleaf   = PETSC_MIN_INT;
69   b->nranks    = -1;
70   b->rankorder = PETSC_TRUE;
71   b->ingroup   = MPI_GROUP_NULL;
72   b->outgroup  = MPI_GROUP_NULL;
73   b->graphset  = PETSC_FALSE;
74 #if defined(PETSC_HAVE_DEVICE)
75   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
76   b->use_stream_aware_mpi = PETSC_FALSE;
77   b->unknown_input_stream = PETSC_FALSE;
78   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
79   b->backend = PETSCSF_BACKEND_KOKKOS;
80   #elif defined(PETSC_HAVE_CUDA)
81   b->backend = PETSCSF_BACKEND_CUDA;
82   #elif defined(PETSC_HAVE_HIP)
83   b->backend = PETSCSF_BACKEND_HIP;
84   #endif
85 
86   #if defined(PETSC_HAVE_NVSHMEM)
87   b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
88   b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
89   PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem", &b->use_nvshmem, NULL));
90   PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem_get", &b->use_nvshmem_get, NULL));
91   #endif
92 #endif
93   b->vscat.from_n = -1;
94   b->vscat.to_n   = -1;
95   b->vscat.unit   = MPIU_SCALAR;
96   *sf             = b;
97   PetscFunctionReturn(PETSC_SUCCESS);
98 }
99 
100 /*@
101   PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
102 
103   Collective
104 
105   Input Parameter:
106 . sf - star forest
107 
108   Level: advanced
109 
110 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetGraph()`, `PetscSFDestroy()`
111 @*/
112 PetscErrorCode PetscSFReset(PetscSF sf)
113 {
114   PetscFunctionBegin;
115   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
116   PetscTryTypeMethod(sf, Reset);
117   PetscCall(PetscSFDestroy(&sf->rankssf));
118 
119   sf->nroots   = -1;
120   sf->nleaves  = -1;
121   sf->minleaf  = PETSC_MAX_INT;
122   sf->maxleaf  = PETSC_MIN_INT;
123   sf->mine     = NULL;
124   sf->remote   = NULL;
125   sf->graphset = PETSC_FALSE;
126   PetscCall(PetscFree(sf->mine_alloc));
127   PetscCall(PetscFree(sf->remote_alloc));
128   sf->nranks = -1;
129   PetscCall(PetscFree4(sf->ranks, sf->roffset, sf->rmine, sf->rremote));
130   sf->degreeknown = PETSC_FALSE;
131   PetscCall(PetscFree(sf->degree));
132   if (sf->ingroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->ingroup));
133   if (sf->outgroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->outgroup));
134 
135   if (sf->multi) sf->multi->multi = NULL;
136   PetscCall(PetscSFDestroy(&sf->multi));
137 
138   PetscCall(PetscLayoutDestroy(&sf->map));
139 
140 #if defined(PETSC_HAVE_DEVICE)
141   for (PetscInt i = 0; i < 2; i++) PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, sf->rmine_d[i]));
142 #endif
143 
144   sf->setupcalled = PETSC_FALSE;
145   PetscFunctionReturn(PETSC_SUCCESS);
146 }
147 
148 /*@
149   PetscSFSetType - Set the `PetscSF` communication implementation
150 
151   Collective
152 
153   Input Parameters:
154 + sf   - the `PetscSF` context
155 - type - a known method
156 .vb
157     PETSCSFWINDOW - MPI-2/3 one-sided
158     PETSCSFBASIC - basic implementation using MPI-1 two-sided
159 .ve
160 
161   Options Database Key:
162 . -sf_type <type> - Sets the method; for example `basic` or `window` use -help for a list of available methods
163 
164   Level: intermediate
165 
166   Notes:
167   See `PetscSFType` for possible values
168 
169 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`
170 @*/
171 PetscErrorCode PetscSFSetType(PetscSF sf, PetscSFType type)
172 {
173   PetscBool match;
174   PetscErrorCode (*r)(PetscSF);
175 
176   PetscFunctionBegin;
177   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
178   PetscAssertPointer(type, 2);
179 
180   PetscCall(PetscObjectTypeCompare((PetscObject)sf, type, &match));
181   if (match) PetscFunctionReturn(PETSC_SUCCESS);
182 
183   PetscCall(PetscFunctionListFind(PetscSFList, type, &r));
184   PetscCheck(r, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_UNKNOWN_TYPE, "Unable to find requested PetscSF type %s", type);
185   /* Destroy the previous PetscSF implementation context */
186   PetscTryTypeMethod(sf, Destroy);
187   PetscCall(PetscMemzero(sf->ops, sizeof(*sf->ops)));
188   PetscCall(PetscObjectChangeTypeName((PetscObject)sf, type));
189   PetscCall((*r)(sf));
190   PetscFunctionReturn(PETSC_SUCCESS);
191 }
192 
193 /*@
194   PetscSFGetType - Get the `PetscSF` communication implementation
195 
196   Not Collective
197 
198   Input Parameter:
199 . sf - the `PetscSF` context
200 
201   Output Parameter:
202 . type - the `PetscSF` type name
203 
204   Level: intermediate
205 
206 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetType()`, `PetscSFCreate()`
207 @*/
208 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
209 {
210   PetscFunctionBegin;
211   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
212   PetscAssertPointer(type, 2);
213   *type = ((PetscObject)sf)->type_name;
214   PetscFunctionReturn(PETSC_SUCCESS);
215 }
216 
217 /*@
218   PetscSFDestroy - destroy a star forest
219 
220   Collective
221 
222   Input Parameter:
223 . sf - address of star forest
224 
225   Level: intermediate
226 
227 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFReset()`
228 @*/
229 PetscErrorCode PetscSFDestroy(PetscSF *sf)
230 {
231   PetscFunctionBegin;
232   if (!*sf) PetscFunctionReturn(PETSC_SUCCESS);
233   PetscValidHeaderSpecific(*sf, PETSCSF_CLASSID, 1);
234   if (--((PetscObject)*sf)->refct > 0) {
235     *sf = NULL;
236     PetscFunctionReturn(PETSC_SUCCESS);
237   }
238   PetscCall(PetscSFReset(*sf));
239   PetscTryTypeMethod(*sf, Destroy);
240   PetscCall(PetscSFDestroy(&(*sf)->vscat.lsf));
241   if ((*sf)->vscat.bs > 1) PetscCallMPI(MPI_Type_free(&(*sf)->vscat.unit));
242 #if defined(PETSC_HAVE_CUDA) && defined(PETSC_HAVE_MPIX_STREAM)
243   if ((*sf)->use_stream_aware_mpi) {
244     PetscCallMPI(MPIX_Stream_free(&(*sf)->mpi_stream));
245     PetscCallMPI(MPI_Comm_free(&(*sf)->stream_comm));
246   }
247 #endif
248   PetscCall(PetscHeaderDestroy(sf));
249   PetscFunctionReturn(PETSC_SUCCESS);
250 }
251 
252 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
253 {
254   PetscInt           i, nleaves;
255   PetscMPIInt        size;
256   const PetscInt    *ilocal;
257   const PetscSFNode *iremote;
258 
259   PetscFunctionBegin;
260   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(PETSC_SUCCESS);
261   PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, &iremote));
262   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
263   for (i = 0; i < nleaves; i++) {
264     const PetscInt rank   = iremote[i].rank;
265     const PetscInt remote = iremote[i].index;
266     const PetscInt leaf   = ilocal ? ilocal[i] : i;
267     PetscCheck(rank >= 0 && rank < size, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided rank (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be in [0, %d)", rank, i, size);
268     PetscCheck(remote >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided index (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be >= 0", remote, i);
269     PetscCheck(leaf >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided location (%" PetscInt_FMT ") for leaf %" PetscInt_FMT " is invalid, should be >= 0", leaf, i);
270   }
271   PetscFunctionReturn(PETSC_SUCCESS);
272 }
273 
274 /*@
275   PetscSFSetUp - set up communication structures for a `PetscSF`, after this is done it may be used to perform communication
276 
277   Collective
278 
279   Input Parameter:
280 . sf - star forest communication object
281 
282   Level: beginner
283 
284 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetFromOptions()`, `PetscSFSetType()`
285 @*/
286 PetscErrorCode PetscSFSetUp(PetscSF sf)
287 {
288   PetscFunctionBegin;
289   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
290   PetscSFCheckGraphSet(sf, 1);
291   if (sf->setupcalled) PetscFunctionReturn(PETSC_SUCCESS);
292   PetscCall(PetscLogEventBegin(PETSCSF_SetUp, sf, 0, 0, 0));
293   PetscCall(PetscSFCheckGraphValid_Private(sf));
294   if (!((PetscObject)sf)->type_name) PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); /* Zero all sf->ops */
295   PetscTryTypeMethod(sf, SetUp);
296 #if defined(PETSC_HAVE_CUDA)
297   if (sf->backend == PETSCSF_BACKEND_CUDA) {
298     sf->ops->Malloc = PetscSFMalloc_CUDA;
299     sf->ops->Free   = PetscSFFree_CUDA;
300   }
301 #endif
302 #if defined(PETSC_HAVE_HIP)
303   if (sf->backend == PETSCSF_BACKEND_HIP) {
304     sf->ops->Malloc = PetscSFMalloc_HIP;
305     sf->ops->Free   = PetscSFFree_HIP;
306   }
307 #endif
308 
309 #if defined(PETSC_HAVE_KOKKOS)
310   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
311     sf->ops->Malloc = PetscSFMalloc_Kokkos;
312     sf->ops->Free   = PetscSFFree_Kokkos;
313   }
314 #endif
315   PetscCall(PetscLogEventEnd(PETSCSF_SetUp, sf, 0, 0, 0));
316   sf->setupcalled = PETSC_TRUE;
317   PetscFunctionReturn(PETSC_SUCCESS);
318 }
319 
320 /*@
321   PetscSFSetFromOptions - set `PetscSF` options using the options database
322 
323   Logically Collective
324 
325   Input Parameter:
326 . sf - star forest
327 
328   Options Database Keys:
329 + -sf_type                                                                                                         - implementation type, see `PetscSFSetType()`
330 . -sf_rank_order                                                                                                   - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
331 . -sf_use_default_stream                                                                                           - Assume callers of `PetscSF` computed the input root/leafdata with the default CUDA stream. `PetscSF` will also
332                             use the default stream to process data. Therefore, no stream synchronization is needed between `PetscSF` and its caller (default: true).
333                             If true, this option only works with `-use_gpu_aware_mpi 1`.
334 . -sf_use_stream_aware_mpi                                                                                         - Assume the underlying MPI is CUDA-stream aware and `PetscSF` won't sync streams for send/recv buffers passed to MPI (default: false).
335                                If true, this option only works with `-use_gpu_aware_mpi 1`.
336 
337 - -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently `PetscSF` has these backends: cuda - hip and Kokkos.
338                               On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
339                               the only available is kokkos.
340 
341   Level: intermediate
342 
343 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetType()`
344 @*/
345 PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
346 {
347   PetscSFType deft;
348   char        type[256];
349   PetscBool   flg;
350 
351   PetscFunctionBegin;
352   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
353   PetscObjectOptionsBegin((PetscObject)sf);
354   deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
355   PetscCall(PetscOptionsFList("-sf_type", "PetscSF implementation type", "PetscSFSetType", PetscSFList, deft, type, sizeof(type), &flg));
356   PetscCall(PetscSFSetType(sf, flg ? type : deft));
357   PetscCall(PetscOptionsBool("-sf_rank_order", "sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise", "PetscSFSetRankOrder", sf->rankorder, &sf->rankorder, NULL));
358 #if defined(PETSC_HAVE_DEVICE)
359   {
360     char      backendstr[32] = {0};
361     PetscBool isCuda = PETSC_FALSE, isHip = PETSC_FALSE, isKokkos = PETSC_FALSE, set;
362     /* Change the defaults set in PetscSFCreate() with command line options */
363     PetscCall(PetscOptionsBool("-sf_unknown_input_stream", "SF root/leafdata is computed on arbitrary streams unknown to SF", "PetscSFSetFromOptions", sf->unknown_input_stream, &sf->unknown_input_stream, NULL));
364     PetscCall(PetscOptionsBool("-sf_use_stream_aware_mpi", "Assume the underlying MPI is cuda-stream aware", "PetscSFSetFromOptions", sf->use_stream_aware_mpi, &sf->use_stream_aware_mpi, NULL));
365     PetscCall(PetscOptionsString("-sf_backend", "Select the device backend SF uses", "PetscSFSetFromOptions", NULL, backendstr, sizeof(backendstr), &set));
366     PetscCall(PetscStrcasecmp("cuda", backendstr, &isCuda));
367     PetscCall(PetscStrcasecmp("kokkos", backendstr, &isKokkos));
368     PetscCall(PetscStrcasecmp("hip", backendstr, &isHip));
369   #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
370     if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
371     else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
372     else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
373     else PetscCheck(!set, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
374   #elif defined(PETSC_HAVE_KOKKOS)
375     PetscCheck(!set || isKokkos, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You can only choose kokkos", backendstr);
376   #endif
377 
378   #if defined(PETSC_HAVE_CUDA) && defined(PETSC_HAVE_MPIX_STREAM)
379     if (sf->use_stream_aware_mpi) {
380       MPI_Info info;
381 
382       PetscCallMPI(MPI_Info_create(&info));
383       PetscCallMPI(MPI_Info_set(info, "type", "cudaStream_t"));
384       PetscCallMPI(MPIX_Info_set_hex(info, "value", &PetscDefaultCudaStream, sizeof(PetscDefaultCudaStream)));
385       PetscCallMPI(MPIX_Stream_create(info, &sf->mpi_stream));
386       PetscCallMPI(MPI_Info_free(&info));
387       PetscCallMPI(MPIX_Stream_comm_create(PetscObjectComm((PetscObject)sf), sf->mpi_stream, &sf->stream_comm));
388     }
389   #endif
390   }
391 #endif
392   PetscTryTypeMethod(sf, SetFromOptions, PetscOptionsObject);
393   PetscOptionsEnd();
394   PetscFunctionReturn(PETSC_SUCCESS);
395 }
396 
397 /*@
398   PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
399 
400   Logically Collective
401 
402   Input Parameters:
403 + sf  - star forest
404 - flg - `PETSC_TRUE` to sort, `PETSC_FALSE` to skip sorting (lower setup cost, but non-deterministic)
405 
406   Level: advanced
407 
408 .seealso: `PetscSF`, `PetscSFType`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`
409 @*/
410 PetscErrorCode PetscSFSetRankOrder(PetscSF sf, PetscBool flg)
411 {
412   PetscFunctionBegin;
413   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
414   PetscValidLogicalCollectiveBool(sf, flg, 2);
415   PetscCheck(!sf->multi, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
416   sf->rankorder = flg;
417   PetscFunctionReturn(PETSC_SUCCESS);
418 }
419 
420 /*@
421   PetscSFSetGraph - Set a parallel star forest
422 
423   Collective
424 
425   Input Parameters:
426 + sf         - star forest
427 . nroots     - number of root vertices on the current process (these are possible targets for other process to attach leaves)
428 . nleaves    - number of leaf vertices on the current process, each of these references a root on any process
429 . ilocal     - locations of leaves in leafdata buffers, pass `NULL` for contiguous storage (locations must be >= 0, enforced
430 during setup in debug mode)
431 . localmode  - copy mode for `ilocal`
432 . iremote    - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
433 during setup in debug mode)
434 - remotemode - copy mode for `iremote`
435 
436   Level: intermediate
437 
438   Notes:
439   Leaf indices in `ilocal` must be unique, otherwise an error occurs.
440 
441   Input arrays `ilocal` and `iremote` follow the `PetscCopyMode` semantics.
442   In particular, if `localmode` or `remotemode` is `PETSC_OWN_POINTER` or `PETSC_USE_POINTER`,
443   PETSc might modify the respective array;
444   if `PETSC_USE_POINTER`, the user must delete the array after `PetscSFDestroy()`.
445   Only if `PETSC_COPY_VALUES` is used, the respective array is guaranteed to stay intact and a const array can be passed (but a cast to non-const is needed).
446 
447   Fortran Notes:
448   In Fortran you must use `PETSC_COPY_VALUES` for `localmode` and `remotemode`.
449 
450   Developer Notes:
451   We sort leaves to check for duplicates and contiguousness and to find minleaf/maxleaf.
452   This also allows to compare leaf sets of two `PetscSF`s easily.
453 
454 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
455 @*/
456 PetscErrorCode PetscSFSetGraph(PetscSF sf, PetscInt nroots, PetscInt nleaves, PetscInt *ilocal, PetscCopyMode localmode, PetscSFNode *iremote, PetscCopyMode remotemode)
457 {
458   PetscBool unique, contiguous;
459 
460   PetscFunctionBegin;
461   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
462   if (nleaves > 0 && ilocal) PetscAssertPointer(ilocal, 4);
463   if (nleaves > 0) PetscAssertPointer(iremote, 6);
464   PetscCheck(nroots >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nroots %" PetscInt_FMT ", cannot be negative", nroots);
465   PetscCheck(nleaves >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nleaves %" PetscInt_FMT ", cannot be negative", nleaves);
466   /* enums may be handled as unsigned by some compilers, NVHPC for example, the int cast
467    * below is to prevent NVHPC from warning about meaningless comparison of unsigned with zero */
468   PetscCheck((int)localmode >= PETSC_COPY_VALUES && localmode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong localmode %d", localmode);
469   PetscCheck((int)remotemode >= PETSC_COPY_VALUES && remotemode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong remotemode %d", remotemode);
470 
471   if (sf->nroots >= 0) { /* Reset only if graph already set */
472     PetscCall(PetscSFReset(sf));
473   }
474 
475   PetscCall(PetscLogEventBegin(PETSCSF_SetGraph, sf, 0, 0, 0));
476 
477   sf->nroots  = nroots;
478   sf->nleaves = nleaves;
479 
480   if (localmode == PETSC_COPY_VALUES && ilocal) {
481     PetscInt *tlocal = NULL;
482 
483     PetscCall(PetscMalloc1(nleaves, &tlocal));
484     PetscCall(PetscArraycpy(tlocal, ilocal, nleaves));
485     ilocal = tlocal;
486   }
487   if (remotemode == PETSC_COPY_VALUES) {
488     PetscSFNode *tremote = NULL;
489 
490     PetscCall(PetscMalloc1(nleaves, &tremote));
491     PetscCall(PetscArraycpy(tremote, iremote, nleaves));
492     iremote = tremote;
493   }
494 
495   if (nleaves && ilocal) {
496     PetscSFNode work;
497 
498     PetscCall(PetscSortIntWithDataArray(nleaves, ilocal, iremote, sizeof(PetscSFNode), &work));
499     PetscCall(PetscSortedCheckDupsInt(nleaves, ilocal, &unique));
500     unique = PetscNot(unique);
501     PetscCheck(sf->allow_multi_leaves || unique, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Input ilocal has duplicate entries which is not allowed for this PetscSF");
502     sf->minleaf = ilocal[0];
503     sf->maxleaf = ilocal[nleaves - 1];
504     contiguous  = (PetscBool)(unique && ilocal[0] == 0 && ilocal[nleaves - 1] == nleaves - 1);
505   } else {
506     sf->minleaf = 0;
507     sf->maxleaf = nleaves - 1;
508     unique      = PETSC_TRUE;
509     contiguous  = PETSC_TRUE;
510   }
511 
512   if (contiguous) {
513     if (localmode == PETSC_USE_POINTER) {
514       ilocal = NULL;
515     } else {
516       PetscCall(PetscFree(ilocal));
517     }
518   }
519   sf->mine = ilocal;
520   if (localmode == PETSC_USE_POINTER) {
521     sf->mine_alloc = NULL;
522   } else {
523     sf->mine_alloc = ilocal;
524   }
525   sf->remote = iremote;
526   if (remotemode == PETSC_USE_POINTER) {
527     sf->remote_alloc = NULL;
528   } else {
529     sf->remote_alloc = iremote;
530   }
531   PetscCall(PetscLogEventEnd(PETSCSF_SetGraph, sf, 0, 0, 0));
532   sf->graphset = PETSC_TRUE;
533   PetscFunctionReturn(PETSC_SUCCESS);
534 }
535 
536 /*@
537   PetscSFSetGraphWithPattern - Sets the graph of a `PetscSF` with a specific pattern
538 
539   Collective
540 
541   Input Parameters:
542 + sf      - The `PetscSF`
543 . map     - Layout of roots over all processes (insignificant when pattern is `PETSCSF_PATTERN_ALLTOALL`)
544 - pattern - One of `PETSCSF_PATTERN_ALLGATHER`, `PETSCSF_PATTERN_GATHER`, `PETSCSF_PATTERN_ALLTOALL`
545 
546   Level: intermediate
547 
548   Notes:
549   It is easier to explain `PetscSFPattern` using vectors. Suppose we have an MPI vector `x` and its `PetscLayout` is `map`.
550   `n` and `N` are the local and global sizes of `x` respectively.
551 
552   With `PETSCSF_PATTERN_ALLGATHER`, the routine creates a graph that if one does `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on it, it will copy `x` to
553   sequential vectors `y` on all MPI processes.
554 
555   With `PETSCSF_PATTERN_GATHER`, the routine creates a graph that if one does `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on it, it will copy `x` to a
556   sequential vector `y` on rank 0.
557 
558   In above cases, entries of `x` are roots and entries of `y` are leaves.
559 
560   With `PETSCSF_PATTERN_ALLTOALL`, map is insignificant. Suppose NP is size of `sf`'s communicator. The routine
561   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
562   of rank j. Here 0 <=i,j<NP. It is a kind of `MPI_Alltoall()` with sendcount/recvcount being 1. Note that it does
563   not mean one can not send multiple items. One just needs to create a new MPI datatype for the mulptiple data
564   items with `MPI_Type_contiguous` and use that as the <unit> argument in SF routines.
565 
566   In this case, roots and leaves are symmetric.
567 
568 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
569  @*/
570 PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf, PetscLayout map, PetscSFPattern pattern)
571 {
572   MPI_Comm    comm;
573   PetscInt    n, N, res[2];
574   PetscMPIInt rank, size;
575   PetscSFType type;
576 
577   PetscFunctionBegin;
578   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
579   if (pattern != PETSCSF_PATTERN_ALLTOALL) PetscAssertPointer(map, 2);
580   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
581   PetscCheck(pattern >= PETSCSF_PATTERN_ALLGATHER && pattern <= PETSCSF_PATTERN_ALLTOALL, comm, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported PetscSFPattern %d", pattern);
582   PetscCallMPI(MPI_Comm_rank(comm, &rank));
583   PetscCallMPI(MPI_Comm_size(comm, &size));
584 
585   if (pattern == PETSCSF_PATTERN_ALLTOALL) {
586     type = PETSCSFALLTOALL;
587     PetscCall(PetscLayoutCreate(comm, &sf->map));
588     PetscCall(PetscLayoutSetLocalSize(sf->map, size));
589     PetscCall(PetscLayoutSetSize(sf->map, ((PetscInt)size) * size));
590     PetscCall(PetscLayoutSetUp(sf->map));
591   } else {
592     PetscCall(PetscLayoutGetLocalSize(map, &n));
593     PetscCall(PetscLayoutGetSize(map, &N));
594     res[0] = n;
595     res[1] = -n;
596     /* Check if n are same over all ranks so that we can optimize it */
597     PetscCall(MPIU_Allreduce(MPI_IN_PLACE, res, 2, MPIU_INT, MPI_MAX, comm));
598     if (res[0] == -res[1]) { /* same n */
599       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER : PETSCSFGATHER;
600     } else {
601       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
602     }
603     PetscCall(PetscLayoutReference(map, &sf->map));
604   }
605   PetscCall(PetscSFSetType(sf, type));
606 
607   sf->pattern = pattern;
608   sf->mine    = NULL; /* Contiguous */
609 
610   /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
611      Also set other easy stuff.
612    */
613   if (pattern == PETSCSF_PATTERN_ALLGATHER) {
614     sf->nleaves = N;
615     sf->nroots  = n;
616     sf->nranks  = size;
617     sf->minleaf = 0;
618     sf->maxleaf = N - 1;
619   } else if (pattern == PETSCSF_PATTERN_GATHER) {
620     sf->nleaves = rank ? 0 : N;
621     sf->nroots  = n;
622     sf->nranks  = rank ? 0 : size;
623     sf->minleaf = 0;
624     sf->maxleaf = rank ? -1 : N - 1;
625   } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
626     sf->nleaves = size;
627     sf->nroots  = size;
628     sf->nranks  = size;
629     sf->minleaf = 0;
630     sf->maxleaf = size - 1;
631   }
632   sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
633   sf->graphset = PETSC_TRUE;
634   PetscFunctionReturn(PETSC_SUCCESS);
635 }
636 
637 /*@
638   PetscSFCreateInverseSF - given a `PetscSF` in which all vertices have degree 1, creates the inverse map
639 
640   Collective
641 
642   Input Parameter:
643 . sf - star forest to invert
644 
645   Output Parameter:
646 . isf - inverse of `sf`
647 
648   Level: advanced
649 
650   Notes:
651   All roots must have degree 1.
652 
653   The local space may be a permutation, but cannot be sparse.
654 
655 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetGraph()`
656 @*/
657 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf, PetscSF *isf)
658 {
659   PetscMPIInt     rank;
660   PetscInt        i, nroots, nleaves, maxlocal, count, *newilocal;
661   const PetscInt *ilocal;
662   PetscSFNode    *roots, *leaves;
663 
664   PetscFunctionBegin;
665   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
666   PetscSFCheckGraphSet(sf, 1);
667   PetscAssertPointer(isf, 2);
668 
669   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, NULL));
670   maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
671 
672   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
673   PetscCall(PetscMalloc2(nroots, &roots, maxlocal, &leaves));
674   for (i = 0; i < maxlocal; i++) {
675     leaves[i].rank  = rank;
676     leaves[i].index = i;
677   }
678   for (i = 0; i < nroots; i++) {
679     roots[i].rank  = -1;
680     roots[i].index = -1;
681   }
682   PetscCall(PetscSFReduceBegin(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
683   PetscCall(PetscSFReduceEnd(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
684 
685   /* Check whether our leaves are sparse */
686   for (i = 0, count = 0; i < nroots; i++)
687     if (roots[i].rank >= 0) count++;
688   if (count == nroots) newilocal = NULL;
689   else { /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */ PetscCall(PetscMalloc1(count, &newilocal));
690     for (i = 0, count = 0; i < nroots; i++) {
691       if (roots[i].rank >= 0) {
692         newilocal[count]   = i;
693         roots[count].rank  = roots[i].rank;
694         roots[count].index = roots[i].index;
695         count++;
696       }
697     }
698   }
699 
700   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, isf));
701   PetscCall(PetscSFSetGraph(*isf, maxlocal, count, newilocal, PETSC_OWN_POINTER, roots, PETSC_COPY_VALUES));
702   PetscCall(PetscFree2(roots, leaves));
703   PetscFunctionReturn(PETSC_SUCCESS);
704 }
705 
706 /*@
707   PetscSFDuplicate - duplicate a `PetscSF`, optionally preserving rank connectivity and graph
708 
709   Collective
710 
711   Input Parameters:
712 + sf  - communication object to duplicate
713 - opt - `PETSCSF_DUPLICATE_CONFONLY`, `PETSCSF_DUPLICATE_RANKS`, or `PETSCSF_DUPLICATE_GRAPH` (see `PetscSFDuplicateOption`)
714 
715   Output Parameter:
716 . newsf - new communication object
717 
718   Level: beginner
719 
720 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFSetType()`, `PetscSFSetGraph()`
721 @*/
722 PetscErrorCode PetscSFDuplicate(PetscSF sf, PetscSFDuplicateOption opt, PetscSF *newsf)
723 {
724   PetscSFType  type;
725   MPI_Datatype dtype = MPIU_SCALAR;
726 
727   PetscFunctionBegin;
728   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
729   PetscValidLogicalCollectiveEnum(sf, opt, 2);
730   PetscAssertPointer(newsf, 3);
731   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sf), newsf));
732   PetscCall(PetscSFGetType(sf, &type));
733   if (type) PetscCall(PetscSFSetType(*newsf, type));
734   (*newsf)->allow_multi_leaves = sf->allow_multi_leaves; /* Dup this flag earlier since PetscSFSetGraph() below checks on this flag */
735   if (opt == PETSCSF_DUPLICATE_GRAPH) {
736     PetscSFCheckGraphSet(sf, 1);
737     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
738       PetscInt           nroots, nleaves;
739       const PetscInt    *ilocal;
740       const PetscSFNode *iremote;
741       PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
742       PetscCall(PetscSFSetGraph(*newsf, nroots, nleaves, (PetscInt *)ilocal, PETSC_COPY_VALUES, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
743     } else {
744       PetscCall(PetscSFSetGraphWithPattern(*newsf, sf->map, sf->pattern));
745     }
746   }
747   /* Since oldtype is committed, so is newtype, according to MPI */
748   if (sf->vscat.bs > 1) PetscCallMPI(MPI_Type_dup(sf->vscat.unit, &dtype));
749   (*newsf)->vscat.bs     = sf->vscat.bs;
750   (*newsf)->vscat.unit   = dtype;
751   (*newsf)->vscat.to_n   = sf->vscat.to_n;
752   (*newsf)->vscat.from_n = sf->vscat.from_n;
753   /* Do not copy lsf. Build it on demand since it is rarely used */
754 
755 #if defined(PETSC_HAVE_DEVICE)
756   (*newsf)->backend              = sf->backend;
757   (*newsf)->unknown_input_stream = sf->unknown_input_stream;
758   (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
759   (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
760 #endif
761   PetscTryTypeMethod(sf, Duplicate, opt, *newsf);
762   /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
763   PetscFunctionReturn(PETSC_SUCCESS);
764 }
765 
766 /*@C
767   PetscSFGetGraph - Get the graph specifying a parallel star forest
768 
769   Not Collective
770 
771   Input Parameter:
772 . sf - star forest
773 
774   Output Parameters:
775 + nroots  - number of root vertices on the current process (these are possible targets for other process to attach leaves)
776 . nleaves - number of leaf vertices on the current process, each of these references a root on any process
777 . ilocal  - locations of leaves in leafdata buffers (if returned value is `NULL`, it means leaves are in contiguous storage)
778 - iremote - remote locations of root vertices for each leaf on the current process
779 
780   Level: intermediate
781 
782   Notes:
783   We are not currently requiring that the graph is set, thus returning `nroots` = -1 if it has not been set yet
784 
785   The returned `ilocal` and `iremote` might contain values in different order than the input ones in `PetscSFSetGraph()`
786 
787   Fortran Notes:
788   The returned `iremote` array is a copy and must be deallocated after use. Consequently, if you
789   want to update the graph, you must call `PetscSFSetGraph()` after modifying the `iremote` array.
790 
791   To check for a `NULL` `ilocal` use
792 $      if (loc(ilocal) == loc(PETSC_NULL_INTEGER)) then
793 
794 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`
795 @*/
796 PetscErrorCode PetscSFGetGraph(PetscSF sf, PetscInt *nroots, PetscInt *nleaves, const PetscInt **ilocal, const PetscSFNode **iremote)
797 {
798   PetscFunctionBegin;
799   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
800   if (sf->ops->GetGraph) {
801     PetscCall(sf->ops->GetGraph(sf, nroots, nleaves, ilocal, iremote));
802   } else {
803     if (nroots) *nroots = sf->nroots;
804     if (nleaves) *nleaves = sf->nleaves;
805     if (ilocal) *ilocal = sf->mine;
806     if (iremote) *iremote = sf->remote;
807   }
808   PetscFunctionReturn(PETSC_SUCCESS);
809 }
810 
811 /*@
812   PetscSFGetLeafRange - Get the active leaf ranges
813 
814   Not Collective
815 
816   Input Parameter:
817 . sf - star forest
818 
819   Output Parameters:
820 + minleaf - minimum active leaf on this process. Returns 0 if there are no leaves.
821 - maxleaf - maximum active leaf on this process. Returns -1 if there are no leaves.
822 
823   Level: developer
824 
825 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
826 @*/
827 PetscErrorCode PetscSFGetLeafRange(PetscSF sf, PetscInt *minleaf, PetscInt *maxleaf)
828 {
829   PetscFunctionBegin;
830   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
831   PetscSFCheckGraphSet(sf, 1);
832   if (minleaf) *minleaf = sf->minleaf;
833   if (maxleaf) *maxleaf = sf->maxleaf;
834   PetscFunctionReturn(PETSC_SUCCESS);
835 }
836 
837 /*@
838   PetscSFViewFromOptions - View a `PetscSF` based on arguments in the options database
839 
840   Collective
841 
842   Input Parameters:
843 + A    - the star forest
844 . obj  - Optional object that provides the prefix for the option names
845 - name - command line option
846 
847   Level: intermediate
848 
849   Note:
850   See `PetscObjectViewFromOptions()` for possible `PetscViewer` and `PetscViewerFormat`
851 
852 .seealso: `PetscSF`, `PetscSFView`, `PetscObjectViewFromOptions()`, `PetscSFCreate()`
853 @*/
854 PetscErrorCode PetscSFViewFromOptions(PetscSF A, PetscObject obj, const char name[])
855 {
856   PetscFunctionBegin;
857   PetscValidHeaderSpecific(A, PETSCSF_CLASSID, 1);
858   PetscCall(PetscObjectViewFromOptions((PetscObject)A, obj, name));
859   PetscFunctionReturn(PETSC_SUCCESS);
860 }
861 
862 /*@
863   PetscSFView - view a star forest
864 
865   Collective
866 
867   Input Parameters:
868 + sf     - star forest
869 - viewer - viewer to display graph, for example `PETSC_VIEWER_STDOUT_WORLD`
870 
871   Level: beginner
872 
873 .seealso: `PetscSF`, `PetscViewer`, `PetscSFCreate()`, `PetscSFSetGraph()`
874 @*/
875 PetscErrorCode PetscSFView(PetscSF sf, PetscViewer viewer)
876 {
877   PetscBool         iascii;
878   PetscViewerFormat format;
879 
880   PetscFunctionBegin;
881   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
882   if (!viewer) PetscCall(PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf), &viewer));
883   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
884   PetscCheckSameComm(sf, 1, viewer, 2);
885   if (sf->graphset) PetscCall(PetscSFSetUp(sf));
886   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
887   if (iascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) {
888     PetscMPIInt rank;
889     PetscInt    ii, i, j;
890 
891     PetscCall(PetscObjectPrintClassNamePrefixType((PetscObject)sf, viewer));
892     PetscCall(PetscViewerASCIIPushTab(viewer));
893     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
894       if (!sf->graphset) {
895         PetscCall(PetscViewerASCIIPrintf(viewer, "PetscSFSetGraph() has not been called yet\n"));
896         PetscCall(PetscViewerASCIIPopTab(viewer));
897         PetscFunctionReturn(PETSC_SUCCESS);
898       }
899       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
900       PetscCall(PetscViewerASCIIPushSynchronized(viewer));
901       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Number of roots=%" PetscInt_FMT ", leaves=%" PetscInt_FMT ", remote ranks=%" PetscInt_FMT "\n", rank, sf->nroots, sf->nleaves, sf->nranks));
902       for (i = 0; i < sf->nleaves; i++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %" PetscInt_FMT " <- (%" PetscInt_FMT ",%" PetscInt_FMT ")\n", rank, sf->mine ? sf->mine[i] : i, sf->remote[i].rank, sf->remote[i].index));
903       PetscCall(PetscViewerFlush(viewer));
904       PetscCall(PetscViewerGetFormat(viewer, &format));
905       if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
906         PetscMPIInt *tmpranks, *perm;
907         PetscCall(PetscMalloc2(sf->nranks, &tmpranks, sf->nranks, &perm));
908         PetscCall(PetscArraycpy(tmpranks, sf->ranks, sf->nranks));
909         for (i = 0; i < sf->nranks; i++) perm[i] = i;
910         PetscCall(PetscSortMPIIntWithArray(sf->nranks, tmpranks, perm));
911         PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Roots referenced by my leaves, by rank\n", rank));
912         for (ii = 0; ii < sf->nranks; ii++) {
913           i = perm[ii];
914           PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %d: %" PetscInt_FMT " edges\n", rank, sf->ranks[i], sf->roffset[i + 1] - sf->roffset[i]));
915           for (j = sf->roffset[i]; j < sf->roffset[i + 1]; j++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d]    %" PetscInt_FMT " <- %" PetscInt_FMT "\n", rank, sf->rmine[j], sf->rremote[j]));
916         }
917         PetscCall(PetscFree2(tmpranks, perm));
918       }
919       PetscCall(PetscViewerFlush(viewer));
920       PetscCall(PetscViewerASCIIPopSynchronized(viewer));
921     }
922     PetscCall(PetscViewerASCIIPopTab(viewer));
923   }
924   PetscTryTypeMethod(sf, View, viewer);
925   PetscFunctionReturn(PETSC_SUCCESS);
926 }
927 
928 /*@C
929   PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
930 
931   Not Collective
932 
933   Input Parameter:
934 . sf - star forest
935 
936   Output Parameters:
937 + nranks  - number of ranks referenced by local part
938 . ranks   - [`nranks`] array of ranks
939 . roffset - [`nranks`+1] offset in `rmine`/`rremote` for each rank
940 . rmine   - [`roffset`[`nranks`]] concatenated array holding local indices referencing each remote rank
941 - rremote - [`roffset`[`nranks`]] concatenated array holding remote indices referenced for each remote rank
942 
943   Level: developer
944 
945 .seealso: `PetscSF`, `PetscSFGetLeafRanks()`
946 @*/
947 PetscErrorCode PetscSFGetRootRanks(PetscSF sf, PetscInt *nranks, const PetscMPIInt **ranks, const PetscInt **roffset, const PetscInt **rmine, const PetscInt **rremote)
948 {
949   PetscFunctionBegin;
950   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
951   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
952   if (sf->ops->GetRootRanks) {
953     PetscUseTypeMethod(sf, GetRootRanks, nranks, ranks, roffset, rmine, rremote);
954   } else {
955     /* The generic implementation */
956     if (nranks) *nranks = sf->nranks;
957     if (ranks) *ranks = sf->ranks;
958     if (roffset) *roffset = sf->roffset;
959     if (rmine) *rmine = sf->rmine;
960     if (rremote) *rremote = sf->rremote;
961   }
962   PetscFunctionReturn(PETSC_SUCCESS);
963 }
964 
965 /*@C
966   PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
967 
968   Not Collective
969 
970   Input Parameter:
971 . sf - star forest
972 
973   Output Parameters:
974 + niranks  - number of leaf ranks referencing roots on this process
975 . iranks   - [`niranks`] array of ranks
976 . ioffset  - [`niranks`+1] offset in `irootloc` for each rank
977 - irootloc - [`ioffset`[`niranks`]] concatenated array holding local indices of roots referenced by each leaf rank
978 
979   Level: developer
980 
981 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
982 @*/
983 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf, PetscInt *niranks, const PetscMPIInt **iranks, const PetscInt **ioffset, const PetscInt **irootloc)
984 {
985   PetscFunctionBegin;
986   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
987   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
988   if (sf->ops->GetLeafRanks) {
989     PetscUseTypeMethod(sf, GetLeafRanks, niranks, iranks, ioffset, irootloc);
990   } else {
991     PetscSFType type;
992     PetscCall(PetscSFGetType(sf, &type));
993     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
994   }
995   PetscFunctionReturn(PETSC_SUCCESS);
996 }
997 
998 static PetscBool InList(PetscMPIInt needle, PetscMPIInt n, const PetscMPIInt *list)
999 {
1000   PetscInt i;
1001   for (i = 0; i < n; i++) {
1002     if (needle == list[i]) return PETSC_TRUE;
1003   }
1004   return PETSC_FALSE;
1005 }
1006 
1007 /*@C
1008   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by `PetscSF` implementations.
1009 
1010   Collective
1011 
1012   Input Parameters:
1013 + sf     - `PetscSF` to set up; `PetscSFSetGraph()` must have been called
1014 - dgroup - `MPI_Group` of ranks to be distinguished (e.g., for self or shared memory exchange)
1015 
1016   Level: developer
1017 
1018 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
1019 @*/
1020 PetscErrorCode PetscSFSetUpRanks(PetscSF sf, MPI_Group dgroup)
1021 {
1022   PetscHMapI    table;
1023   PetscHashIter pos;
1024   PetscMPIInt   size, groupsize, *groupranks;
1025   PetscInt     *rcount, *ranks;
1026   PetscInt      i, irank = -1, orank = -1;
1027 
1028   PetscFunctionBegin;
1029   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1030   PetscSFCheckGraphSet(sf, 1);
1031   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
1032   PetscCall(PetscHMapICreateWithSize(10, &table));
1033   for (i = 0; i < sf->nleaves; i++) {
1034     /* Log 1-based rank */
1035     PetscCall(PetscHMapISetWithMode(table, sf->remote[i].rank + 1, 1, ADD_VALUES));
1036   }
1037   PetscCall(PetscHMapIGetSize(table, &sf->nranks));
1038   PetscCall(PetscMalloc4(sf->nranks, &sf->ranks, sf->nranks + 1, &sf->roffset, sf->nleaves, &sf->rmine, sf->nleaves, &sf->rremote));
1039   PetscCall(PetscMalloc2(sf->nranks, &rcount, sf->nranks, &ranks));
1040   PetscHashIterBegin(table, pos);
1041   for (i = 0; i < sf->nranks; i++) {
1042     PetscHashIterGetKey(table, pos, ranks[i]);
1043     PetscHashIterGetVal(table, pos, rcount[i]);
1044     PetscHashIterNext(table, pos);
1045     ranks[i]--; /* Convert back to 0-based */
1046   }
1047   PetscCall(PetscHMapIDestroy(&table));
1048 
1049   /* We expect that dgroup is reliably "small" while nranks could be large */
1050   {
1051     MPI_Group    group = MPI_GROUP_NULL;
1052     PetscMPIInt *dgroupranks;
1053     PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
1054     PetscCallMPI(MPI_Group_size(dgroup, &groupsize));
1055     PetscCall(PetscMalloc1(groupsize, &dgroupranks));
1056     PetscCall(PetscMalloc1(groupsize, &groupranks));
1057     for (i = 0; i < groupsize; i++) dgroupranks[i] = i;
1058     if (groupsize) PetscCallMPI(MPI_Group_translate_ranks(dgroup, groupsize, dgroupranks, group, groupranks));
1059     PetscCallMPI(MPI_Group_free(&group));
1060     PetscCall(PetscFree(dgroupranks));
1061   }
1062 
1063   /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
1064   for (sf->ndranks = 0, i = sf->nranks; sf->ndranks < i;) {
1065     for (i--; sf->ndranks < i; i--) { /* Scan i backward looking for distinguished rank */
1066       if (InList(ranks[i], groupsize, groupranks)) break;
1067     }
1068     for (; sf->ndranks <= i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
1069       if (!InList(ranks[sf->ndranks], groupsize, groupranks)) break;
1070     }
1071     if (sf->ndranks < i) { /* Swap ranks[sf->ndranks] with ranks[i] */
1072       PetscInt tmprank, tmpcount;
1073 
1074       tmprank             = ranks[i];
1075       tmpcount            = rcount[i];
1076       ranks[i]            = ranks[sf->ndranks];
1077       rcount[i]           = rcount[sf->ndranks];
1078       ranks[sf->ndranks]  = tmprank;
1079       rcount[sf->ndranks] = tmpcount;
1080       sf->ndranks++;
1081     }
1082   }
1083   PetscCall(PetscFree(groupranks));
1084   PetscCall(PetscSortIntWithArray(sf->ndranks, ranks, rcount));
1085   if (rcount) PetscCall(PetscSortIntWithArray(sf->nranks - sf->ndranks, ranks + sf->ndranks, rcount + sf->ndranks));
1086   sf->roffset[0] = 0;
1087   for (i = 0; i < sf->nranks; i++) {
1088     PetscCall(PetscMPIIntCast(ranks[i], sf->ranks + i));
1089     sf->roffset[i + 1] = sf->roffset[i] + rcount[i];
1090     rcount[i]          = 0;
1091   }
1092   for (i = 0, irank = -1, orank = -1; i < sf->nleaves; i++) {
1093     /* short circuit */
1094     if (orank != sf->remote[i].rank) {
1095       /* Search for index of iremote[i].rank in sf->ranks */
1096       PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->ndranks, sf->ranks, &irank));
1097       if (irank < 0) {
1098         PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->nranks - sf->ndranks, sf->ranks + sf->ndranks, &irank));
1099         if (irank >= 0) irank += sf->ndranks;
1100       }
1101       orank = sf->remote[i].rank;
1102     }
1103     PetscCheck(irank >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Could not find rank %" PetscInt_FMT " in array", sf->remote[i].rank);
1104     sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
1105     sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
1106     rcount[irank]++;
1107   }
1108   PetscCall(PetscFree2(rcount, ranks));
1109   PetscFunctionReturn(PETSC_SUCCESS);
1110 }
1111 
1112 /*@C
1113   PetscSFGetGroups - gets incoming and outgoing process groups
1114 
1115   Collective
1116 
1117   Input Parameter:
1118 . sf - star forest
1119 
1120   Output Parameters:
1121 + incoming - group of origin processes for incoming edges (leaves that reference my roots)
1122 - outgoing - group of destination processes for outgoing edges (roots that I reference)
1123 
1124   Level: developer
1125 
1126 .seealso: `PetscSF`, `PetscSFGetWindow()`, `PetscSFRestoreWindow()`
1127 @*/
1128 PetscErrorCode PetscSFGetGroups(PetscSF sf, MPI_Group *incoming, MPI_Group *outgoing)
1129 {
1130   MPI_Group group = MPI_GROUP_NULL;
1131 
1132   PetscFunctionBegin;
1133   PetscCheck(sf->nranks >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUpRanks() before obtaining groups");
1134   if (sf->ingroup == MPI_GROUP_NULL) {
1135     PetscInt        i;
1136     const PetscInt *indegree;
1137     PetscMPIInt     rank, *outranks, *inranks;
1138     PetscSFNode    *remote;
1139     PetscSF         bgcount;
1140 
1141     /* Compute the number of incoming ranks */
1142     PetscCall(PetscMalloc1(sf->nranks, &remote));
1143     for (i = 0; i < sf->nranks; i++) {
1144       remote[i].rank  = sf->ranks[i];
1145       remote[i].index = 0;
1146     }
1147     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, &bgcount));
1148     PetscCall(PetscSFSetGraph(bgcount, 1, sf->nranks, NULL, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
1149     PetscCall(PetscSFComputeDegreeBegin(bgcount, &indegree));
1150     PetscCall(PetscSFComputeDegreeEnd(bgcount, &indegree));
1151     /* Enumerate the incoming ranks */
1152     PetscCall(PetscMalloc2(indegree[0], &inranks, sf->nranks, &outranks));
1153     PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
1154     for (i = 0; i < sf->nranks; i++) outranks[i] = rank;
1155     PetscCall(PetscSFGatherBegin(bgcount, MPI_INT, outranks, inranks));
1156     PetscCall(PetscSFGatherEnd(bgcount, MPI_INT, outranks, inranks));
1157     PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
1158     PetscCallMPI(MPI_Group_incl(group, indegree[0], inranks, &sf->ingroup));
1159     PetscCallMPI(MPI_Group_free(&group));
1160     PetscCall(PetscFree2(inranks, outranks));
1161     PetscCall(PetscSFDestroy(&bgcount));
1162   }
1163   *incoming = sf->ingroup;
1164 
1165   if (sf->outgroup == MPI_GROUP_NULL) {
1166     PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
1167     PetscCallMPI(MPI_Group_incl(group, sf->nranks, sf->ranks, &sf->outgroup));
1168     PetscCallMPI(MPI_Group_free(&group));
1169   }
1170   *outgoing = sf->outgroup;
1171   PetscFunctionReturn(PETSC_SUCCESS);
1172 }
1173 
1174 /*@
1175   PetscSFGetRanksSF - gets the `PetscSF` to perform communications with root ranks
1176 
1177   Collective
1178 
1179   Input Parameter:
1180 . sf - star forest
1181 
1182   Output Parameter:
1183 . rsf - the star forest with a single root per process to perform communications
1184 
1185   Level: developer
1186 
1187 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGetRootRanks()`
1188 @*/
1189 PetscErrorCode PetscSFGetRanksSF(PetscSF sf, PetscSF *rsf)
1190 {
1191   PetscFunctionBegin;
1192   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1193   PetscAssertPointer(rsf, 2);
1194   if (!sf->rankssf) {
1195     PetscSFNode       *rremotes;
1196     const PetscMPIInt *ranks;
1197     PetscInt           nranks;
1198 
1199     PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL));
1200     PetscCall(PetscMalloc1(nranks, &rremotes));
1201     for (PetscInt i = 0; i < nranks; i++) {
1202       rremotes[i].rank  = ranks[i];
1203       rremotes[i].index = 0;
1204     }
1205     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, &sf->rankssf));
1206     PetscCall(PetscSFSetGraph(sf->rankssf, 1, nranks, NULL, PETSC_OWN_POINTER, rremotes, PETSC_OWN_POINTER));
1207   }
1208   *rsf = sf->rankssf;
1209   PetscFunctionReturn(PETSC_SUCCESS);
1210 }
1211 
1212 /*@
1213   PetscSFGetMultiSF - gets the inner `PetscSF` implementing gathers and scatters
1214 
1215   Collective
1216 
1217   Input Parameter:
1218 . sf - star forest that may contain roots with 0 or with more than 1 vertex
1219 
1220   Output Parameter:
1221 . multi - star forest with split roots, such that each root has degree exactly 1
1222 
1223   Level: developer
1224 
1225   Note:
1226   In most cases, users should use `PetscSFGatherBegin()` and `PetscSFScatterBegin()` instead of manipulating multi
1227   directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1228   edge, it is a candidate for future optimization that might involve its removal.
1229 
1230 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`, `PetscSFComputeMultiRootOriginalNumbering()`
1231 @*/
1232 PetscErrorCode PetscSFGetMultiSF(PetscSF sf, PetscSF *multi)
1233 {
1234   PetscFunctionBegin;
1235   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1236   PetscAssertPointer(multi, 2);
1237   if (sf->nroots < 0) { /* Graph has not been set yet; why do we need this? */
1238     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
1239     *multi           = sf->multi;
1240     sf->multi->multi = sf->multi;
1241     PetscFunctionReturn(PETSC_SUCCESS);
1242   }
1243   if (!sf->multi) {
1244     const PetscInt *indegree;
1245     PetscInt        i, *inoffset, *outones, *outoffset, maxlocal;
1246     PetscSFNode    *remote;
1247     maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
1248     PetscCall(PetscSFComputeDegreeBegin(sf, &indegree));
1249     PetscCall(PetscSFComputeDegreeEnd(sf, &indegree));
1250     PetscCall(PetscMalloc3(sf->nroots + 1, &inoffset, maxlocal, &outones, maxlocal, &outoffset));
1251     inoffset[0] = 0;
1252     for (i = 0; i < sf->nroots; i++) inoffset[i + 1] = inoffset[i] + indegree[i];
1253     for (i = 0; i < maxlocal; i++) outones[i] = 1;
1254     PetscCall(PetscSFFetchAndOpBegin(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
1255     PetscCall(PetscSFFetchAndOpEnd(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
1256     for (i = 0; i < sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
1257     if (PetscDefined(USE_DEBUG)) {                               /* Check that the expected number of increments occurred */
1258       for (i = 0; i < sf->nroots; i++) PetscCheck(inoffset[i] + indegree[i] == inoffset[i + 1], PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incorrect result after PetscSFFetchAndOp");
1259     }
1260     PetscCall(PetscMalloc1(sf->nleaves, &remote));
1261     for (i = 0; i < sf->nleaves; i++) {
1262       remote[i].rank  = sf->remote[i].rank;
1263       remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
1264     }
1265     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
1266     sf->multi->multi = sf->multi;
1267     PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
1268     if (sf->rankorder) { /* Sort the ranks */
1269       PetscMPIInt  rank;
1270       PetscInt    *inranks, *newoffset, *outranks, *newoutoffset, *tmpoffset, maxdegree;
1271       PetscSFNode *newremote;
1272       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
1273       for (i = 0, maxdegree = 0; i < sf->nroots; i++) maxdegree = PetscMax(maxdegree, indegree[i]);
1274       PetscCall(PetscMalloc5(sf->multi->nroots, &inranks, sf->multi->nroots, &newoffset, maxlocal, &outranks, maxlocal, &newoutoffset, maxdegree, &tmpoffset));
1275       for (i = 0; i < maxlocal; i++) outranks[i] = rank;
1276       PetscCall(PetscSFReduceBegin(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
1277       PetscCall(PetscSFReduceEnd(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
1278       /* Sort the incoming ranks at each vertex, build the inverse map */
1279       for (i = 0; i < sf->nroots; i++) {
1280         PetscInt j;
1281         for (j = 0; j < indegree[i]; j++) tmpoffset[j] = j;
1282         PetscCall(PetscSortIntWithArray(indegree[i], PetscSafePointerPlusOffset(inranks, inoffset[i]), tmpoffset));
1283         for (j = 0; j < indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
1284       }
1285       PetscCall(PetscSFBcastBegin(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
1286       PetscCall(PetscSFBcastEnd(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
1287       PetscCall(PetscMalloc1(sf->nleaves, &newremote));
1288       for (i = 0; i < sf->nleaves; i++) {
1289         newremote[i].rank  = sf->remote[i].rank;
1290         newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
1291       }
1292       PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, newremote, PETSC_OWN_POINTER));
1293       PetscCall(PetscFree5(inranks, newoffset, outranks, newoutoffset, tmpoffset));
1294     }
1295     PetscCall(PetscFree3(inoffset, outones, outoffset));
1296   }
1297   *multi = sf->multi;
1298   PetscFunctionReturn(PETSC_SUCCESS);
1299 }
1300 
1301 /*@C
1302   PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots of a `PetscSF`, does not remap indices
1303 
1304   Collective
1305 
1306   Input Parameters:
1307 + sf        - original star forest
1308 . nselected - number of selected roots on this process
1309 - selected  - indices of the selected roots on this process
1310 
1311   Output Parameter:
1312 . esf - new star forest
1313 
1314   Level: advanced
1315 
1316   Note:
1317   To use the new `PetscSF`, it may be necessary to know the indices of the leaves that are still participating. This can
1318   be done by calling PetscSFGetGraph().
1319 
1320 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1321 @*/
1322 PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *esf)
1323 {
1324   PetscInt           i, j, n, nroots, nleaves, esf_nleaves, *new_ilocal, minleaf, maxleaf, maxlocal;
1325   const PetscInt    *ilocal;
1326   signed char       *rootdata, *leafdata, *leafmem;
1327   const PetscSFNode *iremote;
1328   PetscSFNode       *new_iremote;
1329   MPI_Comm           comm;
1330 
1331   PetscFunctionBegin;
1332   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1333   PetscSFCheckGraphSet(sf, 1);
1334   if (nselected) PetscAssertPointer(selected, 3);
1335   PetscAssertPointer(esf, 4);
1336 
1337   PetscCall(PetscSFSetUp(sf));
1338   PetscCall(PetscLogEventBegin(PETSCSF_EmbedSF, sf, 0, 0, 0));
1339   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1340   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
1341 
1342   if (PetscDefined(USE_DEBUG)) { /* Error out if selected[] has dups or out of range indices */
1343     PetscBool dups;
1344     PetscCall(PetscCheckDupsInt(nselected, selected, &dups));
1345     PetscCheck(!dups, comm, PETSC_ERR_ARG_WRONG, "selected[] has dups");
1346     for (i = 0; i < nselected; i++) PetscCheck(selected[i] >= 0 && selected[i] < nroots, comm, PETSC_ERR_ARG_OUTOFRANGE, "selected root index %" PetscInt_FMT " is out of [0,%" PetscInt_FMT ")", selected[i], nroots);
1347   }
1348 
1349   if (sf->ops->CreateEmbeddedRootSF) PetscUseTypeMethod(sf, CreateEmbeddedRootSF, nselected, selected, esf);
1350   else {
1351     /* A generic version of creating embedded sf */
1352     PetscCall(PetscSFGetLeafRange(sf, &minleaf, &maxleaf));
1353     maxlocal = maxleaf - minleaf + 1;
1354     PetscCall(PetscCalloc2(nroots, &rootdata, maxlocal, &leafmem));
1355     leafdata = PetscSafePointerPlusOffset(leafmem, -minleaf);
1356     /* Tag selected roots and bcast to leaves */
1357     for (i = 0; i < nselected; i++) rootdata[selected[i]] = 1;
1358     PetscCall(PetscSFBcastBegin(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
1359     PetscCall(PetscSFBcastEnd(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
1360 
1361     /* Build esf with leaves that are still connected */
1362     esf_nleaves = 0;
1363     for (i = 0; i < nleaves; i++) {
1364       j = ilocal ? ilocal[i] : i;
1365       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1366          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1367       */
1368       esf_nleaves += (leafdata[j] ? 1 : 0);
1369     }
1370     PetscCall(PetscMalloc1(esf_nleaves, &new_ilocal));
1371     PetscCall(PetscMalloc1(esf_nleaves, &new_iremote));
1372     for (i = n = 0; i < nleaves; i++) {
1373       j = ilocal ? ilocal[i] : i;
1374       if (leafdata[j]) {
1375         new_ilocal[n]        = j;
1376         new_iremote[n].rank  = iremote[i].rank;
1377         new_iremote[n].index = iremote[i].index;
1378         ++n;
1379       }
1380     }
1381     PetscCall(PetscSFCreate(comm, esf));
1382     PetscCall(PetscSFSetFromOptions(*esf));
1383     PetscCall(PetscSFSetGraph(*esf, nroots, esf_nleaves, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1384     PetscCall(PetscFree2(rootdata, leafmem));
1385   }
1386   PetscCall(PetscLogEventEnd(PETSCSF_EmbedSF, sf, 0, 0, 0));
1387   PetscFunctionReturn(PETSC_SUCCESS);
1388 }
1389 
1390 /*@C
1391   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves of a `PetscSF`, does not remap indices
1392 
1393   Collective
1394 
1395   Input Parameters:
1396 + sf        - original star forest
1397 . nselected - number of selected leaves on this process
1398 - selected  - indices of the selected leaves on this process
1399 
1400   Output Parameter:
1401 . newsf - new star forest
1402 
1403   Level: advanced
1404 
1405 .seealso: `PetscSF`, `PetscSFCreateEmbeddedRootSF()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1406 @*/
1407 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *newsf)
1408 {
1409   const PetscSFNode *iremote;
1410   PetscSFNode       *new_iremote;
1411   const PetscInt    *ilocal;
1412   PetscInt           i, nroots, *leaves, *new_ilocal;
1413   MPI_Comm           comm;
1414 
1415   PetscFunctionBegin;
1416   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1417   PetscSFCheckGraphSet(sf, 1);
1418   if (nselected) PetscAssertPointer(selected, 3);
1419   PetscAssertPointer(newsf, 4);
1420 
1421   /* Uniq selected[] and put results in leaves[] */
1422   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1423   PetscCall(PetscMalloc1(nselected, &leaves));
1424   PetscCall(PetscArraycpy(leaves, selected, nselected));
1425   PetscCall(PetscSortedRemoveDupsInt(&nselected, leaves));
1426   PetscCheck(!nselected || !(leaves[0] < 0 || leaves[nselected - 1] >= sf->nleaves), comm, PETSC_ERR_ARG_OUTOFRANGE, "Min/Max leaf indices %" PetscInt_FMT "/%" PetscInt_FMT " are not in [0,%" PetscInt_FMT ")", leaves[0], leaves[nselected - 1], sf->nleaves);
1427 
1428   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1429   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) PetscUseTypeMethod(sf, CreateEmbeddedLeafSF, nselected, leaves, newsf);
1430   else {
1431     PetscCall(PetscSFGetGraph(sf, &nroots, NULL, &ilocal, &iremote));
1432     PetscCall(PetscMalloc1(nselected, &new_ilocal));
1433     PetscCall(PetscMalloc1(nselected, &new_iremote));
1434     for (i = 0; i < nselected; ++i) {
1435       const PetscInt l     = leaves[i];
1436       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1437       new_iremote[i].rank  = iremote[l].rank;
1438       new_iremote[i].index = iremote[l].index;
1439     }
1440     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, newsf));
1441     PetscCall(PetscSFSetGraph(*newsf, nroots, nselected, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1442   }
1443   PetscCall(PetscFree(leaves));
1444   PetscFunctionReturn(PETSC_SUCCESS);
1445 }
1446 
1447 /*@C
1448   PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to `PetscSFBcastEnd()`
1449 
1450   Collective
1451 
1452   Input Parameters:
1453 + sf       - star forest on which to communicate
1454 . unit     - data type associated with each node
1455 . rootdata - buffer to broadcast
1456 - op       - operation to use for reduction
1457 
1458   Output Parameter:
1459 . leafdata - buffer to be reduced with values from each leaf's respective root
1460 
1461   Level: intermediate
1462 
1463   Note:
1464   When PETSc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1465   are host pointers or device pointers, which may incur a noticeable cost. If you already knew the info, you should
1466   use `PetscSFBcastWithMemTypeBegin()` instead.
1467 
1468 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastWithMemTypeBegin()`
1469 @*/
1470 PetscErrorCode PetscSFBcastBegin(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
1471 {
1472   PetscMemType rootmtype, leafmtype;
1473 
1474   PetscFunctionBegin;
1475   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1476   PetscCall(PetscSFSetUp(sf));
1477   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
1478   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1479   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1480   PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
1481   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
1482   PetscFunctionReturn(PETSC_SUCCESS);
1483 }
1484 
1485 /*@C
1486   PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call
1487   to `PetscSFBcastEnd()`
1488 
1489   Collective
1490 
1491   Input Parameters:
1492 + sf        - star forest on which to communicate
1493 . unit      - data type associated with each node
1494 . rootmtype - memory type of rootdata
1495 . rootdata  - buffer to broadcast
1496 . leafmtype - memory type of leafdata
1497 - op        - operation to use for reduction
1498 
1499   Output Parameter:
1500 . leafdata - buffer to be reduced with values from each leaf's respective root
1501 
1502   Level: intermediate
1503 
1504 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastBegin()`
1505 @*/
1506 PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
1507 {
1508   PetscFunctionBegin;
1509   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1510   PetscCall(PetscSFSetUp(sf));
1511   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
1512   PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
1513   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
1514   PetscFunctionReturn(PETSC_SUCCESS);
1515 }
1516 
1517 /*@C
1518   PetscSFBcastEnd - end a broadcast and reduce operation started with `PetscSFBcastBegin()` or `PetscSFBcastWithMemTypeBegin()`
1519 
1520   Collective
1521 
1522   Input Parameters:
1523 + sf       - star forest
1524 . unit     - data type
1525 . rootdata - buffer to broadcast
1526 - op       - operation to use for reduction
1527 
1528   Output Parameter:
1529 . leafdata - buffer to be reduced with values from each leaf's respective root
1530 
1531   Level: intermediate
1532 
1533 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFReduceEnd()`
1534 @*/
1535 PetscErrorCode PetscSFBcastEnd(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
1536 {
1537   PetscFunctionBegin;
1538   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1539   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastEnd, sf, 0, 0, 0));
1540   PetscUseTypeMethod(sf, BcastEnd, unit, rootdata, leafdata, op);
1541   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastEnd, sf, 0, 0, 0));
1542   PetscFunctionReturn(PETSC_SUCCESS);
1543 }
1544 
1545 /*@C
1546   PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to `PetscSFReduceEnd()`
1547 
1548   Collective
1549 
1550   Input Parameters:
1551 + sf       - star forest
1552 . unit     - data type
1553 . leafdata - values to reduce
1554 - op       - reduction operation
1555 
1556   Output Parameter:
1557 . rootdata - result of reduction of values from all leaves of each root
1558 
1559   Level: intermediate
1560 
1561   Note:
1562   When PETSc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1563   are host pointers or device pointers, which may incur a noticeable cost. If you already knew the info, you should
1564   use `PetscSFReduceWithMemTypeBegin()` instead.
1565 
1566 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceWithMemTypeBegin()`, `PetscSFReduceEnd()`
1567 @*/
1568 PetscErrorCode PetscSFReduceBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1569 {
1570   PetscMemType rootmtype, leafmtype;
1571 
1572   PetscFunctionBegin;
1573   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1574   PetscCall(PetscSFSetUp(sf));
1575   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1576   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1577   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1578   PetscCall(sf->ops->ReduceBegin(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1579   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1580   PetscFunctionReturn(PETSC_SUCCESS);
1581 }
1582 
1583 /*@C
1584   PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to `PetscSFReduceEnd()`
1585 
1586   Collective
1587 
1588   Input Parameters:
1589 + sf        - star forest
1590 . unit      - data type
1591 . leafmtype - memory type of leafdata
1592 . leafdata  - values to reduce
1593 . rootmtype - memory type of rootdata
1594 - op        - reduction operation
1595 
1596   Output Parameter:
1597 . rootdata - result of reduction of values from all leaves of each root
1598 
1599   Level: intermediate
1600 
1601 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceBegin()`, `PetscSFReduceEnd()`
1602 @*/
1603 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
1604 {
1605   PetscFunctionBegin;
1606   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1607   PetscCall(PetscSFSetUp(sf));
1608   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1609   PetscCall(sf->ops->ReduceBegin(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1610   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1611   PetscFunctionReturn(PETSC_SUCCESS);
1612 }
1613 
1614 /*@C
1615   PetscSFReduceEnd - end a reduction operation started with `PetscSFReduceBegin()` or `PetscSFReduceWithMemTypeBegin()`
1616 
1617   Collective
1618 
1619   Input Parameters:
1620 + sf       - star forest
1621 . unit     - data type
1622 . leafdata - values to reduce
1623 - op       - reduction operation
1624 
1625   Output Parameter:
1626 . rootdata - result of reduction of values from all leaves of each root
1627 
1628   Level: intermediate
1629 
1630 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFBcastEnd()`, `PetscSFReduceBegin()`, `PetscSFReduceWithMemTypeBegin()`
1631 @*/
1632 PetscErrorCode PetscSFReduceEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1633 {
1634   PetscFunctionBegin;
1635   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1636   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceEnd, sf, 0, 0, 0));
1637   PetscUseTypeMethod(sf, ReduceEnd, unit, leafdata, rootdata, op);
1638   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceEnd, sf, 0, 0, 0));
1639   PetscFunctionReturn(PETSC_SUCCESS);
1640 }
1641 
1642 /*@C
1643   PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value,
1644   to be completed with `PetscSFFetchAndOpEnd()`
1645 
1646   Collective
1647 
1648   Input Parameters:
1649 + sf       - star forest
1650 . unit     - data type
1651 . leafdata - leaf values to use in reduction
1652 - op       - operation to use for reduction
1653 
1654   Output Parameters:
1655 + rootdata   - root values to be updated, input state is seen by first process to perform an update
1656 - leafupdate - state at each leaf's respective root immediately prior to my atomic update
1657 
1658   Level: advanced
1659 
1660   Note:
1661   The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1662   might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1663   not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1664   integers.
1665 
1666 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`
1667 @*/
1668 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1669 {
1670   PetscMemType rootmtype, leafmtype, leafupdatemtype;
1671 
1672   PetscFunctionBegin;
1673   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1674   PetscCall(PetscSFSetUp(sf));
1675   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1676   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1677   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1678   PetscCall(PetscGetMemType(leafupdate, &leafupdatemtype));
1679   PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
1680   PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
1681   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1682   PetscFunctionReturn(PETSC_SUCCESS);
1683 }
1684 
1685 /*@C
1686   PetscSFFetchAndOpWithMemTypeBegin - begin operation with explicit memory types that fetches values from root and updates atomically by
1687   applying operation using my leaf value, to be completed with `PetscSFFetchAndOpEnd()`
1688 
1689   Collective
1690 
1691   Input Parameters:
1692 + sf              - star forest
1693 . unit            - data type
1694 . rootmtype       - memory type of rootdata
1695 . leafmtype       - memory type of leafdata
1696 . leafdata        - leaf values to use in reduction
1697 . leafupdatemtype - memory type of leafupdate
1698 - op              - operation to use for reduction
1699 
1700   Output Parameters:
1701 + rootdata   - root values to be updated, input state is seen by first process to perform an update
1702 - leafupdate - state at each leaf's respective root immediately prior to my atomic update
1703 
1704   Level: advanced
1705 
1706   Note:
1707   See `PetscSFFetchAndOpBegin()` for more details.
1708 
1709 .seealso: `PetscSF`, `PetscSFFetchAndOpBegin()`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`, `PetscSFFetchAndOpEnd()`
1710 @*/
1711 PetscErrorCode PetscSFFetchAndOpWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, PetscMemType leafupdatemtype, void *leafupdate, MPI_Op op)
1712 {
1713   PetscFunctionBegin;
1714   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1715   PetscCall(PetscSFSetUp(sf));
1716   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1717   PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
1718   PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
1719   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1720   PetscFunctionReturn(PETSC_SUCCESS);
1721 }
1722 
1723 /*@C
1724   PetscSFFetchAndOpEnd - end operation started in matching call to `PetscSFFetchAndOpBegin()` or `PetscSFFetchAndOpWithMemTypeBegin()`
1725   to fetch values from roots and update atomically by applying operation using my leaf value
1726 
1727   Collective
1728 
1729   Input Parameters:
1730 + sf       - star forest
1731 . unit     - data type
1732 . leafdata - leaf values to use in reduction
1733 - op       - operation to use for reduction
1734 
1735   Output Parameters:
1736 + rootdata   - root values to be updated, input state is seen by first process to perform an update
1737 - leafupdate - state at each leaf's respective root immediately prior to my atomic update
1738 
1739   Level: advanced
1740 
1741 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFReduceEnd()`, `PetscSFSetGraph()`, `PetscSFFetchAndOpBegin()`, `PetscSFFetchAndOpWithMemTypeBegin()`
1742 @*/
1743 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1744 {
1745   PetscFunctionBegin;
1746   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1747   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
1748   PetscUseTypeMethod(sf, FetchAndOpEnd, unit, rootdata, leafdata, leafupdate, op);
1749   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
1750   PetscFunctionReturn(PETSC_SUCCESS);
1751 }
1752 
1753 /*@C
1754   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with `PetscSFComputeDegreeEnd()`
1755 
1756   Collective
1757 
1758   Input Parameter:
1759 . sf - star forest
1760 
1761   Output Parameter:
1762 . degree - degree of each root vertex
1763 
1764   Level: advanced
1765 
1766   Note:
1767   The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence there is no need to call `PetscFree()` on it.
1768 
1769 .seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeEnd()`
1770 @*/
1771 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf, const PetscInt **degree)
1772 {
1773   PetscFunctionBegin;
1774   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1775   PetscSFCheckGraphSet(sf, 1);
1776   PetscAssertPointer(degree, 2);
1777   if (!sf->degreeknown) {
1778     PetscInt i, nroots = sf->nroots, maxlocal;
1779     PetscCheck(!sf->degree, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1780     maxlocal = sf->maxleaf - sf->minleaf + 1;
1781     PetscCall(PetscMalloc1(nroots, &sf->degree));
1782     PetscCall(PetscMalloc1(PetscMax(maxlocal, 1), &sf->degreetmp)); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1783     for (i = 0; i < nroots; i++) sf->degree[i] = 0;
1784     for (i = 0; i < maxlocal; i++) sf->degreetmp[i] = 1;
1785     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
1786   }
1787   *degree = NULL;
1788   PetscFunctionReturn(PETSC_SUCCESS);
1789 }
1790 
1791 /*@C
1792   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with `PetscSFComputeDegreeBegin()`
1793 
1794   Collective
1795 
1796   Input Parameter:
1797 . sf - star forest
1798 
1799   Output Parameter:
1800 . degree - degree of each root vertex
1801 
1802   Level: developer
1803 
1804   Note:
1805   The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence there is no need to call `PetscFree()` on it.
1806 
1807 .seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeBegin()`
1808 @*/
1809 PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf, const PetscInt **degree)
1810 {
1811   PetscFunctionBegin;
1812   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1813   PetscSFCheckGraphSet(sf, 1);
1814   PetscAssertPointer(degree, 2);
1815   if (!sf->degreeknown) {
1816     PetscCheck(sf->degreetmp, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
1817     PetscCall(PetscSFReduceEnd(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
1818     PetscCall(PetscFree(sf->degreetmp));
1819     sf->degreeknown = PETSC_TRUE;
1820   }
1821   *degree = sf->degree;
1822   PetscFunctionReturn(PETSC_SUCCESS);
1823 }
1824 
1825 /*@C
1826   PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-`PetscSF` returned by `PetscSFGetMultiSF()`).
1827   Each multi-root is assigned index of the corresponding original root.
1828 
1829   Collective
1830 
1831   Input Parameters:
1832 + sf     - star forest
1833 - degree - degree of each root vertex, computed with `PetscSFComputeDegreeBegin()`/`PetscSFComputeDegreeEnd()`
1834 
1835   Output Parameters:
1836 + nMultiRoots             - (optional) number of multi-roots (roots of multi-`PetscSF`)
1837 - multiRootsOrigNumbering - original indices of multi-roots; length of this array is `nMultiRoots`
1838 
1839   Level: developer
1840 
1841   Note:
1842   The returned array `multiRootsOrigNumbering` is newly allocated and should be destroyed with `PetscFree()` when no longer needed.
1843 
1844 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFComputeDegreeEnd()`, `PetscSFGetMultiSF()`
1845 @*/
1846 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1847 {
1848   PetscSF  msf;
1849   PetscInt i, j, k, nroots, nmroots;
1850 
1851   PetscFunctionBegin;
1852   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1853   PetscCall(PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL));
1854   if (nroots) PetscAssertPointer(degree, 2);
1855   if (nMultiRoots) PetscAssertPointer(nMultiRoots, 3);
1856   PetscAssertPointer(multiRootsOrigNumbering, 4);
1857   PetscCall(PetscSFGetMultiSF(sf, &msf));
1858   PetscCall(PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL));
1859   PetscCall(PetscMalloc1(nmroots, multiRootsOrigNumbering));
1860   for (i = 0, j = 0, k = 0; i < nroots; i++) {
1861     if (!degree[i]) continue;
1862     for (j = 0; j < degree[i]; j++, k++) (*multiRootsOrigNumbering)[k] = i;
1863   }
1864   PetscCheck(k == nmroots, PETSC_COMM_SELF, PETSC_ERR_PLIB, "sanity check fail");
1865   if (nMultiRoots) *nMultiRoots = nmroots;
1866   PetscFunctionReturn(PETSC_SUCCESS);
1867 }
1868 
1869 /*@C
1870   PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with `PetscSFGatherEnd()`
1871 
1872   Collective
1873 
1874   Input Parameters:
1875 + sf       - star forest
1876 . unit     - data type
1877 - leafdata - leaf data to gather to roots
1878 
1879   Output Parameter:
1880 . multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1881 
1882   Level: intermediate
1883 
1884 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterBegin()`
1885 @*/
1886 PetscErrorCode PetscSFGatherBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1887 {
1888   PetscSF multi = NULL;
1889 
1890   PetscFunctionBegin;
1891   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1892   PetscCall(PetscSFSetUp(sf));
1893   PetscCall(PetscSFGetMultiSF(sf, &multi));
1894   PetscCall(PetscSFReduceBegin(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1895   PetscFunctionReturn(PETSC_SUCCESS);
1896 }
1897 
1898 /*@C
1899   PetscSFGatherEnd - ends pointwise gather operation that was started with `PetscSFGatherBegin()`
1900 
1901   Collective
1902 
1903   Input Parameters:
1904 + sf       - star forest
1905 . unit     - data type
1906 - leafdata - leaf data to gather to roots
1907 
1908   Output Parameter:
1909 . multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1910 
1911   Level: intermediate
1912 
1913 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterEnd()`
1914 @*/
1915 PetscErrorCode PetscSFGatherEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1916 {
1917   PetscSF multi = NULL;
1918 
1919   PetscFunctionBegin;
1920   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1921   PetscCall(PetscSFGetMultiSF(sf, &multi));
1922   PetscCall(PetscSFReduceEnd(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1923   PetscFunctionReturn(PETSC_SUCCESS);
1924 }
1925 
1926 /*@C
1927   PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with `PetscSFScatterEnd()`
1928 
1929   Collective
1930 
1931   Input Parameters:
1932 + sf            - star forest
1933 . unit          - data type
1934 - multirootdata - root buffer to send to each leaf, one unit of data per leaf
1935 
1936   Output Parameter:
1937 . leafdata - leaf data to be update with personal data from each respective root
1938 
1939   Level: intermediate
1940 
1941 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterEnd()`
1942 @*/
1943 PetscErrorCode PetscSFScatterBegin(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1944 {
1945   PetscSF multi = NULL;
1946 
1947   PetscFunctionBegin;
1948   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1949   PetscCall(PetscSFSetUp(sf));
1950   PetscCall(PetscSFGetMultiSF(sf, &multi));
1951   PetscCall(PetscSFBcastBegin(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1952   PetscFunctionReturn(PETSC_SUCCESS);
1953 }
1954 
1955 /*@C
1956   PetscSFScatterEnd - ends pointwise scatter operation that was started with `PetscSFScatterBegin()`
1957 
1958   Collective
1959 
1960   Input Parameters:
1961 + sf            - star forest
1962 . unit          - data type
1963 - multirootdata - root buffer to send to each leaf, one unit of data per leaf
1964 
1965   Output Parameter:
1966 . leafdata - leaf data to be update with personal data from each respective root
1967 
1968   Level: intermediate
1969 
1970 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterBegin()`
1971 @*/
1972 PetscErrorCode PetscSFScatterEnd(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1973 {
1974   PetscSF multi = NULL;
1975 
1976   PetscFunctionBegin;
1977   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1978   PetscCall(PetscSFGetMultiSF(sf, &multi));
1979   PetscCall(PetscSFBcastEnd(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1980   PetscFunctionReturn(PETSC_SUCCESS);
1981 }
1982 
1983 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1984 {
1985   PetscInt        i, n, nleaves;
1986   const PetscInt *ilocal = NULL;
1987   PetscHSetI      seen;
1988 
1989   PetscFunctionBegin;
1990   if (PetscDefined(USE_DEBUG)) {
1991     PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, NULL));
1992     PetscCall(PetscHSetICreate(&seen));
1993     for (i = 0; i < nleaves; i++) {
1994       const PetscInt leaf = ilocal ? ilocal[i] : i;
1995       PetscCall(PetscHSetIAdd(seen, leaf));
1996     }
1997     PetscCall(PetscHSetIGetSize(seen, &n));
1998     PetscCheck(n == nleaves, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided leaves have repeated values: all leaves must be unique");
1999     PetscCall(PetscHSetIDestroy(&seen));
2000   }
2001   PetscFunctionReturn(PETSC_SUCCESS);
2002 }
2003 
2004 /*@
2005   PetscSFCompose - Compose a new `PetscSF` by putting the second `PetscSF` under the first one in a top (roots) down (leaves) view
2006 
2007   Input Parameters:
2008 + sfA - The first `PetscSF`
2009 - sfB - The second `PetscSF`
2010 
2011   Output Parameter:
2012 . sfBA - The composite `PetscSF`
2013 
2014   Level: developer
2015 
2016   Notes:
2017   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
2018   forests, i.e. the same leaf is not connected with different roots.
2019 
2020   `sfA`'s leaf space and `sfB`'s root space might be partially overlapped. The composition builds
2021   a graph with `sfA`'s roots and `sfB`'s leaves only when there is a path between them. Unconnected
2022   nodes (roots or leaves) are not in `sfBA`. Doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on the new `PetscSF` is equivalent to doing a
2023   `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on `sfA`, then a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on `sfB`, on connected nodes.
2024 
2025 .seealso: `PetscSF`, `PetscSFComposeInverse()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`
2026 @*/
2027 PetscErrorCode PetscSFCompose(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
2028 {
2029   const PetscSFNode *remotePointsA, *remotePointsB;
2030   PetscSFNode       *remotePointsBA = NULL, *reorderedRemotePointsA = NULL, *leafdataB;
2031   const PetscInt    *localPointsA, *localPointsB;
2032   PetscInt          *localPointsBA;
2033   PetscInt           i, numRootsA, numLeavesA, numRootsB, numLeavesB, minleaf, maxleaf, numLeavesBA;
2034   PetscBool          denseB;
2035 
2036   PetscFunctionBegin;
2037   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
2038   PetscSFCheckGraphSet(sfA, 1);
2039   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
2040   PetscSFCheckGraphSet(sfB, 2);
2041   PetscCheckSameComm(sfA, 1, sfB, 2);
2042   PetscAssertPointer(sfBA, 3);
2043   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
2044   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
2045 
2046   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
2047   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
2048   /* Make sure that PetscSFBcast{Begin, End}(sfB, ...) works with root data of size
2049      numRootsB; otherwise, garbage will be broadcasted.
2050      Example (comm size = 1):
2051      sfA: 0 <- (0, 0)
2052      sfB: 100 <- (0, 0)
2053           101 <- (0, 1)
2054      Here, we have remotePointsA = [(0, 0)], but for remotePointsA to be a valid tartget
2055      of sfB, it has to be recasted as [(0, 0), (-1, -1)] so that points 100 and 101 would
2056      receive (0, 0) and (-1, -1), respectively, when PetscSFBcast(sfB, ...) is called on
2057      remotePointsA; if not recasted, point 101 would receive a garbage value.             */
2058   PetscCall(PetscMalloc1(numRootsB, &reorderedRemotePointsA));
2059   for (i = 0; i < numRootsB; i++) {
2060     reorderedRemotePointsA[i].rank  = -1;
2061     reorderedRemotePointsA[i].index = -1;
2062   }
2063   for (i = 0; i < numLeavesA; i++) {
2064     PetscInt localp = localPointsA ? localPointsA[i] : i;
2065 
2066     if (localp >= numRootsB) continue;
2067     reorderedRemotePointsA[localp] = remotePointsA[i];
2068   }
2069   remotePointsA = reorderedRemotePointsA;
2070   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2071   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &leafdataB));
2072   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2073     leafdataB[i].rank  = -1;
2074     leafdataB[i].index = -1;
2075   }
2076   PetscCall(PetscSFBcastBegin(sfB, MPIU_2INT, remotePointsA, PetscSafePointerPlusOffset(leafdataB, -minleaf), MPI_REPLACE));
2077   PetscCall(PetscSFBcastEnd(sfB, MPIU_2INT, remotePointsA, PetscSafePointerPlusOffset(leafdataB, -minleaf), MPI_REPLACE));
2078   PetscCall(PetscFree(reorderedRemotePointsA));
2079 
2080   denseB = (PetscBool)!localPointsB;
2081   for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2082     if (leafdataB[localPointsB ? localPointsB[i] - minleaf : i].rank == -1) denseB = PETSC_FALSE;
2083     else numLeavesBA++;
2084   }
2085   if (denseB) {
2086     localPointsBA  = NULL;
2087     remotePointsBA = leafdataB;
2088   } else {
2089     PetscCall(PetscMalloc1(numLeavesBA, &localPointsBA));
2090     PetscCall(PetscMalloc1(numLeavesBA, &remotePointsBA));
2091     for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2092       const PetscInt l = localPointsB ? localPointsB[i] : i;
2093 
2094       if (leafdataB[l - minleaf].rank == -1) continue;
2095       remotePointsBA[numLeavesBA] = leafdataB[l - minleaf];
2096       localPointsBA[numLeavesBA]  = l;
2097       numLeavesBA++;
2098     }
2099     PetscCall(PetscFree(leafdataB));
2100   }
2101   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2102   PetscCall(PetscSFSetFromOptions(*sfBA));
2103   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2104   PetscFunctionReturn(PETSC_SUCCESS);
2105 }
2106 
2107 /*@
2108   PetscSFComposeInverse - Compose a new `PetscSF` by putting the inverse of the second `PetscSF` under the first one
2109 
2110   Input Parameters:
2111 + sfA - The first `PetscSF`
2112 - sfB - The second `PetscSF`
2113 
2114   Output Parameter:
2115 . sfBA - The composite `PetscSF`.
2116 
2117   Level: developer
2118 
2119   Notes:
2120   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
2121   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
2122   second `PetscSF` must have a degree of 1, i.e., no roots have more than one leaf connected.
2123 
2124   `sfA`'s leaf space and `sfB`'s leaf space might be partially overlapped. The composition builds
2125   a graph with `sfA`'s roots and `sfB`'s roots only when there is a path between them. Unconnected
2126   roots are not in `sfBA`. Doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on the new `PetscSF` is equivalent to doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()`
2127   on `sfA`, then
2128   a `PetscSFReduceBegin()`/`PetscSFReduceEnd()` on `sfB`, on connected roots.
2129 
2130 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFCreateInverseSF()`
2131 @*/
2132 PetscErrorCode PetscSFComposeInverse(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
2133 {
2134   const PetscSFNode *remotePointsA, *remotePointsB;
2135   PetscSFNode       *remotePointsBA;
2136   const PetscInt    *localPointsA, *localPointsB;
2137   PetscSFNode       *reorderedRemotePointsA = NULL;
2138   PetscInt           i, numRootsA, numLeavesA, numLeavesBA, numRootsB, numLeavesB, minleaf, maxleaf, *localPointsBA;
2139   MPI_Op             op;
2140 #if defined(PETSC_USE_64BIT_INDICES)
2141   PetscBool iswin;
2142 #endif
2143 
2144   PetscFunctionBegin;
2145   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
2146   PetscSFCheckGraphSet(sfA, 1);
2147   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
2148   PetscSFCheckGraphSet(sfB, 2);
2149   PetscCheckSameComm(sfA, 1, sfB, 2);
2150   PetscAssertPointer(sfBA, 3);
2151   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
2152   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
2153 
2154   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
2155   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
2156 
2157   /* TODO: Check roots of sfB have degree of 1 */
2158   /* Once we implement it, we can replace the MPI_MAXLOC
2159      with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
2160      We use MPI_MAXLOC only to have a deterministic output from this routine if
2161      the root condition is not meet.
2162    */
2163   op = MPI_MAXLOC;
2164 #if defined(PETSC_USE_64BIT_INDICES)
2165   /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
2166   PetscCall(PetscObjectTypeCompare((PetscObject)sfB, PETSCSFWINDOW, &iswin));
2167   if (iswin) op = MPI_REPLACE;
2168 #endif
2169 
2170   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2171   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &reorderedRemotePointsA));
2172   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2173     reorderedRemotePointsA[i].rank  = -1;
2174     reorderedRemotePointsA[i].index = -1;
2175   }
2176   if (localPointsA) {
2177     for (i = 0; i < numLeavesA; i++) {
2178       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
2179       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
2180     }
2181   } else {
2182     for (i = 0; i < numLeavesA; i++) {
2183       if (i > maxleaf || i < minleaf) continue;
2184       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
2185     }
2186   }
2187 
2188   PetscCall(PetscMalloc1(numRootsB, &localPointsBA));
2189   PetscCall(PetscMalloc1(numRootsB, &remotePointsBA));
2190   for (i = 0; i < numRootsB; i++) {
2191     remotePointsBA[i].rank  = -1;
2192     remotePointsBA[i].index = -1;
2193   }
2194 
2195   PetscCall(PetscSFReduceBegin(sfB, MPIU_2INT, PetscSafePointerPlusOffset(reorderedRemotePointsA, -minleaf), remotePointsBA, op));
2196   PetscCall(PetscSFReduceEnd(sfB, MPIU_2INT, PetscSafePointerPlusOffset(reorderedRemotePointsA, -minleaf), remotePointsBA, op));
2197   PetscCall(PetscFree(reorderedRemotePointsA));
2198   for (i = 0, numLeavesBA = 0; i < numRootsB; i++) {
2199     if (remotePointsBA[i].rank == -1) continue;
2200     remotePointsBA[numLeavesBA].rank  = remotePointsBA[i].rank;
2201     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
2202     localPointsBA[numLeavesBA]        = i;
2203     numLeavesBA++;
2204   }
2205   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2206   PetscCall(PetscSFSetFromOptions(*sfBA));
2207   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2208   PetscFunctionReturn(PETSC_SUCCESS);
2209 }
2210 
2211 /*
2212   PetscSFCreateLocalSF_Private - Creates a local `PetscSF` that only has intra-process edges of the global `PetscSF`
2213 
2214   Input Parameter:
2215 . sf - The global `PetscSF`
2216 
2217   Output Parameter:
2218 . out - The local `PetscSF`
2219 
2220 .seealso: `PetscSF`, `PetscSFCreate()`
2221  */
2222 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf, PetscSF *out)
2223 {
2224   MPI_Comm           comm;
2225   PetscMPIInt        myrank;
2226   const PetscInt    *ilocal;
2227   const PetscSFNode *iremote;
2228   PetscInt           i, j, nroots, nleaves, lnleaves, *lilocal;
2229   PetscSFNode       *liremote;
2230   PetscSF            lsf;
2231 
2232   PetscFunctionBegin;
2233   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2234   if (sf->ops->CreateLocalSF) PetscUseTypeMethod(sf, CreateLocalSF, out);
2235   else {
2236     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2237     PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
2238     PetscCallMPI(MPI_Comm_rank(comm, &myrank));
2239 
2240     /* Find out local edges and build a local SF */
2241     PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
2242     for (i = lnleaves = 0; i < nleaves; i++) {
2243       if (iremote[i].rank == (PetscInt)myrank) lnleaves++;
2244     }
2245     PetscCall(PetscMalloc1(lnleaves, &lilocal));
2246     PetscCall(PetscMalloc1(lnleaves, &liremote));
2247 
2248     for (i = j = 0; i < nleaves; i++) {
2249       if (iremote[i].rank == (PetscInt)myrank) {
2250         lilocal[j]        = ilocal ? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2251         liremote[j].rank  = 0;                      /* rank in PETSC_COMM_SELF */
2252         liremote[j].index = iremote[i].index;
2253         j++;
2254       }
2255     }
2256     PetscCall(PetscSFCreate(PETSC_COMM_SELF, &lsf));
2257     PetscCall(PetscSFSetFromOptions(lsf));
2258     PetscCall(PetscSFSetGraph(lsf, nroots, lnleaves, lilocal, PETSC_OWN_POINTER, liremote, PETSC_OWN_POINTER));
2259     PetscCall(PetscSFSetUp(lsf));
2260     *out = lsf;
2261   }
2262   PetscFunctionReturn(PETSC_SUCCESS);
2263 }
2264 
2265 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2266 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata)
2267 {
2268   PetscMemType rootmtype, leafmtype;
2269 
2270   PetscFunctionBegin;
2271   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2272   PetscCall(PetscSFSetUp(sf));
2273   PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
2274   PetscCall(PetscGetMemType(rootdata, &rootmtype));
2275   PetscCall(PetscGetMemType(leafdata, &leafmtype));
2276   PetscUseTypeMethod(sf, BcastToZero, unit, rootmtype, rootdata, leafmtype, leafdata);
2277   PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
2278   PetscFunctionReturn(PETSC_SUCCESS);
2279 }
2280 
2281 /*@
2282   PetscSFConcatenate - concatenate multiple `PetscSF` into one
2283 
2284   Input Parameters:
2285 + comm        - the communicator
2286 . nsfs        - the number of input `PetscSF`
2287 . sfs         - the array of input `PetscSF`
2288 . rootMode    - the root mode specifying how roots are handled
2289 - leafOffsets - the array of local leaf offsets, one for each input `PetscSF`, or `NULL` for contiguous storage
2290 
2291   Output Parameter:
2292 . newsf - The resulting `PetscSF`
2293 
2294   Level: advanced
2295 
2296   Notes:
2297   The communicator of all `PetscSF`s in `sfs` must be comm.
2298 
2299   Leaves are always concatenated locally, keeping them ordered by the input `PetscSF` index and original local order.
2300 
2301   The offsets in `leafOffsets` are added to the original leaf indices.
2302 
2303   If all input SFs use contiguous leaf storage (`ilocal` = `NULL`), `leafOffsets` can be passed as `NULL` as well.
2304   In this case, `NULL` is also passed as `ilocal` to the resulting `PetscSF`.
2305 
2306   If any input `PetscSF` has non-null `ilocal`, `leafOffsets` is needed to distinguish leaves from different input `PetscSF`s.
2307   In this case, user is responsible to provide correct offsets so that the resulting leaves are unique (otherwise an error occurs).
2308 
2309   All root modes retain the essential connectivity condition.
2310   If two leaves of the same input `PetscSF` are connected (sharing the same root), they are also connected in the output `PetscSF`.
2311   Parameter `rootMode` controls how the input root spaces are combined.
2312   For `PETSCSF_CONCATENATE_ROOTMODE_SHARED`, the root space is considered the same for each input `PetscSF` (checked in debug mode)
2313   and is also the same in the output `PetscSF`.
2314   For `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, the input root spaces are taken as separate and joined.
2315   `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` joins the root spaces locally;
2316   roots of sfs[0], sfs[1], sfs[2], ... are joined on each rank separately, ordered by input `PetscSF` and original local index, and renumbered contiguously.
2317   `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL` joins the root spaces globally;
2318   roots of sfs[0], sfs[1], sfs[2], ... are joined globally, ordered by input `PetscSF` index and original global index, and renumbered contiguously;
2319   the original root ranks are ignored.
2320   For both `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`,
2321   the output `PetscSF`'s root layout is such that the local number of roots is a sum of the input `PetscSF`'s local numbers of roots on each rank
2322   to keep the load balancing.
2323   However, for `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, roots can move to different ranks.
2324 
2325   Example:
2326   We can use src/vec/is/sf/tests/ex18.c to compare the root modes. By running
2327 .vb
2328   make -C $PETSC_DIR/src/vec/is/sf/tests ex18
2329   for m in {local,global,shared}; do
2330     mpirun -n 2 $PETSC_DIR/src/vec/is/sf/tests/ex18 -nsfs 2 -n 2 -root_mode $m -sf_view
2331   done
2332 .ve
2333   we generate two identical `PetscSF`s sf_0 and sf_1,
2334 .vb
2335   PetscSF Object: sf_0 2 MPI processes
2336     type: basic
2337     rank #leaves #roots
2338     [ 0]       4      2
2339     [ 1]       4      2
2340     leaves      roots       roots in global numbering
2341     ( 0,  0) <- ( 0,  0)  =   0
2342     ( 0,  1) <- ( 0,  1)  =   1
2343     ( 0,  2) <- ( 1,  0)  =   2
2344     ( 0,  3) <- ( 1,  1)  =   3
2345     ( 1,  0) <- ( 0,  0)  =   0
2346     ( 1,  1) <- ( 0,  1)  =   1
2347     ( 1,  2) <- ( 1,  0)  =   2
2348     ( 1,  3) <- ( 1,  1)  =   3
2349 .ve
2350   and pass them to `PetscSFConcatenate()` along with different choices of `rootMode`, yielding different result_sf\:
2351 .vb
2352   rootMode = local:
2353   PetscSF Object: result_sf 2 MPI processes
2354     type: basic
2355     rank #leaves #roots
2356     [ 0]       8      4
2357     [ 1]       8      4
2358     leaves      roots       roots in global numbering
2359     ( 0,  0) <- ( 0,  0)  =   0
2360     ( 0,  1) <- ( 0,  1)  =   1
2361     ( 0,  2) <- ( 1,  0)  =   4
2362     ( 0,  3) <- ( 1,  1)  =   5
2363     ( 0,  4) <- ( 0,  2)  =   2
2364     ( 0,  5) <- ( 0,  3)  =   3
2365     ( 0,  6) <- ( 1,  2)  =   6
2366     ( 0,  7) <- ( 1,  3)  =   7
2367     ( 1,  0) <- ( 0,  0)  =   0
2368     ( 1,  1) <- ( 0,  1)  =   1
2369     ( 1,  2) <- ( 1,  0)  =   4
2370     ( 1,  3) <- ( 1,  1)  =   5
2371     ( 1,  4) <- ( 0,  2)  =   2
2372     ( 1,  5) <- ( 0,  3)  =   3
2373     ( 1,  6) <- ( 1,  2)  =   6
2374     ( 1,  7) <- ( 1,  3)  =   7
2375 
2376   rootMode = global:
2377   PetscSF Object: result_sf 2 MPI processes
2378     type: basic
2379     rank #leaves #roots
2380     [ 0]       8      4
2381     [ 1]       8      4
2382     leaves      roots       roots in global numbering
2383     ( 0,  0) <- ( 0,  0)  =   0
2384     ( 0,  1) <- ( 0,  1)  =   1
2385     ( 0,  2) <- ( 0,  2)  =   2
2386     ( 0,  3) <- ( 0,  3)  =   3
2387     ( 0,  4) <- ( 1,  0)  =   4
2388     ( 0,  5) <- ( 1,  1)  =   5
2389     ( 0,  6) <- ( 1,  2)  =   6
2390     ( 0,  7) <- ( 1,  3)  =   7
2391     ( 1,  0) <- ( 0,  0)  =   0
2392     ( 1,  1) <- ( 0,  1)  =   1
2393     ( 1,  2) <- ( 0,  2)  =   2
2394     ( 1,  3) <- ( 0,  3)  =   3
2395     ( 1,  4) <- ( 1,  0)  =   4
2396     ( 1,  5) <- ( 1,  1)  =   5
2397     ( 1,  6) <- ( 1,  2)  =   6
2398     ( 1,  7) <- ( 1,  3)  =   7
2399 
2400   rootMode = shared:
2401   PetscSF Object: result_sf 2 MPI processes
2402     type: basic
2403     rank #leaves #roots
2404     [ 0]       8      2
2405     [ 1]       8      2
2406     leaves      roots       roots in global numbering
2407     ( 0,  0) <- ( 0,  0)  =   0
2408     ( 0,  1) <- ( 0,  1)  =   1
2409     ( 0,  2) <- ( 1,  0)  =   2
2410     ( 0,  3) <- ( 1,  1)  =   3
2411     ( 0,  4) <- ( 0,  0)  =   0
2412     ( 0,  5) <- ( 0,  1)  =   1
2413     ( 0,  6) <- ( 1,  0)  =   2
2414     ( 0,  7) <- ( 1,  1)  =   3
2415     ( 1,  0) <- ( 0,  0)  =   0
2416     ( 1,  1) <- ( 0,  1)  =   1
2417     ( 1,  2) <- ( 1,  0)  =   2
2418     ( 1,  3) <- ( 1,  1)  =   3
2419     ( 1,  4) <- ( 0,  0)  =   0
2420     ( 1,  5) <- ( 0,  1)  =   1
2421     ( 1,  6) <- ( 1,  0)  =   2
2422     ( 1,  7) <- ( 1,  1)  =   3
2423 .ve
2424 
2425 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFConcatenateRootMode`
2426 @*/
2427 PetscErrorCode PetscSFConcatenate(MPI_Comm comm, PetscInt nsfs, PetscSF sfs[], PetscSFConcatenateRootMode rootMode, PetscInt leafOffsets[], PetscSF *newsf)
2428 {
2429   PetscInt     i, s, nLeaves, nRoots;
2430   PetscInt    *leafArrayOffsets;
2431   PetscInt    *ilocal_new;
2432   PetscSFNode *iremote_new;
2433   PetscBool    all_ilocal_null = PETSC_FALSE;
2434   PetscLayout  glayout         = NULL;
2435   PetscInt    *gremote         = NULL;
2436   PetscMPIInt  rank, size;
2437 
2438   PetscFunctionBegin;
2439   if (PetscDefined(USE_DEBUG)) {
2440     PetscSF dummy; /* just to have a PetscObject on comm for input validation */
2441 
2442     PetscCall(PetscSFCreate(comm, &dummy));
2443     PetscValidLogicalCollectiveInt(dummy, nsfs, 2);
2444     PetscAssertPointer(sfs, 3);
2445     for (i = 0; i < nsfs; i++) {
2446       PetscValidHeaderSpecific(sfs[i], PETSCSF_CLASSID, 3);
2447       PetscCheckSameComm(dummy, 1, sfs[i], 3);
2448     }
2449     PetscValidLogicalCollectiveEnum(dummy, rootMode, 4);
2450     if (leafOffsets) PetscAssertPointer(leafOffsets, 5);
2451     PetscAssertPointer(newsf, 6);
2452     PetscCall(PetscSFDestroy(&dummy));
2453   }
2454   if (!nsfs) {
2455     PetscCall(PetscSFCreate(comm, newsf));
2456     PetscCall(PetscSFSetGraph(*newsf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
2457     PetscFunctionReturn(PETSC_SUCCESS);
2458   }
2459   PetscCallMPI(MPI_Comm_rank(comm, &rank));
2460   PetscCallMPI(MPI_Comm_size(comm, &size));
2461 
2462   /* Calculate leaf array offsets */
2463   PetscCall(PetscMalloc1(nsfs + 1, &leafArrayOffsets));
2464   leafArrayOffsets[0] = 0;
2465   for (s = 0; s < nsfs; s++) {
2466     PetscInt nl;
2467 
2468     PetscCall(PetscSFGetGraph(sfs[s], NULL, &nl, NULL, NULL));
2469     leafArrayOffsets[s + 1] = leafArrayOffsets[s] + nl;
2470   }
2471   nLeaves = leafArrayOffsets[nsfs];
2472 
2473   /* Calculate number of roots */
2474   switch (rootMode) {
2475   case PETSCSF_CONCATENATE_ROOTMODE_SHARED: {
2476     PetscCall(PetscSFGetGraph(sfs[0], &nRoots, NULL, NULL, NULL));
2477     if (PetscDefined(USE_DEBUG)) {
2478       for (s = 1; s < nsfs; s++) {
2479         PetscInt nr;
2480 
2481         PetscCall(PetscSFGetGraph(sfs[s], &nr, NULL, NULL, NULL));
2482         PetscCheck(nr == nRoots, comm, PETSC_ERR_ARG_SIZ, "rootMode = %s but sfs[%" PetscInt_FMT "] has a different number of roots (%" PetscInt_FMT ") than sfs[0] (%" PetscInt_FMT ")", PetscSFConcatenateRootModes[rootMode], s, nr, nRoots);
2483       }
2484     }
2485   } break;
2486   case PETSCSF_CONCATENATE_ROOTMODE_GLOBAL: {
2487     /* Calculate also global layout in this case */
2488     PetscInt    *nls;
2489     PetscLayout *lts;
2490     PetscInt   **inds;
2491     PetscInt     j;
2492     PetscInt     rootOffset = 0;
2493 
2494     PetscCall(PetscCalloc3(nsfs, &lts, nsfs, &nls, nsfs, &inds));
2495     PetscCall(PetscLayoutCreate(comm, &glayout));
2496     glayout->bs = 1;
2497     glayout->n  = 0;
2498     glayout->N  = 0;
2499     for (s = 0; s < nsfs; s++) {
2500       PetscCall(PetscSFGetGraphLayout(sfs[s], &lts[s], &nls[s], NULL, &inds[s]));
2501       glayout->n += lts[s]->n;
2502       glayout->N += lts[s]->N;
2503     }
2504     PetscCall(PetscLayoutSetUp(glayout));
2505     PetscCall(PetscMalloc1(nLeaves, &gremote));
2506     for (s = 0, j = 0; s < nsfs; s++) {
2507       for (i = 0; i < nls[s]; i++, j++) gremote[j] = inds[s][i] + rootOffset;
2508       rootOffset += lts[s]->N;
2509       PetscCall(PetscLayoutDestroy(&lts[s]));
2510       PetscCall(PetscFree(inds[s]));
2511     }
2512     PetscCall(PetscFree3(lts, nls, inds));
2513     nRoots = glayout->N;
2514   } break;
2515   case PETSCSF_CONCATENATE_ROOTMODE_LOCAL:
2516     /* nRoots calculated later in this case */
2517     break;
2518   default:
2519     SETERRQ(comm, PETSC_ERR_ARG_WRONG, "Invalid PetscSFConcatenateRootMode %d", rootMode);
2520   }
2521 
2522   if (!leafOffsets) {
2523     all_ilocal_null = PETSC_TRUE;
2524     for (s = 0; s < nsfs; s++) {
2525       const PetscInt *ilocal;
2526 
2527       PetscCall(PetscSFGetGraph(sfs[s], NULL, NULL, &ilocal, NULL));
2528       if (ilocal) {
2529         all_ilocal_null = PETSC_FALSE;
2530         break;
2531       }
2532     }
2533     PetscCheck(all_ilocal_null, PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "leafOffsets can be passed as NULL only if all SFs have ilocal = NULL");
2534   }
2535 
2536   /* Renumber and concatenate local leaves */
2537   ilocal_new = NULL;
2538   if (!all_ilocal_null) {
2539     PetscCall(PetscMalloc1(nLeaves, &ilocal_new));
2540     for (i = 0; i < nLeaves; i++) ilocal_new[i] = -1;
2541     for (s = 0; s < nsfs; s++) {
2542       const PetscInt *ilocal;
2543       PetscInt       *ilocal_l = PetscSafePointerPlusOffset(ilocal_new, leafArrayOffsets[s]);
2544       PetscInt        i, nleaves_l;
2545 
2546       PetscCall(PetscSFGetGraph(sfs[s], NULL, &nleaves_l, &ilocal, NULL));
2547       for (i = 0; i < nleaves_l; i++) ilocal_l[i] = (ilocal ? ilocal[i] : i) + leafOffsets[s];
2548     }
2549   }
2550 
2551   /* Renumber and concatenate remote roots */
2552   if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL || rootMode == PETSCSF_CONCATENATE_ROOTMODE_SHARED) {
2553     PetscInt rootOffset = 0;
2554 
2555     PetscCall(PetscMalloc1(nLeaves, &iremote_new));
2556     for (i = 0; i < nLeaves; i++) {
2557       iremote_new[i].rank  = -1;
2558       iremote_new[i].index = -1;
2559     }
2560     for (s = 0; s < nsfs; s++) {
2561       PetscInt           i, nl, nr;
2562       PetscSF            tmp_sf;
2563       const PetscSFNode *iremote;
2564       PetscSFNode       *tmp_rootdata;
2565       PetscSFNode       *tmp_leafdata = PetscSafePointerPlusOffset(iremote_new, leafArrayOffsets[s]);
2566 
2567       PetscCall(PetscSFGetGraph(sfs[s], &nr, &nl, NULL, &iremote));
2568       PetscCall(PetscSFCreate(comm, &tmp_sf));
2569       /* create helper SF with contiguous leaves */
2570       PetscCall(PetscSFSetGraph(tmp_sf, nr, nl, NULL, PETSC_USE_POINTER, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
2571       PetscCall(PetscSFSetUp(tmp_sf));
2572       PetscCall(PetscMalloc1(nr, &tmp_rootdata));
2573       if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) {
2574         for (i = 0; i < nr; i++) {
2575           tmp_rootdata[i].index = i + rootOffset;
2576           tmp_rootdata[i].rank  = (PetscInt)rank;
2577         }
2578         rootOffset += nr;
2579       } else {
2580         for (i = 0; i < nr; i++) {
2581           tmp_rootdata[i].index = i;
2582           tmp_rootdata[i].rank  = (PetscInt)rank;
2583         }
2584       }
2585       PetscCall(PetscSFBcastBegin(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
2586       PetscCall(PetscSFBcastEnd(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
2587       PetscCall(PetscSFDestroy(&tmp_sf));
2588       PetscCall(PetscFree(tmp_rootdata));
2589     }
2590     if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) nRoots = rootOffset; // else nRoots already calculated above
2591 
2592     /* Build the new SF */
2593     PetscCall(PetscSFCreate(comm, newsf));
2594     PetscCall(PetscSFSetGraph(*newsf, nRoots, nLeaves, ilocal_new, PETSC_OWN_POINTER, iremote_new, PETSC_OWN_POINTER));
2595   } else {
2596     /* Build the new SF */
2597     PetscCall(PetscSFCreate(comm, newsf));
2598     PetscCall(PetscSFSetGraphLayout(*newsf, glayout, nLeaves, ilocal_new, PETSC_OWN_POINTER, gremote));
2599   }
2600   PetscCall(PetscSFSetUp(*newsf));
2601   PetscCall(PetscSFViewFromOptions(*newsf, NULL, "-sf_concat_view"));
2602   PetscCall(PetscLayoutDestroy(&glayout));
2603   PetscCall(PetscFree(gremote));
2604   PetscCall(PetscFree(leafArrayOffsets));
2605   PetscFunctionReturn(PETSC_SUCCESS);
2606 }
2607