xref: /petsc/src/vec/is/sf/interface/sf.c (revision fbf9dbe564678ed6eff1806adbc4c4f01b9743f4)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petsc/private/viewerimpl.h>
4 #include <petsc/private/hashmapi.h>
5 
6 #if defined(PETSC_HAVE_CUDA)
7   #include <cuda_runtime.h>
8   #include <petscdevice_cuda.h>
9 #endif
10 
11 #if defined(PETSC_HAVE_HIP)
12   #include <hip/hip_runtime.h>
13 #endif
14 
15 #if defined(PETSC_CLANG_STATIC_ANALYZER)
16 void PetscSFCheckGraphSet(PetscSF, int);
17 #else
18   #if defined(PETSC_USE_DEBUG)
19     #define PetscSFCheckGraphSet(sf, arg) PetscCheck((sf)->graphset, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %d \"%s\" before %s()", (arg), #sf, PETSC_FUNCTION_NAME)
20   #else
21     #define PetscSFCheckGraphSet(sf, arg) \
22       do { \
23       } while (0)
24   #endif
25 #endif
26 
27 const char *const PetscSFDuplicateOptions[]     = {"CONFONLY", "RANKS", "GRAPH", "PetscSFDuplicateOption", "PETSCSF_DUPLICATE_", NULL};
28 const char *const PetscSFConcatenateRootModes[] = {"local", "shared", "global", "PetscSFConcatenateRootMode", "PETSCSF_CONCATENATE_ROOTMODE_", NULL};
29 
30 /*@
31    PetscSFCreate - create a star forest communication context
32 
33    Collective
34 
35    Input Parameter:
36 .  comm - communicator on which the star forest will operate
37 
38    Output Parameter:
39 .  sf - new star forest context
40 
41    Options Database Key:
42 .  -sf_type type - value of type may be
43 .vb
44     basic     - Use MPI persistent Isend/Irecv for communication (Default)
45     window    - Use MPI-3 one-sided window for communication
46     neighbor  - Use MPI-3 neighborhood collectives for communication
47 .ve
48 
49    Level: intermediate
50 
51    Note:
52    When one knows the communication graph is one of the predefined graphs, such as `MPI_Alltoall()`, `MPI_Allgatherv()`,
53    `MPI_Gatherv()`, one can create a `PetscSF` and then set its graph with `PetscSFSetGraphWithPattern()`. These special
54    `SF`s are optimized and have better performance than general `SF`s.
55 
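   Example Usage:
   A minimal sketch (illustrative only; the graph is arbitrary here): every process creates one root and one leaf, each
   leaf references root 0 on rank 0, and root data is broadcast to the leaves.
.vb
    PetscSF     sf;
    PetscSFNode remote;                 // (owning rank, root index) referenced by this process's single leaf
    PetscInt    rootdata = 42, leafdata = 0;

    PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf));
    PetscCall(PetscSFSetFromOptions(sf));
    remote.rank  = 0;
    remote.index = 0;
    PetscCall(PetscSFSetGraph(sf, 1, 1, NULL, PETSC_COPY_VALUES, &remote, PETSC_COPY_VALUES));
    PetscCall(PetscSFBcastBegin(sf, MPIU_INT, &rootdata, &leafdata, MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(sf, MPIU_INT, &rootdata, &leafdata, MPI_REPLACE));
    PetscCall(PetscSFDestroy(&sf));
.ve
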
56 .seealso: `PetscSF`, `PetscSFSetType()`, `PetscSFSetGraph()`, `PetscSFSetGraphWithPattern()`, `PetscSFDestroy()`
57 @*/
58 PetscErrorCode PetscSFCreate(MPI_Comm comm, PetscSF *sf)
59 {
60   PetscSF b;
61 
62   PetscFunctionBegin;
63   PetscValidPointer(sf, 2);
64   PetscCall(PetscSFInitializePackage());
65 
66   PetscCall(PetscHeaderCreate(b, PETSCSF_CLASSID, "PetscSF", "Star Forest", "PetscSF", comm, PetscSFDestroy, PetscSFView));
67 
68   b->nroots    = -1;
69   b->nleaves   = -1;
70   b->minleaf   = PETSC_MAX_INT;
71   b->maxleaf   = PETSC_MIN_INT;
72   b->nranks    = -1;
73   b->rankorder = PETSC_TRUE;
74   b->ingroup   = MPI_GROUP_NULL;
75   b->outgroup  = MPI_GROUP_NULL;
76   b->graphset  = PETSC_FALSE;
77 #if defined(PETSC_HAVE_DEVICE)
78   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
79   b->use_stream_aware_mpi = PETSC_FALSE;
80   b->unknown_input_stream = PETSC_FALSE;
81   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda */
82   b->backend = PETSCSF_BACKEND_KOKKOS;
83   #elif defined(PETSC_HAVE_CUDA)
84   b->backend = PETSCSF_BACKEND_CUDA;
85   #elif defined(PETSC_HAVE_HIP)
86   b->backend = PETSCSF_BACKEND_HIP;
87   #endif
88 
89   #if defined(PETSC_HAVE_NVSHMEM)
90   b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
91   b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
92   PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem", &b->use_nvshmem, NULL));
93   PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem_get", &b->use_nvshmem_get, NULL));
94   #endif
95 #endif
96   b->vscat.from_n = -1;
97   b->vscat.to_n   = -1;
98   b->vscat.unit   = MPIU_SCALAR;
99   *sf             = b;
100   PetscFunctionReturn(PETSC_SUCCESS);
101 }
102 
103 /*@
104    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
105 
106    Collective
107 
108    Input Parameter:
109 .  sf - star forest
110 
111    Level: advanced
112 
113 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetGraph()`, `PetscSFDestroy()`
114 @*/
115 PetscErrorCode PetscSFReset(PetscSF sf)
116 {
117   PetscFunctionBegin;
118   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
119   PetscTryTypeMethod(sf, Reset);
120   sf->nroots   = -1;
121   sf->nleaves  = -1;
122   sf->minleaf  = PETSC_MAX_INT;
123   sf->maxleaf  = PETSC_MIN_INT;
124   sf->mine     = NULL;
125   sf->remote   = NULL;
126   sf->graphset = PETSC_FALSE;
127   PetscCall(PetscFree(sf->mine_alloc));
128   PetscCall(PetscFree(sf->remote_alloc));
129   sf->nranks = -1;
130   PetscCall(PetscFree4(sf->ranks, sf->roffset, sf->rmine, sf->rremote));
131   sf->degreeknown = PETSC_FALSE;
132   PetscCall(PetscFree(sf->degree));
133   if (sf->ingroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->ingroup));
134   if (sf->outgroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->outgroup));
135   if (sf->multi) sf->multi->multi = NULL;
136   PetscCall(PetscSFDestroy(&sf->multi));
137   PetscCall(PetscLayoutDestroy(&sf->map));
138 
139 #if defined(PETSC_HAVE_DEVICE)
140   for (PetscInt i = 0; i < 2; i++) PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, sf->rmine_d[i]));
141 #endif
142 
143   sf->setupcalled = PETSC_FALSE;
144   PetscFunctionReturn(PETSC_SUCCESS);
145 }
146 
147 /*@C
148    PetscSFSetType - Set the `PetscSF` communication implementation
149 
150    Collective
151 
152    Input Parameters:
153 +  sf - the `PetscSF` context
154 -  type - a known method
155 .vb
156     PETSCSFWINDOW - MPI-2/3 one-sided
157     PETSCSFBASIC - basic implementation using MPI-1 two-sided
158 .ve
159 
160    Options Database Key:
161 .  -sf_type <type> - Sets the method; for example `basic` or `window`. Use -help for a list of available methods
162 
163   Level: intermediate
164 
165    Notes:
166    See `PetscSFType` for possible values
167 
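   Example Usage:
   A minimal sketch (illustrative only), assuming `sf` has already been created:
.vb
    PetscCall(PetscSFSetType(sf, PETSCSFWINDOW));
.ve
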
168 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`
169 @*/
170 PetscErrorCode PetscSFSetType(PetscSF sf, PetscSFType type)
171 {
172   PetscBool match;
173   PetscErrorCode (*r)(PetscSF);
174 
175   PetscFunctionBegin;
176   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
177   PetscValidCharPointer(type, 2);
178 
179   PetscCall(PetscObjectTypeCompare((PetscObject)sf, type, &match));
180   if (match) PetscFunctionReturn(PETSC_SUCCESS);
181 
182   PetscCall(PetscFunctionListFind(PetscSFList, type, &r));
183   PetscCheck(r, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_UNKNOWN_TYPE, "Unable to find requested PetscSF type %s", type);
184   /* Destroy the previous PetscSF implementation context */
185   PetscTryTypeMethod(sf, Destroy);
186   PetscCall(PetscMemzero(sf->ops, sizeof(*sf->ops)));
187   PetscCall(PetscObjectChangeTypeName((PetscObject)sf, type));
188   PetscCall((*r)(sf));
189   PetscFunctionReturn(PETSC_SUCCESS);
190 }
191 
192 /*@C
193   PetscSFGetType - Get the `PetscSF` communication implementation
194 
195   Not Collective
196 
197   Input Parameter:
198 . sf  - the `PetscSF` context
199 
200   Output Parameter:
201 . type - the `PetscSF` type name
202 
203   Level: intermediate
204 
205 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetType()`, `PetscSFCreate()`
206 @*/
207 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
208 {
209   PetscFunctionBegin;
210   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
211   PetscValidPointer(type, 2);
212   *type = ((PetscObject)sf)->type_name;
213   PetscFunctionReturn(PETSC_SUCCESS);
214 }
215 
216 /*@C
217    PetscSFDestroy - destroy a star forest
218 
219    Collective
220 
221    Input Parameter:
222 .  sf - address of star forest
223 
224    Level: intermediate
225 
226 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFReset()`
227 @*/
228 PetscErrorCode PetscSFDestroy(PetscSF *sf)
229 {
230   PetscFunctionBegin;
231   if (!*sf) PetscFunctionReturn(PETSC_SUCCESS);
232   PetscValidHeaderSpecific((*sf), PETSCSF_CLASSID, 1);
233   if (--((PetscObject)(*sf))->refct > 0) {
234     *sf = NULL;
235     PetscFunctionReturn(PETSC_SUCCESS);
236   }
237   PetscCall(PetscSFReset(*sf));
238   PetscTryTypeMethod((*sf), Destroy);
239   PetscCall(PetscSFDestroy(&(*sf)->vscat.lsf));
240   if ((*sf)->vscat.bs > 1) PetscCallMPI(MPI_Type_free(&(*sf)->vscat.unit));
241 #if defined(PETSC_HAVE_MPIX_STREAM)
242   if ((*sf)->use_stream_aware_mpi) {
243     PetscCallMPI(MPIX_Stream_free(&(*sf)->mpi_stream));
244     PetscCallMPI(MPI_Comm_free(&(*sf)->stream_comm));
245   }
246 #endif
247   PetscCall(PetscHeaderDestroy(sf));
248   PetscFunctionReturn(PETSC_SUCCESS);
249 }
250 
251 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
252 {
253   PetscInt           i, nleaves;
254   PetscMPIInt        size;
255   const PetscInt    *ilocal;
256   const PetscSFNode *iremote;
257 
258   PetscFunctionBegin;
259   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(PETSC_SUCCESS);
260   PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, &iremote));
261   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
262   for (i = 0; i < nleaves; i++) {
263     const PetscInt rank   = iremote[i].rank;
264     const PetscInt remote = iremote[i].index;
265     const PetscInt leaf   = ilocal ? ilocal[i] : i;
266     PetscCheck(rank >= 0 && rank < size, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided rank (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be in [0, %d)", rank, i, size);
267     PetscCheck(remote >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided index (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be >= 0", remote, i);
268     PetscCheck(leaf >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided location (%" PetscInt_FMT ") for leaf %" PetscInt_FMT " is invalid, should be >= 0", leaf, i);
269   }
270   PetscFunctionReturn(PETSC_SUCCESS);
271 }
272 
273 /*@
274    PetscSFSetUp - set up communication structures for a `PetscSF`; after this is done it may be used to perform communication
275 
276    Collective
277 
278    Input Parameter:
279 .  sf - star forest communication object
280 
281    Level: beginner
282 
283 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetFromOptions()`, `PetscSFSetType()`
284 @*/
285 PetscErrorCode PetscSFSetUp(PetscSF sf)
286 {
287   PetscFunctionBegin;
288   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
289   PetscSFCheckGraphSet(sf, 1);
290   if (sf->setupcalled) PetscFunctionReturn(PETSC_SUCCESS);
291   PetscCall(PetscLogEventBegin(PETSCSF_SetUp, sf, 0, 0, 0));
292   PetscCall(PetscSFCheckGraphValid_Private(sf));
293   if (!((PetscObject)sf)->type_name) PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); /* Zero all sf->ops */
294   PetscTryTypeMethod(sf, SetUp);
295 #if defined(PETSC_HAVE_CUDA)
296   if (sf->backend == PETSCSF_BACKEND_CUDA) {
297     sf->ops->Malloc = PetscSFMalloc_CUDA;
298     sf->ops->Free   = PetscSFFree_CUDA;
299   }
300 #endif
301 #if defined(PETSC_HAVE_HIP)
302   if (sf->backend == PETSCSF_BACKEND_HIP) {
303     sf->ops->Malloc = PetscSFMalloc_HIP;
304     sf->ops->Free   = PetscSFFree_HIP;
305   }
306 #endif
307 
308 #if defined(PETSC_HAVE_KOKKOS)
309   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
310     sf->ops->Malloc = PetscSFMalloc_Kokkos;
311     sf->ops->Free   = PetscSFFree_Kokkos;
312   }
313 #endif
314   PetscCall(PetscLogEventEnd(PETSCSF_SetUp, sf, 0, 0, 0));
315   sf->setupcalled = PETSC_TRUE;
316   PetscFunctionReturn(PETSC_SUCCESS);
317 }
318 
319 /*@
320    PetscSFSetFromOptions - set `PetscSF` options using the options database
321 
322    Logically Collective
323 
324    Input Parameter:
325 .  sf - star forest
326 
327    Options Database Keys:
328 +  -sf_type               - implementation type, see `PetscSFSetType()`
329 .  -sf_rank_order         - sort composite points for gathers and scatters in rank order; gathers are non-deterministic otherwise
330 .  -sf_unknown_input_stream - Assume the input root/leafdata is computed on arbitrary streams unknown to `PetscSF` (default: false). When false, `PetscSF`
331                             assumes callers computed the data on the default CUDA stream, which `PetscSF` also uses to process the data, so no stream
332                             synchronization is needed between `PetscSF` and its caller. This option only takes effect with `-use_gpu_aware_mpi 1`.
333 .  -sf_use_stream_aware_mpi  - Assume the underlying MPI is CUDA-stream aware and `PetscSF` won't sync streams for send/recv buffers passed to MPI (default: false).
334                                If true, this option only works with `-use_gpu_aware_mpi 1`.
335 
336 -  -sf_backend cuda | hip | kokkos - Select the device backend SF uses. Currently `PetscSF` has these backends: cuda, hip and kokkos.
337                               On CUDA (HIP) devices, one can choose cuda (hip) or kokkos, with the default being kokkos. On other devices,
338                               the only available backend is kokkos.
339 
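   Example Usage:
   A hypothetical command line (the executable name is a placeholder) selecting the neighborhood-collective
   implementation and the kokkos device backend:
.vb
    mpiexec -n 4 ./app -sf_type neighbor -sf_backend kokkos
.ve
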
340    Level: intermediate
341 
342 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetType()`
343 @*/
344 PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
345 {
346   PetscSFType deft;
347   char        type[256];
348   PetscBool   flg;
349 
350   PetscFunctionBegin;
351   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
352   PetscObjectOptionsBegin((PetscObject)sf);
353   deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
354   PetscCall(PetscOptionsFList("-sf_type", "PetscSF implementation type", "PetscSFSetType", PetscSFList, deft, type, sizeof(type), &flg));
355   PetscCall(PetscSFSetType(sf, flg ? type : deft));
356   PetscCall(PetscOptionsBool("-sf_rank_order", "sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise", "PetscSFSetRankOrder", sf->rankorder, &sf->rankorder, NULL));
357 #if defined(PETSC_HAVE_DEVICE)
358   {
359     char      backendstr[32] = {0};
360     PetscBool isCuda = PETSC_FALSE, isHip = PETSC_FALSE, isKokkos = PETSC_FALSE, set;
361     /* Change the defaults set in PetscSFCreate() with command line options */
362     PetscCall(PetscOptionsBool("-sf_unknown_input_stream", "SF root/leafdata is computed on arbitrary streams unknown to SF", "PetscSFSetFromOptions", sf->unknown_input_stream, &sf->unknown_input_stream, NULL));
363     PetscCall(PetscOptionsBool("-sf_use_stream_aware_mpi", "Assume the underlying MPI is cuda-stream aware", "PetscSFSetFromOptions", sf->use_stream_aware_mpi, &sf->use_stream_aware_mpi, NULL));
364     PetscCall(PetscOptionsString("-sf_backend", "Select the device backend SF uses", "PetscSFSetFromOptions", NULL, backendstr, sizeof(backendstr), &set));
365     PetscCall(PetscStrcasecmp("cuda", backendstr, &isCuda));
366     PetscCall(PetscStrcasecmp("kokkos", backendstr, &isKokkos));
367     PetscCall(PetscStrcasecmp("hip", backendstr, &isHip));
368   #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
369     if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
370     else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
371     else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
372     else PetscCheck(!set, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
373   #elif defined(PETSC_HAVE_KOKKOS)
374     PetscCheck(!set || isKokkos, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You can only choose kokkos", backendstr);
375   #endif
376 
377   #if defined(PETSC_HAVE_CUDA) && defined(PETSC_HAVE_MPIX_STREAM)
378     if (sf->use_stream_aware_mpi) {
379       MPI_Info info;
380 
381       PetscCallMPI(MPI_Info_create(&info));
382       PetscCallMPI(MPI_Info_set(info, "type", "cudaStream_t"));
383       PetscCallMPI(MPIX_Info_set_hex(info, "value", &PetscDefaultCudaStream, sizeof(PetscDefaultCudaStream)));
384       PetscCallMPI(MPIX_Stream_create(info, &sf->mpi_stream));
385       PetscCallMPI(MPI_Info_free(&info));
386       PetscCallMPI(MPIX_Stream_comm_create(PetscObjectComm((PetscObject)sf), sf->mpi_stream, &sf->stream_comm));
387     }
388   #endif
389   }
390 #endif
391   PetscTryTypeMethod(sf, SetFromOptions, PetscOptionsObject);
392   PetscOptionsEnd();
393   PetscFunctionReturn(PETSC_SUCCESS);
394 }
395 
396 /*@
397    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
398 
399    Logically Collective
400 
401    Input Parameters:
402 +  sf - star forest
403 -  flg - `PETSC_TRUE` to sort, `PETSC_FALSE` to skip sorting (lower setup cost, but non-deterministic)
404 
405    Level: advanced
406 
407 .seealso: `PetscSF`, `PetscSFType`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`
408 @*/
409 PetscErrorCode PetscSFSetRankOrder(PetscSF sf, PetscBool flg)
410 {
411   PetscFunctionBegin;
412   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
413   PetscValidLogicalCollectiveBool(sf, flg, 2);
414   PetscCheck(!sf->multi, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
415   sf->rankorder = flg;
416   PetscFunctionReturn(PETSC_SUCCESS);
417 }
418 
419 /*@C
420    PetscSFSetGraph - Set a parallel star forest
421 
422    Collective
423 
424    Input Parameters:
425 +  sf - star forest
426 .  nroots - number of root vertices on the current process (these are possible targets for other processes to attach leaves)
427 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
428 .  ilocal - locations of leaves in leafdata buffers, pass `NULL` for contiguous storage (locations must be >= 0, enforced
429             during setup in debug mode)
430 .  localmode - copy mode for `ilocal`
431 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
432              during setup in debug mode)
433 -  remotemode - copy mode for `iremote`
434 
435    Level: intermediate
436 
437    Notes:
438    Leaf indices in `ilocal` must be unique, otherwise an error occurs.
439 
440    Input arrays `ilocal` and `iremote` follow the `PetscCopyMode` semantics.
441    In particular, if `localmode` or `remotemode` is `PETSC_OWN_POINTER` or `PETSC_USE_POINTER`,
442    PETSc might modify the respective array;
443    if `PETSC_USE_POINTER`, the user must delete the array after `PetscSFDestroy()`.
444    The respective array is guaranteed to stay intact only if `PETSC_COPY_VALUES` is used; in that case a const array can be passed (but a cast to non-const is needed).
445 
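   Example Usage:
   A minimal sketch (illustrative only), assuming `sf` has already been created: every process owns one root and one
   leaf, and the leaf references the root on the next process in a ring.
.vb
    PetscMPIInt rank, size;
    PetscSFNode remote;

    PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
    PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
    remote.rank  = (rank + 1) % size;   // owner of the referenced root
    remote.index = 0;                   // root index on that process
    PetscCall(PetscSFSetGraph(sf, 1, 1, NULL, PETSC_COPY_VALUES, &remote, PETSC_COPY_VALUES));
.ve
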
446    Fortran Note:
447    In Fortran you must use `PETSC_COPY_VALUES` for `localmode` and `remotemode`.
448 
449    Developer Note:
450    We sort leaves to check for duplicates and contiguity and to find minleaf/maxleaf.
451    This also allows comparing the leaf sets of two `PetscSF`s easily.
452 
453 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
454 @*/
455 PetscErrorCode PetscSFSetGraph(PetscSF sf, PetscInt nroots, PetscInt nleaves, PetscInt *ilocal, PetscCopyMode localmode, PetscSFNode *iremote, PetscCopyMode remotemode)
456 {
457   PetscBool unique, contiguous;
458 
459   PetscFunctionBegin;
460   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
461   if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal, 4);
462   if (nleaves > 0) PetscValidPointer(iremote, 6);
463   PetscCheck(nroots >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nroots %" PetscInt_FMT ", cannot be negative", nroots);
464   PetscCheck(nleaves >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nleaves %" PetscInt_FMT ", cannot be negative", nleaves);
465   /* enums may be handled as unsigned by some compilers, NVHPC for example, the int cast
466    * below is to prevent NVHPC from warning about meaningless comparison of unsigned with zero */
467   PetscCheck((int)localmode >= PETSC_COPY_VALUES && localmode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong localmode %d", localmode);
468   PetscCheck((int)remotemode >= PETSC_COPY_VALUES && remotemode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong remotemode %d", remotemode);
469 
470   if (sf->nroots >= 0) { /* Reset only if graph already set */
471     PetscCall(PetscSFReset(sf));
472   }
473 
474   PetscCall(PetscLogEventBegin(PETSCSF_SetGraph, sf, 0, 0, 0));
475 
476   sf->nroots  = nroots;
477   sf->nleaves = nleaves;
478 
479   if (localmode == PETSC_COPY_VALUES && ilocal) {
480     PetscInt *tlocal = NULL;
481 
482     PetscCall(PetscMalloc1(nleaves, &tlocal));
483     PetscCall(PetscArraycpy(tlocal, ilocal, nleaves));
484     ilocal = tlocal;
485   }
486   if (remotemode == PETSC_COPY_VALUES) {
487     PetscSFNode *tremote = NULL;
488 
489     PetscCall(PetscMalloc1(nleaves, &tremote));
490     PetscCall(PetscArraycpy(tremote, iremote, nleaves));
491     iremote = tremote;
492   }
493 
494   if (nleaves && ilocal) {
495     PetscSFNode work;
496 
497     PetscCall(PetscSortIntWithDataArray(nleaves, ilocal, iremote, sizeof(PetscSFNode), &work));
498     PetscCall(PetscSortedCheckDupsInt(nleaves, ilocal, &unique));
499     unique = PetscNot(unique);
500     PetscCheck(sf->allow_multi_leaves || unique, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Input ilocal has duplicate entries which is not allowed for this PetscSF");
501     sf->minleaf = ilocal[0];
502     sf->maxleaf = ilocal[nleaves - 1];
503     contiguous  = (PetscBool)(unique && ilocal[0] == 0 && ilocal[nleaves - 1] == nleaves - 1);
504   } else {
505     sf->minleaf = 0;
506     sf->maxleaf = nleaves - 1;
507     unique      = PETSC_TRUE;
508     contiguous  = PETSC_TRUE;
509   }
510 
511   if (contiguous) {
512     if (localmode == PETSC_USE_POINTER) {
513       ilocal = NULL;
514     } else {
515       PetscCall(PetscFree(ilocal));
516     }
517   }
518   sf->mine = ilocal;
519   if (localmode == PETSC_USE_POINTER) {
520     sf->mine_alloc = NULL;
521   } else {
522     sf->mine_alloc = ilocal;
523   }
524   sf->remote = iremote;
525   if (remotemode == PETSC_USE_POINTER) {
526     sf->remote_alloc = NULL;
527   } else {
528     sf->remote_alloc = iremote;
529   }
530   PetscCall(PetscLogEventEnd(PETSCSF_SetGraph, sf, 0, 0, 0));
531   sf->graphset = PETSC_TRUE;
532   PetscFunctionReturn(PETSC_SUCCESS);
533 }
534 
535 /*@
536   PetscSFSetGraphWithPattern - Sets the graph of a `PetscSF` with a specific pattern
537 
538   Collective
539 
540   Input Parameters:
541 + sf      - The `PetscSF`
542 . map     - Layout of roots over all processes (insignificant when pattern is `PETSCSF_PATTERN_ALLTOALL`)
543 - pattern - One of `PETSCSF_PATTERN_ALLGATHER`, `PETSCSF_PATTERN_GATHER`, `PETSCSF_PATTERN_ALLTOALL`
544 
545   Level: intermediate
546 
547   Notes:
548   It is easier to explain `PetscSFPattern` using vectors. Suppose we have an MPI vector `x` and its `PetscLayout` is `map`.
549   `n` and `N` are the local and global sizes of `x` respectively.
550 
551   With `PETSCSF_PATTERN_ALLGATHER`, the routine creates a graph such that if one does `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on it, it copies `x` to
552   sequential vectors `y` on all MPI processes.
553 
554   With `PETSCSF_PATTERN_GATHER`, the routine creates a graph such that if one does `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on it, it copies `x` to a
555   sequential vector `y` on rank 0.
556 
557   In the above cases, entries of `x` are roots and entries of `y` are leaves.
558 
559   With `PETSCSF_PATTERN_ALLTOALL`, `map` is ignored. Suppose NP is the size of `sf`'s communicator. The routine
560   creates a graph in which every rank has NP leaves and NP roots. On rank i, leaf j is connected to root i
561   on rank j, where 0 <= i, j < NP. It is a kind of `MPI_Alltoall()` with sendcount/recvcount equal to 1. Note that this does
562   not mean one cannot send multiple items. One just needs to create an MPI datatype for the multiple data
563   items with `MPI_Type_contiguous()` and use that as the `unit` argument in SF routines.
564 
565   In this case, roots and leaves are symmetric.
566 
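  Example Usage:
  A minimal sketch (illustrative only) of the all-to-all pattern; following the note above, a contiguous datatype is
  used to send two `PetscInt` per process pair, so `comm` is assumed to be the communicator and `rootdata`/`leafdata`
  are assumed to be `PetscInt` arrays with 2*NP entries each.
.vb
    MPI_Datatype unit;
    PetscSF      sf;

    PetscCall(PetscSFCreate(comm, &sf));
    PetscCall(PetscSFSetGraphWithPattern(sf, NULL, PETSCSF_PATTERN_ALLTOALL));
    PetscCallMPI(MPI_Type_contiguous(2, MPIU_INT, &unit));
    PetscCallMPI(MPI_Type_commit(&unit));
    PetscCall(PetscSFBcastBegin(sf, unit, rootdata, leafdata, MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(sf, unit, rootdata, leafdata, MPI_REPLACE));
    PetscCallMPI(MPI_Type_free(&unit));
    PetscCall(PetscSFDestroy(&sf));
.ve
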
567 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
568  @*/
569 PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf, PetscLayout map, PetscSFPattern pattern)
570 {
571   MPI_Comm    comm;
572   PetscInt    n, N, res[2];
573   PetscMPIInt rank, size;
574   PetscSFType type;
575 
576   PetscFunctionBegin;
577   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
578   if (pattern != PETSCSF_PATTERN_ALLTOALL) PetscValidPointer(map, 2);
579   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
580   PetscCheck(pattern >= PETSCSF_PATTERN_ALLGATHER && pattern <= PETSCSF_PATTERN_ALLTOALL, comm, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported PetscSFPattern %d", pattern);
581   PetscCallMPI(MPI_Comm_rank(comm, &rank));
582   PetscCallMPI(MPI_Comm_size(comm, &size));
583 
584   if (pattern == PETSCSF_PATTERN_ALLTOALL) {
585     type = PETSCSFALLTOALL;
586     PetscCall(PetscLayoutCreate(comm, &sf->map));
587     PetscCall(PetscLayoutSetLocalSize(sf->map, size));
588     PetscCall(PetscLayoutSetSize(sf->map, ((PetscInt)size) * size));
589     PetscCall(PetscLayoutSetUp(sf->map));
590   } else {
591     PetscCall(PetscLayoutGetLocalSize(map, &n));
592     PetscCall(PetscLayoutGetSize(map, &N));
593     res[0] = n;
594     res[1] = -n;
595     /* Check if n is the same over all ranks so that we can optimize it */
596     PetscCall(MPIU_Allreduce(MPI_IN_PLACE, res, 2, MPIU_INT, MPI_MAX, comm));
597     if (res[0] == -res[1]) { /* same n */
598       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER : PETSCSFGATHER;
599     } else {
600       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
601     }
602     PetscCall(PetscLayoutReference(map, &sf->map));
603   }
604   PetscCall(PetscSFSetType(sf, type));
605 
606   sf->pattern = pattern;
607   sf->mine    = NULL; /* Contiguous */
608 
609   /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
610      Also set other easy stuff.
611    */
612   if (pattern == PETSCSF_PATTERN_ALLGATHER) {
613     sf->nleaves = N;
614     sf->nroots  = n;
615     sf->nranks  = size;
616     sf->minleaf = 0;
617     sf->maxleaf = N - 1;
618   } else if (pattern == PETSCSF_PATTERN_GATHER) {
619     sf->nleaves = rank ? 0 : N;
620     sf->nroots  = n;
621     sf->nranks  = rank ? 0 : size;
622     sf->minleaf = 0;
623     sf->maxleaf = rank ? -1 : N - 1;
624   } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
625     sf->nleaves = size;
626     sf->nroots  = size;
627     sf->nranks  = size;
628     sf->minleaf = 0;
629     sf->maxleaf = size - 1;
630   }
631   sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
632   sf->graphset = PETSC_TRUE;
633   PetscFunctionReturn(PETSC_SUCCESS);
634 }
635 
636 /*@
637    PetscSFCreateInverseSF - given a `PetscSF` in which all vertices have degree 1, creates the inverse map
638 
639    Collective
640 
641    Input Parameter:
642 .  sf - star forest to invert
643 
644    Output Parameter:
645 .  isf - inverse of `sf`
646 
647    Level: advanced
648 
649    Notes:
650    All roots must have degree 1.
651 
652    The local space may be a permutation, but cannot be sparse.
653 
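   Example Usage:
   A minimal sketch (illustrative only), assuming `sf` satisfies the conditions above:
.vb
    PetscSF isf;

    PetscCall(PetscSFCreateInverseSF(sf, &isf));
    // roots of sf act as leaves of isf and vice versa, so a reduce on sf corresponds to a bcast on isf
    PetscCall(PetscSFDestroy(&isf));
.ve
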
654 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetGraph()`
655 @*/
656 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf, PetscSF *isf)
657 {
658   PetscMPIInt     rank;
659   PetscInt        i, nroots, nleaves, maxlocal, count, *newilocal;
660   const PetscInt *ilocal;
661   PetscSFNode    *roots, *leaves;
662 
663   PetscFunctionBegin;
664   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
665   PetscSFCheckGraphSet(sf, 1);
666   PetscValidPointer(isf, 2);
667 
668   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, NULL));
669   maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
670 
671   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
672   PetscCall(PetscMalloc2(nroots, &roots, maxlocal, &leaves));
673   for (i = 0; i < maxlocal; i++) {
674     leaves[i].rank  = rank;
675     leaves[i].index = i;
676   }
677   for (i = 0; i < nroots; i++) {
678     roots[i].rank  = -1;
679     roots[i].index = -1;
680   }
681   PetscCall(PetscSFReduceBegin(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
682   PetscCall(PetscSFReduceEnd(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
683 
684   /* Check whether our leaves are sparse */
685   for (i = 0, count = 0; i < nroots; i++)
686     if (roots[i].rank >= 0) count++;
687   if (count == nroots) newilocal = NULL;
688   else { /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */ PetscCall(PetscMalloc1(count, &newilocal));
689     for (i = 0, count = 0; i < nroots; i++) {
690       if (roots[i].rank >= 0) {
691         newilocal[count]   = i;
692         roots[count].rank  = roots[i].rank;
693         roots[count].index = roots[i].index;
694         count++;
695       }
696     }
697   }
698 
699   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, isf));
700   PetscCall(PetscSFSetGraph(*isf, maxlocal, count, newilocal, PETSC_OWN_POINTER, roots, PETSC_COPY_VALUES));
701   PetscCall(PetscFree2(roots, leaves));
702   PetscFunctionReturn(PETSC_SUCCESS);
703 }
704 
705 /*@
706    PetscSFDuplicate - duplicate a `PetscSF`, optionally preserving rank connectivity and graph
707 
708    Collective
709 
710    Input Parameters:
711 +  sf - communication object to duplicate
712 -  opt - `PETSCSF_DUPLICATE_CONFONLY`, `PETSCSF_DUPLICATE_RANKS`, or `PETSCSF_DUPLICATE_GRAPH` (see `PetscSFDuplicateOption`)
713 
714    Output Parameter:
715 .  newsf - new communication object
716 
717    Level: beginner
718 
719 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFSetType()`, `PetscSFSetGraph()`
720 @*/
721 PetscErrorCode PetscSFDuplicate(PetscSF sf, PetscSFDuplicateOption opt, PetscSF *newsf)
722 {
723   PetscSFType  type;
724   MPI_Datatype dtype = MPIU_SCALAR;
725 
726   PetscFunctionBegin;
727   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
728   PetscValidLogicalCollectiveEnum(sf, opt, 2);
729   PetscValidPointer(newsf, 3);
730   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sf), newsf));
731   PetscCall(PetscSFGetType(sf, &type));
732   if (type) PetscCall(PetscSFSetType(*newsf, type));
733   (*newsf)->allow_multi_leaves = sf->allow_multi_leaves; /* Dup this flag earlier since PetscSFSetGraph() below checks on this flag */
734   if (opt == PETSCSF_DUPLICATE_GRAPH) {
735     PetscSFCheckGraphSet(sf, 1);
736     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
737       PetscInt           nroots, nleaves;
738       const PetscInt    *ilocal;
739       const PetscSFNode *iremote;
740       PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
741       PetscCall(PetscSFSetGraph(*newsf, nroots, nleaves, (PetscInt *)ilocal, PETSC_COPY_VALUES, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
742     } else {
743       PetscCall(PetscSFSetGraphWithPattern(*newsf, sf->map, sf->pattern));
744     }
745   }
746   /* Since oldtype is committed, so is newtype, according to MPI */
747   if (sf->vscat.bs > 1) PetscCallMPI(MPI_Type_dup(sf->vscat.unit, &dtype));
748   (*newsf)->vscat.bs     = sf->vscat.bs;
749   (*newsf)->vscat.unit   = dtype;
750   (*newsf)->vscat.to_n   = sf->vscat.to_n;
751   (*newsf)->vscat.from_n = sf->vscat.from_n;
752   /* Do not copy lsf. Build it on demand since it is rarely used */
753 
754 #if defined(PETSC_HAVE_DEVICE)
755   (*newsf)->backend              = sf->backend;
756   (*newsf)->unknown_input_stream = sf->unknown_input_stream;
757   (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
758   (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
759 #endif
760   PetscTryTypeMethod(sf, Duplicate, opt, *newsf);
761   /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
762   PetscFunctionReturn(PETSC_SUCCESS);
763 }
764 
765 /*@C
766    PetscSFGetGraph - Get the graph specifying a parallel star forest
767 
768    Not Collective
769 
770    Input Parameter:
771 .  sf - star forest
772 
773    Output Parameters:
774 +  nroots - number of root vertices on the current process (these are possible targets for other processes to attach leaves)
775 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
776 .  ilocal - locations of leaves in leafdata buffers (if returned value is `NULL`, it means leaves are in contiguous storage)
777 -  iremote - remote locations of root vertices for each leaf on the current process
778 
779    Level: intermediate
780 
781    Notes:
782      We are not currently requiring that the graph be set, thus `nroots` = -1 is returned if it has not been set yet
783 
784      The returned `ilocal` and `iremote` might contain values in different order than the input ones in `PetscSFSetGraph()`
785 
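   Example Usage:
   A minimal sketch (illustrative only) that loops over the leaves, handling the contiguous (`ilocal` == `NULL`) case:
.vb
    PetscInt           nroots, nleaves, i;
    const PetscInt    *ilocal;
    const PetscSFNode *iremote;

    PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
    for (i = 0; i < nleaves; i++) {
      PetscInt leaf = ilocal ? ilocal[i] : i;   // location of this leaf in the leafdata buffer
      // the leaf at location 'leaf' references root iremote[i].index on rank iremote[i].rank
    }
.ve
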
786    Fortran Notes:
787      The returned `iremote` array is a copy and must be deallocated after use. Consequently, if you
788      want to update the graph, you must call `PetscSFSetGraph()` after modifying the `iremote` array.
789 
790      To check for a `NULL` `ilocal` use
791 $      if (loc(ilocal) == loc(PETSC_NULL_INTEGER)) then
792 
793 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`
794 @*/
795 PetscErrorCode PetscSFGetGraph(PetscSF sf, PetscInt *nroots, PetscInt *nleaves, const PetscInt **ilocal, const PetscSFNode **iremote)
796 {
797   PetscFunctionBegin;
798   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
799   if (sf->ops->GetGraph) {
800     PetscCall((sf->ops->GetGraph)(sf, nroots, nleaves, ilocal, iremote));
801   } else {
802     if (nroots) *nroots = sf->nroots;
803     if (nleaves) *nleaves = sf->nleaves;
804     if (ilocal) *ilocal = sf->mine;
805     if (iremote) *iremote = sf->remote;
806   }
807   PetscFunctionReturn(PETSC_SUCCESS);
808 }
809 
810 /*@
811    PetscSFGetLeafRange - Get the active leaf ranges
812 
813    Not Collective
814 
815    Input Parameter:
816 .  sf - star forest
817 
818    Output Parameters:
819 +  minleaf - minimum active leaf on this process. Returns 0 if there are no leaves.
820 -  maxleaf - maximum active leaf on this process. Returns -1 if there are no leaves.
821 
822    Level: developer
823 
824 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
825 @*/
826 PetscErrorCode PetscSFGetLeafRange(PetscSF sf, PetscInt *minleaf, PetscInt *maxleaf)
827 {
828   PetscFunctionBegin;
829   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
830   PetscSFCheckGraphSet(sf, 1);
831   if (minleaf) *minleaf = sf->minleaf;
832   if (maxleaf) *maxleaf = sf->maxleaf;
833   PetscFunctionReturn(PETSC_SUCCESS);
834 }
835 
836 /*@C
837    PetscSFViewFromOptions - View a `PetscSF` based on arguments in the options database
838 
839    Collective
840 
841    Input Parameters:
842 +  A - the star forest
843 .  obj - Optional object that provides the prefix for the option names
844 -  name - command line option
845 
846    Level: intermediate
847 
848   Note:
849   See `PetscObjectViewFromOptions()` for possible `PetscViewer` and `PetscViewerFormat`
850 
851 .seealso: `PetscSF`, `PetscSFView`, `PetscObjectViewFromOptions()`, `PetscSFCreate()`
852 @*/
853 PetscErrorCode PetscSFViewFromOptions(PetscSF A, PetscObject obj, const char name[])
854 {
855   PetscFunctionBegin;
856   PetscValidHeaderSpecific(A, PETSCSF_CLASSID, 1);
857   PetscCall(PetscObjectViewFromOptions((PetscObject)A, obj, name));
858   PetscFunctionReturn(PETSC_SUCCESS);
859 }
860 
861 /*@C
862    PetscSFView - view a star forest
863 
864    Collective
865 
866    Input Parameters:
867 +  sf - star forest
868 -  viewer - viewer to display graph, for example `PETSC_VIEWER_STDOUT_WORLD`
869 
870    Level: beginner
871 
872 .seealso: `PetscSF`, `PetscViewer`, `PetscSFCreate()`, `PetscSFSetGraph()`
873 @*/
874 PetscErrorCode PetscSFView(PetscSF sf, PetscViewer viewer)
875 {
876   PetscBool         iascii;
877   PetscViewerFormat format;
878 
879   PetscFunctionBegin;
880   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
881   if (!viewer) PetscCall(PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf), &viewer));
882   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
883   PetscCheckSameComm(sf, 1, viewer, 2);
884   if (sf->graphset) PetscCall(PetscSFSetUp(sf));
885   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
886   if (iascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) {
887     PetscMPIInt rank;
888     PetscInt    ii, i, j;
889 
890     PetscCall(PetscObjectPrintClassNamePrefixType((PetscObject)sf, viewer));
891     PetscCall(PetscViewerASCIIPushTab(viewer));
892     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
893       if (!sf->graphset) {
894         PetscCall(PetscViewerASCIIPrintf(viewer, "PetscSFSetGraph() has not been called yet\n"));
895         PetscCall(PetscViewerASCIIPopTab(viewer));
896         PetscFunctionReturn(PETSC_SUCCESS);
897       }
898       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
899       PetscCall(PetscViewerASCIIPushSynchronized(viewer));
900       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Number of roots=%" PetscInt_FMT ", leaves=%" PetscInt_FMT ", remote ranks=%" PetscInt_FMT "\n", rank, sf->nroots, sf->nleaves, sf->nranks));
901       for (i = 0; i < sf->nleaves; i++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %" PetscInt_FMT " <- (%" PetscInt_FMT ",%" PetscInt_FMT ")\n", rank, sf->mine ? sf->mine[i] : i, sf->remote[i].rank, sf->remote[i].index));
902       PetscCall(PetscViewerFlush(viewer));
903       PetscCall(PetscViewerGetFormat(viewer, &format));
904       if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
905         PetscMPIInt *tmpranks, *perm;
906         PetscCall(PetscMalloc2(sf->nranks, &tmpranks, sf->nranks, &perm));
907         PetscCall(PetscArraycpy(tmpranks, sf->ranks, sf->nranks));
908         for (i = 0; i < sf->nranks; i++) perm[i] = i;
909         PetscCall(PetscSortMPIIntWithArray(sf->nranks, tmpranks, perm));
910         PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Roots referenced by my leaves, by rank\n", rank));
911         for (ii = 0; ii < sf->nranks; ii++) {
912           i = perm[ii];
913           PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %d: %" PetscInt_FMT " edges\n", rank, sf->ranks[i], sf->roffset[i + 1] - sf->roffset[i]));
914           for (j = sf->roffset[i]; j < sf->roffset[i + 1]; j++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d]    %" PetscInt_FMT " <- %" PetscInt_FMT "\n", rank, sf->rmine[j], sf->rremote[j]));
915         }
916         PetscCall(PetscFree2(tmpranks, perm));
917       }
918       PetscCall(PetscViewerFlush(viewer));
919       PetscCall(PetscViewerASCIIPopSynchronized(viewer));
920     }
921     PetscCall(PetscViewerASCIIPopTab(viewer));
922   }
923   PetscTryTypeMethod(sf, View, viewer);
924   PetscFunctionReturn(PETSC_SUCCESS);
925 }
926 
927 /*@C
928    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
929 
930    Not Collective
931 
932    Input Parameter:
933 .  sf - star forest
934 
935    Output Parameters:
936 +  nranks - number of ranks referenced by local part
937 .  ranks - [`nranks`] array of ranks
938 .  roffset - [`nranks`+1] offset in `rmine`/`rremote` for each rank
939 .  rmine - [`roffset`[`nranks`]] concatenated array holding local indices referencing each remote rank
940 -  rremote - [`roffset`[`nranks`]] concatenated array holding remote indices referenced for each remote rank
941 
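   Example Usage:
   A minimal sketch (illustrative only) iterating over the communication pattern rank by rank, assuming `sf` has been
   set up:
.vb
    PetscInt           nranks, r, j;
    const PetscMPIInt *ranks;
    const PetscInt    *roffset, *rmine, *rremote;

    PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, &roffset, &rmine, &rremote));
    for (r = 0; r < nranks; r++) {
      for (j = roffset[r]; j < roffset[r + 1]; j++) {
        // local leaf location rmine[j] references root rremote[j] on rank ranks[r]
      }
    }
.ve
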
942    Level: developer
943 
944 .seealso: `PetscSF`, `PetscSFGetLeafRanks()`
945 @*/
946 PetscErrorCode PetscSFGetRootRanks(PetscSF sf, PetscInt *nranks, const PetscMPIInt **ranks, const PetscInt **roffset, const PetscInt **rmine, const PetscInt **rremote)
947 {
948   PetscFunctionBegin;
949   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
950   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
951   if (sf->ops->GetRootRanks) {
952     PetscCall((sf->ops->GetRootRanks)(sf, nranks, ranks, roffset, rmine, rremote));
953   } else {
954     /* The generic implementation */
955     if (nranks) *nranks = sf->nranks;
956     if (ranks) *ranks = sf->ranks;
957     if (roffset) *roffset = sf->roffset;
958     if (rmine) *rmine = sf->rmine;
959     if (rremote) *rremote = sf->rremote;
960   }
961   PetscFunctionReturn(PETSC_SUCCESS);
962 }
963 
964 /*@C
965    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
966 
967    Not Collective
968 
969    Input Parameter:
970 .  sf - star forest
971 
972    Output Parameters:
973 +  niranks - number of leaf ranks referencing roots on this process
974 .  iranks - [`niranks`] array of ranks
975 .  ioffset - [`niranks`+1] offset in `irootloc` for each rank
976 -  irootloc - [`ioffset`[`niranks`]] concatenated array holding local indices of roots referenced by each leaf rank
977 
978    Level: developer
979 
980 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
981 @*/
982 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf, PetscInt *niranks, const PetscMPIInt **iranks, const PetscInt **ioffset, const PetscInt **irootloc)
983 {
984   PetscFunctionBegin;
985   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
986   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
987   if (sf->ops->GetLeafRanks) {
988     PetscCall((sf->ops->GetLeafRanks)(sf, niranks, iranks, ioffset, irootloc));
989   } else {
990     PetscSFType type;
991     PetscCall(PetscSFGetType(sf, &type));
992     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
993   }
994   PetscFunctionReturn(PETSC_SUCCESS);
995 }
996 
997 static PetscBool InList(PetscMPIInt needle, PetscMPIInt n, const PetscMPIInt *list)
998 {
999   PetscInt i;
1000   for (i = 0; i < n; i++) {
1001     if (needle == list[i]) return PETSC_TRUE;
1002   }
1003   return PETSC_FALSE;
1004 }
1005 
1006 /*@C
1007    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by `PetscSF` implementations.
1008 
1009    Collective
1010 
1011    Input Parameters:
1012 +  sf - `PetscSF` to set up; `PetscSFSetGraph()` must have been called
1013 -  dgroup - `MPI_Group` of ranks to be distinguished (e.g., for self or shared memory exchange)
1014 
1015    Level: developer
1016 
1017 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
1018 @*/
1019 PetscErrorCode PetscSFSetUpRanks(PetscSF sf, MPI_Group dgroup)
1020 {
1021   PetscHMapI    table;
1022   PetscHashIter pos;
1023   PetscMPIInt   size, groupsize, *groupranks;
1024   PetscInt     *rcount, *ranks;
1025   PetscInt      i, irank = -1, orank = -1;
1026 
1027   PetscFunctionBegin;
1028   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1029   PetscSFCheckGraphSet(sf, 1);
1030   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
1031   PetscCall(PetscHMapICreateWithSize(10, &table));
1032   for (i = 0; i < sf->nleaves; i++) {
1033     /* Log 1-based rank */
1034     PetscCall(PetscHMapISetWithMode(table, sf->remote[i].rank + 1, 1, ADD_VALUES));
1035   }
1036   PetscCall(PetscHMapIGetSize(table, &sf->nranks));
1037   PetscCall(PetscMalloc4(sf->nranks, &sf->ranks, sf->nranks + 1, &sf->roffset, sf->nleaves, &sf->rmine, sf->nleaves, &sf->rremote));
1038   PetscCall(PetscMalloc2(sf->nranks, &rcount, sf->nranks, &ranks));
1039   PetscHashIterBegin(table, pos);
1040   for (i = 0; i < sf->nranks; i++) {
1041     PetscHashIterGetKey(table, pos, ranks[i]);
1042     PetscHashIterGetVal(table, pos, rcount[i]);
1043     PetscHashIterNext(table, pos);
1044     ranks[i]--; /* Convert back to 0-based */
1045   }
1046   PetscCall(PetscHMapIDestroy(&table));
1047 
1048   /* We expect that dgroup is reliably "small" while nranks could be large */
1049   {
1050     MPI_Group    group = MPI_GROUP_NULL;
1051     PetscMPIInt *dgroupranks;
1052     PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
1053     PetscCallMPI(MPI_Group_size(dgroup, &groupsize));
1054     PetscCall(PetscMalloc1(groupsize, &dgroupranks));
1055     PetscCall(PetscMalloc1(groupsize, &groupranks));
1056     for (i = 0; i < groupsize; i++) dgroupranks[i] = i;
1057     if (groupsize) PetscCallMPI(MPI_Group_translate_ranks(dgroup, groupsize, dgroupranks, group, groupranks));
1058     PetscCallMPI(MPI_Group_free(&group));
1059     PetscCall(PetscFree(dgroupranks));
1060   }
1061 
1062   /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
1063   for (sf->ndranks = 0, i = sf->nranks; sf->ndranks < i;) {
1064     for (i--; sf->ndranks < i; i--) { /* Scan i backward looking for distinguished rank */
1065       if (InList(ranks[i], groupsize, groupranks)) break;
1066     }
1067     for (; sf->ndranks <= i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
1068       if (!InList(ranks[sf->ndranks], groupsize, groupranks)) break;
1069     }
1070     if (sf->ndranks < i) { /* Swap ranks[sf->ndranks] with ranks[i] */
1071       PetscInt tmprank, tmpcount;
1072 
1073       tmprank             = ranks[i];
1074       tmpcount            = rcount[i];
1075       ranks[i]            = ranks[sf->ndranks];
1076       rcount[i]           = rcount[sf->ndranks];
1077       ranks[sf->ndranks]  = tmprank;
1078       rcount[sf->ndranks] = tmpcount;
1079       sf->ndranks++;
1080     }
1081   }
1082   PetscCall(PetscFree(groupranks));
1083   PetscCall(PetscSortIntWithArray(sf->ndranks, ranks, rcount));
1084   PetscCall(PetscSortIntWithArray(sf->nranks - sf->ndranks, ranks + sf->ndranks, rcount + sf->ndranks));
1085   sf->roffset[0] = 0;
1086   for (i = 0; i < sf->nranks; i++) {
1087     PetscCall(PetscMPIIntCast(ranks[i], sf->ranks + i));
1088     sf->roffset[i + 1] = sf->roffset[i] + rcount[i];
1089     rcount[i]          = 0;
1090   }
1091   for (i = 0, irank = -1, orank = -1; i < sf->nleaves; i++) {
1092     /* short circuit */
1093     if (orank != sf->remote[i].rank) {
1094       /* Search for index of iremote[i].rank in sf->ranks */
1095       PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->ndranks, sf->ranks, &irank));
1096       if (irank < 0) {
1097         PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->nranks - sf->ndranks, sf->ranks + sf->ndranks, &irank));
1098         if (irank >= 0) irank += sf->ndranks;
1099       }
1100       orank = sf->remote[i].rank;
1101     }
1102     PetscCheck(irank >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Could not find rank %" PetscInt_FMT " in array", sf->remote[i].rank);
1103     sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
1104     sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
1105     rcount[irank]++;
1106   }
1107   PetscCall(PetscFree2(rcount, ranks));
1108   PetscFunctionReturn(PETSC_SUCCESS);
1109 }
1110 
1111 /*@C
1112    PetscSFGetGroups - gets incoming and outgoing process groups
1113 
1114    Collective
1115 
1116    Input Parameter:
1117 .  sf - star forest
1118 
1119    Output Parameters:
1120 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1121 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1122 
1123    Level: developer
1124 
1125 .seealso: `PetscSF`, `PetscSFGetWindow()`, `PetscSFRestoreWindow()`
1126 @*/
1127 PetscErrorCode PetscSFGetGroups(PetscSF sf, MPI_Group *incoming, MPI_Group *outgoing)
1128 {
1129   MPI_Group group = MPI_GROUP_NULL;
1130 
1131   PetscFunctionBegin;
1132   PetscCheck(sf->nranks >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUpRanks() before obtaining groups");
1133   if (sf->ingroup == MPI_GROUP_NULL) {
1134     PetscInt        i;
1135     const PetscInt *indegree;
1136     PetscMPIInt     rank, *outranks, *inranks;
1137     PetscSFNode    *remote;
1138     PetscSF         bgcount;
1139 
1140     /* Compute the number of incoming ranks */
1141     PetscCall(PetscMalloc1(sf->nranks, &remote));
1142     for (i = 0; i < sf->nranks; i++) {
1143       remote[i].rank  = sf->ranks[i];
1144       remote[i].index = 0;
1145     }
1146     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, &bgcount));
1147     PetscCall(PetscSFSetGraph(bgcount, 1, sf->nranks, NULL, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
1148     PetscCall(PetscSFComputeDegreeBegin(bgcount, &indegree));
1149     PetscCall(PetscSFComputeDegreeEnd(bgcount, &indegree));
1150     /* Enumerate the incoming ranks */
1151     PetscCall(PetscMalloc2(indegree[0], &inranks, sf->nranks, &outranks));
1152     PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
1153     for (i = 0; i < sf->nranks; i++) outranks[i] = rank;
1154     PetscCall(PetscSFGatherBegin(bgcount, MPI_INT, outranks, inranks));
1155     PetscCall(PetscSFGatherEnd(bgcount, MPI_INT, outranks, inranks));
1156     PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
1157     PetscCallMPI(MPI_Group_incl(group, indegree[0], inranks, &sf->ingroup));
1158     PetscCallMPI(MPI_Group_free(&group));
1159     PetscCall(PetscFree2(inranks, outranks));
1160     PetscCall(PetscSFDestroy(&bgcount));
1161   }
1162   *incoming = sf->ingroup;
1163 
1164   if (sf->outgroup == MPI_GROUP_NULL) {
1165     PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
1166     PetscCallMPI(MPI_Group_incl(group, sf->nranks, sf->ranks, &sf->outgroup));
1167     PetscCallMPI(MPI_Group_free(&group));
1168   }
1169   *outgoing = sf->outgroup;
1170   PetscFunctionReturn(PETSC_SUCCESS);
1171 }
1172 
1173 /*@
1174    PetscSFGetMultiSF - gets the inner `PetscSF` implementing gathers and scatters
1175 
1176    Collective
1177 
1178    Input Parameter:
1179 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1180 
1181    Output Parameter:
1182 .  multi - star forest with split roots, such that each root has degree exactly 1
1183 
1184    Level: developer
1185 
1186    Note:
1187    In most cases, users should use `PetscSFGatherBegin()` and `PetscSFScatterBegin()` instead of manipulating multi
1188    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1189    edge, it is a candidate for future optimization that might involve its removal.
1190 
1191 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`, `PetscSFComputeMultiRootOriginalNumbering()`
1192 @*/
1193 PetscErrorCode PetscSFGetMultiSF(PetscSF sf, PetscSF *multi)
1194 {
1195   PetscFunctionBegin;
1196   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1197   PetscValidPointer(multi, 2);
1198   if (sf->nroots < 0) { /* Graph has not been set yet; why do we need this? */
1199     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
1200     *multi           = sf->multi;
1201     sf->multi->multi = sf->multi;
1202     PetscFunctionReturn(PETSC_SUCCESS);
1203   }
1204   if (!sf->multi) {
1205     const PetscInt *indegree;
1206     PetscInt        i, *inoffset, *outones, *outoffset, maxlocal;
1207     PetscSFNode    *remote;
1208     maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
1209     PetscCall(PetscSFComputeDegreeBegin(sf, &indegree));
1210     PetscCall(PetscSFComputeDegreeEnd(sf, &indegree));
1211     PetscCall(PetscMalloc3(sf->nroots + 1, &inoffset, maxlocal, &outones, maxlocal, &outoffset));
1212     inoffset[0] = 0;
1213     for (i = 0; i < sf->nroots; i++) inoffset[i + 1] = inoffset[i] + indegree[i];
1214     for (i = 0; i < maxlocal; i++) outones[i] = 1;
1215     PetscCall(PetscSFFetchAndOpBegin(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
1216     PetscCall(PetscSFFetchAndOpEnd(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
1217     for (i = 0; i < sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
1218     if (PetscDefined(USE_DEBUG)) {                               /* Check that the expected number of increments occurred */
1219       for (i = 0; i < sf->nroots; i++) PetscCheck(inoffset[i] + indegree[i] == inoffset[i + 1], PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incorrect result after PetscSFFetchAndOp");
1220     }
1221     PetscCall(PetscMalloc1(sf->nleaves, &remote));
1222     for (i = 0; i < sf->nleaves; i++) {
1223       remote[i].rank  = sf->remote[i].rank;
1224       remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
1225     }
1226     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
1227     sf->multi->multi = sf->multi;
1228     PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
1229     if (sf->rankorder) { /* Sort the ranks */
1230       PetscMPIInt  rank;
1231       PetscInt    *inranks, *newoffset, *outranks, *newoutoffset, *tmpoffset, maxdegree;
1232       PetscSFNode *newremote;
1233       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
1234       for (i = 0, maxdegree = 0; i < sf->nroots; i++) maxdegree = PetscMax(maxdegree, indegree[i]);
1235       PetscCall(PetscMalloc5(sf->multi->nroots, &inranks, sf->multi->nroots, &newoffset, maxlocal, &outranks, maxlocal, &newoutoffset, maxdegree, &tmpoffset));
1236       for (i = 0; i < maxlocal; i++) outranks[i] = rank;
1237       PetscCall(PetscSFReduceBegin(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
1238       PetscCall(PetscSFReduceEnd(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
1239       /* Sort the incoming ranks at each vertex, build the inverse map */
1240       for (i = 0; i < sf->nroots; i++) {
1241         PetscInt j;
1242         for (j = 0; j < indegree[i]; j++) tmpoffset[j] = j;
1243         PetscCall(PetscSortIntWithArray(indegree[i], inranks + inoffset[i], tmpoffset));
1244         for (j = 0; j < indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
1245       }
1246       PetscCall(PetscSFBcastBegin(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
1247       PetscCall(PetscSFBcastEnd(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
1248       PetscCall(PetscMalloc1(sf->nleaves, &newremote));
1249       for (i = 0; i < sf->nleaves; i++) {
1250         newremote[i].rank  = sf->remote[i].rank;
1251         newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
1252       }
1253       PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, newremote, PETSC_OWN_POINTER));
1254       PetscCall(PetscFree5(inranks, newoffset, outranks, newoutoffset, tmpoffset));
1255     }
1256     PetscCall(PetscFree3(inoffset, outones, outoffset));
1257   }
1258   *multi = sf->multi;
1259   PetscFunctionReturn(PETSC_SUCCESS);
1260 }
1261 
1262 /*@C
1263    PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots of a `PetscSF`, does not remap indices
1264 
1265    Collective
1266 
1267    Input Parameters:
1268 +  sf - original star forest
1269 .  nselected  - number of selected roots on this process
1270 -  selected   - indices of the selected roots on this process
1271 
1272    Output Parameter:
1273 .  esf - new star forest
1274 
1275    Level: advanced
1276 
1277    Note:
1278    To use the new `PetscSF`, it may be necessary to know the indices of the leaves that are still participating. This can
1279    be done by calling `PetscSFGetGraph()`.
1280 
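   Example Usage:
   A minimal sketch (illustrative only), assuming this process has at least two roots: keep only the edges to roots 0
   and 1, then query which leaves still participate.
.vb
    PetscInt        selected[2] = {0, 1}, nleaves;
    const PetscInt *ilocal;
    PetscSF         esf;

    PetscCall(PetscSFCreateEmbeddedRootSF(sf, 2, selected, &esf));
    PetscCall(PetscSFGetGraph(esf, NULL, &nleaves, &ilocal, NULL));
    // ilocal[] (or 0..nleaves-1 when ilocal is NULL) gives the leaf locations still participating
    PetscCall(PetscSFDestroy(&esf));
.ve
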
1281 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1282 @*/
1283 PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *esf)
1284 {
1285   PetscInt           i, j, n, nroots, nleaves, esf_nleaves, *new_ilocal, minleaf, maxleaf, maxlocal;
1286   const PetscInt    *ilocal;
1287   signed char       *rootdata, *leafdata, *leafmem;
1288   const PetscSFNode *iremote;
1289   PetscSFNode       *new_iremote;
1290   MPI_Comm           comm;
1291 
1292   PetscFunctionBegin;
1293   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1294   PetscSFCheckGraphSet(sf, 1);
1295   if (nselected) PetscValidIntPointer(selected, 3);
1296   PetscValidPointer(esf, 4);
1297 
1298   PetscCall(PetscSFSetUp(sf));
1299   PetscCall(PetscLogEventBegin(PETSCSF_EmbedSF, sf, 0, 0, 0));
1300   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1301   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
1302 
1303   if (PetscDefined(USE_DEBUG)) { /* Error out if selected[] has dups or out-of-range indices */
1304     PetscBool dups;
1305     PetscCall(PetscCheckDupsInt(nselected, selected, &dups));
1306     PetscCheck(!dups, comm, PETSC_ERR_ARG_WRONG, "selected[] has dups");
1307       for (i = 0; i < nselected; i++) PetscCheck(selected[i] >= 0 && selected[i] < nroots, comm, PETSC_ERR_ARG_OUTOFRANGE, "selected root index %" PetscInt_FMT " is out of [0,%" PetscInt_FMT ")", selected[i], nroots);
1308   }
1309 
1310   if (sf->ops->CreateEmbeddedRootSF) PetscUseTypeMethod(sf, CreateEmbeddedRootSF, nselected, selected, esf);
1311   else {
1312     /* A generic version of creating embedded sf */
1313     PetscCall(PetscSFGetLeafRange(sf, &minleaf, &maxleaf));
1314     maxlocal = maxleaf - minleaf + 1;
1315     PetscCall(PetscCalloc2(nroots, &rootdata, maxlocal, &leafmem));
1316     leafdata = leafmem - minleaf;
1317     /* Tag selected roots and bcast to leaves */
1318     for (i = 0; i < nselected; i++) rootdata[selected[i]] = 1;
1319     PetscCall(PetscSFBcastBegin(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
1320     PetscCall(PetscSFBcastEnd(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
1321 
1322     /* Build esf with leaves that are still connected */
1323     esf_nleaves = 0;
1324     for (i = 0; i < nleaves; i++) {
1325       j = ilocal ? ilocal[i] : i;
1326       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1327          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1328       */
1329       esf_nleaves += (leafdata[j] ? 1 : 0);
1330     }
1331     PetscCall(PetscMalloc1(esf_nleaves, &new_ilocal));
1332     PetscCall(PetscMalloc1(esf_nleaves, &new_iremote));
1333     for (i = n = 0; i < nleaves; i++) {
1334       j = ilocal ? ilocal[i] : i;
1335       if (leafdata[j]) {
1336         new_ilocal[n]        = j;
1337         new_iremote[n].rank  = iremote[i].rank;
1338         new_iremote[n].index = iremote[i].index;
1339         ++n;
1340       }
1341     }
1342     PetscCall(PetscSFCreate(comm, esf));
1343     PetscCall(PetscSFSetFromOptions(*esf));
1344     PetscCall(PetscSFSetGraph(*esf, nroots, esf_nleaves, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1345     PetscCall(PetscFree2(rootdata, leafmem));
1346   }
1347   PetscCall(PetscLogEventEnd(PETSCSF_EmbedSF, sf, 0, 0, 0));
1348   PetscFunctionReturn(PETSC_SUCCESS);
1349 }
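/* Illustrative usage (editor's sketch, not part of the original source): keep only the
   even-numbered local roots and query how many leaves still participate. `sf` is assumed
   to be an already set-up PetscSF with `nroots` local roots.

     PetscInt  i, nsel = 0, *sel, nleaves_embedded;
     PetscSF   esf;

     PetscCall(PetscMalloc1((nroots + 1) / 2, &sel));
     for (i = 0; i < nroots; i += 2) sel[nsel++] = i;
     PetscCall(PetscSFCreateEmbeddedRootSF(sf, nsel, sel, &esf));
     PetscCall(PetscSFGetGraph(esf, NULL, &nleaves_embedded, NULL, NULL)); // leaves still participating
     PetscCall(PetscSFDestroy(&esf));
     PetscCall(PetscFree(sel));
*/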
1350 
1351 /*@C
1352   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves of a `PetscSF`, does not remap indices
1353 
1354   Collective
1355 
1356   Input Parameters:
1357 + sf - original star forest
1358 . nselected  - number of selected leaves on this process
1359 - selected   - indices of the selected leaves on this process
1360 
1361   Output Parameter:
1362 .  newsf - new star forest
1363 
1364   Level: advanced
1365 
1366 .seealso: `PetscSF`, `PetscSFCreateEmbeddedRootSF()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1367 @*/
1368 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *newsf)
1369 {
1370   const PetscSFNode *iremote;
1371   PetscSFNode       *new_iremote;
1372   const PetscInt    *ilocal;
1373   PetscInt           i, nroots, *leaves, *new_ilocal;
1374   MPI_Comm           comm;
1375 
1376   PetscFunctionBegin;
1377   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1378   PetscSFCheckGraphSet(sf, 1);
1379   if (nselected) PetscValidIntPointer(selected, 3);
1380   PetscValidPointer(newsf, 4);
1381 
1382   /* Uniq selected[] and put results in leaves[] */
1383   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1384   PetscCall(PetscMalloc1(nselected, &leaves));
1385   PetscCall(PetscArraycpy(leaves, selected, nselected));
1386   PetscCall(PetscSortRemoveDupsInt(&nselected, leaves));
1387   PetscCheck(!nselected || !(leaves[0] < 0 || leaves[nselected - 1] >= sf->nleaves), comm, PETSC_ERR_ARG_OUTOFRANGE, "Min/Max leaf indices %" PetscInt_FMT "/%" PetscInt_FMT " are not in [0,%" PetscInt_FMT ")", leaves[0], leaves[nselected - 1], sf->nleaves);
1388 
1389   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1390   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) PetscUseTypeMethod(sf, CreateEmbeddedLeafSF, nselected, leaves, newsf);
1391   else {
1392     PetscCall(PetscSFGetGraph(sf, &nroots, NULL, &ilocal, &iremote));
1393     PetscCall(PetscMalloc1(nselected, &new_ilocal));
1394     PetscCall(PetscMalloc1(nselected, &new_iremote));
1395     for (i = 0; i < nselected; ++i) {
1396       const PetscInt l     = leaves[i];
1397       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1398       new_iremote[i].rank  = iremote[l].rank;
1399       new_iremote[i].index = iremote[l].index;
1400     }
1401     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, newsf));
1402     PetscCall(PetscSFSetGraph(*newsf, nroots, nselected, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1403   }
1404   PetscCall(PetscFree(leaves));
1405   PetscFunctionReturn(PETSC_SUCCESS);
1406 }
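/* Illustrative usage (editor's sketch, not part of the original source): restrict `sf` to its
   first two local leaves (indices into the leaf list); `sf` is assumed to have at least two leaves.

     const PetscInt keep[2] = {0, 1};
     PetscSF        lsf;

     PetscCall(PetscSFCreateEmbeddedLeafSF(sf, 2, keep, &lsf));
     // communicate on lsf as usual; leaf indices are not remapped
     PetscCall(PetscSFDestroy(&lsf));
*/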
1407 
1408 /*@C
1409    PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to `PetscSFBcastEnd()`
1410 
1411    Collective
1412 
1413    Input Parameters:
1414 +  sf - star forest on which to communicate
1415 .  unit - data type associated with each node
1416 .  rootdata - buffer to broadcast
1417 -  op - operation to use for reduction
1418 
1419    Output Parameter:
1420 .  leafdata - buffer to be reduced with values from each leaf's respective root
1421 
1422    Level: intermediate
1423 
1424    Note:
1425     When PETSc is configured with device support, it uses its own mechanism to determine whether the given data pointers
1426     are host or device pointers, which may incur a noticeable cost. If you already know the memory types, use
1427     `PetscSFBcastWithMemTypeBegin()` instead.
1428 
1429 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastWithMemTypeBegin()`
1430 @*/
1431 PetscErrorCode PetscSFBcastBegin(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
1432 {
1433   PetscMemType rootmtype, leafmtype;
1434 
1435   PetscFunctionBegin;
1436   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1437   PetscCall(PetscSFSetUp(sf));
1438   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
1439   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1440   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1441   PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
1442   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
1443   PetscFunctionReturn(PETSC_SUCCESS);
1444 }
1445 
1446 /*@C
1447    PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call
1448    to `PetscSFBcastEnd()`
1449 
1450    Collective
1451 
1452    Input Parameters:
1453 +  sf - star forest on which to communicate
1454 .  unit - data type associated with each node
1455 .  rootmtype - memory type of rootdata
1456 .  rootdata - buffer to broadcast
1457 .  leafmtype - memory type of leafdata
1458 -  op - operation to use for reduction
1459 
1460    Output Parameter:
1461 .  leafdata - buffer to be reduced with values from each leaf's respective root
1462 
1463    Level: intermediate
1464 
1465 .seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastBegin()`
1466 @*/
1467 PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
1468 {
1469   PetscFunctionBegin;
1470   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1471   PetscCall(PetscSFSetUp(sf));
1472   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
1473   PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
1474   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
1475   PetscFunctionReturn(PETSC_SUCCESS);
1476 }
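/* Illustrative usage (editor's sketch, not part of the original source): when both buffers are
   known to live on the host, the memory-type probe can be skipped by stating the types explicitly.
   `sf`, `rootdata`, and `leafdata` are assumed to exist with matching sizes.

     PetscCall(PetscSFBcastWithMemTypeBegin(sf, MPIU_SCALAR, PETSC_MEMTYPE_HOST, rootdata, PETSC_MEMTYPE_HOST, leafdata, MPI_REPLACE));
     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));
*/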
1477 
1478 /*@C
1479    PetscSFBcastEnd - end a broadcast and reduce operation started with `PetscSFBcastBegin()` or `PetscSFBcastWithMemTypeBegin()`
1480 
1481    Collective
1482 
1483    Input Parameters:
1484 +  sf - star forest
1485 .  unit - data type
1486 .  rootdata - buffer to broadcast
1487 -  op - operation to use for reduction
1488 
1489    Output Parameter:
1490 .  leafdata - buffer to be reduced with values from each leaf's respective root
1491 
1492    Level: intermediate
1493 
1494 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFReduceEnd()`
1495 @*/
1496 PetscErrorCode PetscSFBcastEnd(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
1497 {
1498   PetscFunctionBegin;
1499   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1500   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastEnd, sf, 0, 0, 0));
1501   PetscUseTypeMethod(sf, BcastEnd, unit, rootdata, leafdata, op);
1502   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastEnd, sf, 0, 0, 0));
1503   PetscFunctionReturn(PETSC_SUCCESS);
1504 }
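/* Illustrative usage (editor's sketch, not part of the original source): a broadcast always comes
   as a Begin/End pair; the overwrite variant uses MPI_REPLACE. `sf` is assumed set up, with
   `rootdata` of length nroots and `leafdata` covering the leaf range.

     PetscCall(PetscSFBcastBegin(sf, MPIU_INT, rootdata, leafdata, MPI_REPLACE));
     // unrelated local work may overlap the communication here
     PetscCall(PetscSFBcastEnd(sf, MPIU_INT, rootdata, leafdata, MPI_REPLACE));
*/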
1505 
1506 /*@C
1507    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to `PetscSFReduceEnd()`
1508 
1509    Collective
1510 
1511    Input Parameters:
1512 +  sf - star forest
1513 .  unit - data type
1514 .  leafdata - values to reduce
1515 -  op - reduction operation
1516 
1517    Output Parameter:
1518 .  rootdata - result of reduction of values from all leaves of each root
1519 
1520    Level: intermediate
1521 
1522    Note:
1523     When PETSc is configured with device support, it uses its own mechanism to determine whether the given data pointers
1524     are host or device pointers, which may incur a noticeable cost. If you already know the memory types, use
1525     `PetscSFReduceWithMemTypeBegin()` instead.
1526 
1527 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceWithMemTypeBegin()`, `PetscSFReduceEnd()`
1528 @*/
1529 PetscErrorCode PetscSFReduceBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1530 {
1531   PetscMemType rootmtype, leafmtype;
1532 
1533   PetscFunctionBegin;
1534   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1535   PetscCall(PetscSFSetUp(sf));
1536   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1537   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1538   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1539   PetscCall((sf->ops->ReduceBegin)(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1540   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1541   PetscFunctionReturn(PETSC_SUCCESS);
1542 }
1543 
1544 /*@C
1545    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to `PetscSFReduceEnd()`
1546 
1547    Collective
1548 
1549    Input Parameters:
1550 +  sf - star forest
1551 .  unit - data type
1552 .  leafmtype - memory type of leafdata
1553 .  leafdata - values to reduce
1554 .  rootmtype - memory type of rootdata
1555 -  op - reduction operation
1556 
1557    Output Parameter:
1558 .  rootdata - result of reduction of values from all leaves of each root
1559 
1560    Level: intermediate
1561 
1562 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceBegin()`, `PetscSFReduceEnd()`
1563 @*/
1564 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
1565 {
1566   PetscFunctionBegin;
1567   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1568   PetscCall(PetscSFSetUp(sf));
1569   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1570   PetscCall((sf->ops->ReduceBegin)(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1571   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1572   PetscFunctionReturn(PETSC_SUCCESS);
1573 }
1574 
1575 /*@C
1576    PetscSFReduceEnd - end a reduction operation started with `PetscSFReduceBegin()` or `PetscSFReduceWithMemTypeBegin()`
1577 
1578    Collective
1579 
1580    Input Parameters:
1581 +  sf - star forest
1582 .  unit - data type
1583 .  leafdata - values to reduce
1584 -  op - reduction operation
1585 
1586    Output Parameter:
1587 .  rootdata - result of reduction of values from all leaves of each root
1588 
1589    Level: intermediate
1590 
1591 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFBcastEnd()`, `PetscSFReduceBegin()`, `PetscSFReduceWithMemTypeBegin()`
1592 @*/
1593 PetscErrorCode PetscSFReduceEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1594 {
1595   PetscFunctionBegin;
1596   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1597   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceEnd, sf, 0, 0, 0));
1598   PetscUseTypeMethod(sf, ReduceEnd, unit, leafdata, rootdata, op);
1599   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceEnd, sf, 0, 0, 0));
1600   PetscFunctionReturn(PETSC_SUCCESS);
1601 }
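/* Illustrative usage (editor's sketch, not part of the original source): sum leaf contributions
   into their roots. `sf` is assumed set up, `leafdata` covers the leaf range, and `rootdata` has
   nroots entries that are zeroed beforehand, since MPI_SUM combines with the existing root values.

     PetscCall(PetscSFReduceBegin(sf, MPIU_SCALAR, leafdata, rootdata, MPI_SUM));
     PetscCall(PetscSFReduceEnd(sf, MPIU_SCALAR, leafdata, rootdata, MPI_SUM));
*/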
1602 
1603 /*@C
1604    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value,
1605    to be completed with `PetscSFFetchAndOpEnd()`
1606 
1607    Collective
1608 
1609    Input Parameters:
1610 +  sf - star forest
1611 .  unit - data type
1612 .  leafdata - leaf values to use in reduction
1613 -  op - operation to use for reduction
1614 
1615    Output Parameters:
1616 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1617 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1618 
1619    Level: advanced
1620 
1621    Note:
1622    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1623    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1624    not guaranteed across the whole vertex. Therefore, this function is typically used only with primitive types such as
1625    integers.
1626 
1627 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`
1628 @*/
1629 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1630 {
1631   PetscMemType rootmtype, leafmtype, leafupdatemtype;
1632 
1633   PetscFunctionBegin;
1634   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1635   PetscCall(PetscSFSetUp(sf));
1636   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1637   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1638   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1639   PetscCall(PetscGetMemType(leafupdate, &leafupdatemtype));
1640   PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
1641   PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
1642   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1643   PetscFunctionReturn(PETSC_SUCCESS);
1644 }
1645 
1646 /*@C
1647    PetscSFFetchAndOpWithMemTypeBegin - begin operation with explicit memory types that fetches values from root and updates atomically by
1648    applying operation using my leaf value, to be completed with `PetscSFFetchAndOpEnd()`
1649 
1650    Collective
1651 
1652    Input Parameters:
1653 +  sf - star forest
1654 .  unit - data type
1655 .  rootmtype - memory type of rootdata
1656 .  leafmtype - memory type of leafdata
1657 .  leafdata - leaf values to use in reduction
1658 .  leafupdatemtype - memory type of leafupdate
1659 -  op - operation to use for reduction
1660 
1661    Output Parameters:
1662 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1663 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1664 
1665    Level: advanced
1666 
1667    Note:
1668    See `PetscSFFetchAndOpBegin()` for more details.
1669 
1670 .seealso: `PetscSF`, `PetscSFFetchAndOpBegin()`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`, `PetscSFFetchAndOpEnd()`
1671 @*/
1672 PetscErrorCode PetscSFFetchAndOpWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, PetscMemType leafupdatemtype, void *leafupdate, MPI_Op op)
1673 {
1674   PetscFunctionBegin;
1675   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1676   PetscCall(PetscSFSetUp(sf));
1677   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1678   PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
1679   PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
1680   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
1681   PetscFunctionReturn(PETSC_SUCCESS);
1682 }
1683 
1684 /*@C
1685    PetscSFFetchAndOpEnd - end operation started in matching call to `PetscSFFetchAndOpBegin()` or `PetscSFFetchAndOpWithMemTypeBegin()`
1686    to fetch values from roots and update atomically by applying operation using my leaf value
1687 
1688    Collective
1689 
1690    Input Parameters:
1691 +  sf - star forest
1692 .  unit - data type
1693 .  leafdata - leaf values to use in reduction
1694 -  op - operation to use for reduction
1695 
1696    Output Parameters:
1697 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1698 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1699 
1700    Level: advanced
1701 
1702 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFReduceEnd()`, `PetscSFSetGraph()`, `PetscSFFetchAndOpBegin()`, `PetscSFFetchAndOpWithMemTypeBegin()`
1703 @*/
1704 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1705 {
1706   PetscFunctionBegin;
1707   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1708   PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
1709   PetscUseTypeMethod(sf, FetchAndOpEnd, unit, rootdata, leafdata, leafupdate, op);
1710   PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
1711   PetscFunctionReturn(PETSC_SUCCESS);
1712 }
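/* Illustrative usage (editor's sketch, not part of the original source): atomically add each
   leaf's value to its root and fetch the root value seen just before this leaf's own update,
   e.g. to assign unique offsets. `sf`, `rootdata`, `leafdata`, and `leafupdate` are assumed to
   be allocated with the appropriate sizes; `rootdata` holds the starting counters.

     PetscCall(PetscSFFetchAndOpBegin(sf, MPIU_INT, rootdata, leafdata, leafupdate, MPI_SUM));
     PetscCall(PetscSFFetchAndOpEnd(sf, MPIU_INT, rootdata, leafdata, leafupdate, MPI_SUM));
     // leafupdate[i] now holds the root's prior state for leaf i
*/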
1713 
1714 /*@C
1715    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with `PetscSFComputeDegreeEnd()`
1716 
1717    Collective
1718 
1719    Input Parameter:
1720 .  sf - star forest
1721 
1722    Output Parameter:
1723 .  degree - degree of each root vertex
1724 
1725    Level: advanced
1726 
1727    Note:
1728    The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence there is no need to call `PetscFree()` on it.
1729 
1730 .seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeEnd()`
1731 @*/
1732 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf, const PetscInt **degree)
1733 {
1734   PetscFunctionBegin;
1735   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1736   PetscSFCheckGraphSet(sf, 1);
1737   PetscValidPointer(degree, 2);
1738   if (!sf->degreeknown) {
1739     PetscInt i, nroots = sf->nroots, maxlocal;
1740     PetscCheck(!sf->degree, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1741     maxlocal = sf->maxleaf - sf->minleaf + 1;
1742     PetscCall(PetscMalloc1(nroots, &sf->degree));
1743     PetscCall(PetscMalloc1(PetscMax(maxlocal, 1), &sf->degreetmp)); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1744     for (i = 0; i < nroots; i++) sf->degree[i] = 0;
1745     for (i = 0; i < maxlocal; i++) sf->degreetmp[i] = 1;
1746     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
1747   }
1748   *degree = NULL;
1749   PetscFunctionReturn(PETSC_SUCCESS);
1750 }
1751 
1752 /*@C
1753    PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with `PetscSFComputeDegreeBegin()`
1754 
1755    Collective
1756 
1757    Input Parameter:
1758 .  sf - star forest
1759 
1760    Output Parameter:
1761 .  degree - degree of each root vertex
1762 
1763    Level: developer
1764 
1765    Note:
1766    The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence there is no need to call `PetscFree()` on it.
1767 
1768 .seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeBegin()`
1769 @*/
1770 PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf, const PetscInt **degree)
1771 {
1772   PetscFunctionBegin;
1773   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1774   PetscSFCheckGraphSet(sf, 1);
1775   PetscValidPointer(degree, 2);
1776   if (!sf->degreeknown) {
1777     PetscCheck(sf->degreetmp, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
1778     PetscCall(PetscSFReduceEnd(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
1779     PetscCall(PetscFree(sf->degreetmp));
1780     sf->degreeknown = PETSC_TRUE;
1781   }
1782   *degree = sf->degree;
1783   PetscFunctionReturn(PETSC_SUCCESS);
1784 }
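/* Illustrative usage (editor's sketch, not part of the original source): the degree array is
   owned by the PetscSF, so it is only read here. `sf` is assumed set up with `nroots` local roots.

     const PetscInt *degree;
     PetscInt        i, total = 0;

     PetscCall(PetscSFComputeDegreeBegin(sf, &degree));
     PetscCall(PetscSFComputeDegreeEnd(sf, &degree));
     for (i = 0; i < nroots; i++) total += degree[i]; // total number of leaves attached to local roots
*/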
1785 
1786 /*@C
1787    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-`PetscSF` returned by `PetscSFGetMultiSF()`).
1788    Each multi-root is assigned the index of the corresponding original root.
1789 
1790    Collective
1791 
1792    Input Parameters:
1793 +  sf - star forest
1794 -  degree - degree of each root vertex, computed with `PetscSFComputeDegreeBegin()`/`PetscSFComputeDegreeEnd()`
1795 
1796    Output Parameters:
1797 +  nMultiRoots - (optional) number of multi-roots (roots of multi-`PetscSF`)
1798 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is `nMultiRoots`
1799 
1800    Level: developer
1801 
1802    Note:
1803    The returned array `multiRootsOrigNumbering` is newly allocated and should be destroyed with `PetscFree()` when no longer needed.
1804 
1805 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFComputeDegreeEnd()`, `PetscSFGetMultiSF()`
1806 @*/
1807 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1808 {
1809   PetscSF  msf;
1810   PetscInt i, j, k, nroots, nmroots;
1811 
1812   PetscFunctionBegin;
1813   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1814   PetscCall(PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL));
1815   if (nroots) PetscValidIntPointer(degree, 2);
1816   if (nMultiRoots) PetscValidIntPointer(nMultiRoots, 3);
1817   PetscValidPointer(multiRootsOrigNumbering, 4);
1818   PetscCall(PetscSFGetMultiSF(sf, &msf));
1819   PetscCall(PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL));
1820   PetscCall(PetscMalloc1(nmroots, multiRootsOrigNumbering));
1821   for (i = 0, j = 0, k = 0; i < nroots; i++) {
1822     if (!degree[i]) continue;
1823     for (j = 0; j < degree[i]; j++, k++) (*multiRootsOrigNumbering)[k] = i;
1824   }
1825   PetscCheck(k == nmroots, PETSC_COMM_SELF, PETSC_ERR_PLIB, "sanity check fail");
1826   if (nMultiRoots) *nMultiRoots = nmroots;
1827   PetscFunctionReturn(PETSC_SUCCESS);
1828 }
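/* Illustrative usage (editor's sketch, not part of the original source): map each multi-root back
   to the original root it expands, typically after gathering per-leaf data. `sf` is assumed set up
   and `degree` obtained from PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd().

     PetscInt  nmroots, *morig;

     PetscCall(PetscSFComputeMultiRootOriginalNumbering(sf, degree, &nmroots, &morig));
     // morig[k] is the original root index owning multi-root k
     PetscCall(PetscFree(morig));
*/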
1829 
1830 /*@C
1831    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with `PetscSFGatherEnd()`
1832 
1833    Collective
1834 
1835    Input Parameters:
1836 +  sf - star forest
1837 .  unit - data type
1838 -  leafdata - leaf data to gather to roots
1839 
1840    Output Parameter:
1841 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1842 
1843    Level: intermediate
1844 
1845 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterBegin()`
1846 @*/
1847 PetscErrorCode PetscSFGatherBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1848 {
1849   PetscSF multi = NULL;
1850 
1851   PetscFunctionBegin;
1852   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1853   PetscCall(PetscSFSetUp(sf));
1854   PetscCall(PetscSFGetMultiSF(sf, &multi));
1855   PetscCall(PetscSFReduceBegin(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1856   PetscFunctionReturn(PETSC_SUCCESS);
1857 }
1858 
1859 /*@C
1860    PetscSFGatherEnd - ends pointwise gather operation that was started with `PetscSFGatherBegin()`
1861 
1862    Collective
1863 
1864    Input Parameters:
1865 +  sf - star forest
1866 .  unit - data type
1867 -  leafdata - leaf data to gather to roots
1868 
1869    Output Parameter:
1870 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1871 
1872    Level: intermediate
1873 
1874 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterEnd()`
1875 @*/
1876 PetscErrorCode PetscSFGatherEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1877 {
1878   PetscSF multi = NULL;
1879 
1880   PetscFunctionBegin;
1881   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1882   PetscCall(PetscSFGetMultiSF(sf, &multi));
1883   PetscCall(PetscSFReduceEnd(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1884   PetscFunctionReturn(PETSC_SUCCESS);
1885 }
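/* Illustrative usage (editor's sketch, not part of the original source): gather one value from
   every leaf into its root, with space per root equal to that root's degree. `sf` is assumed set
   up with `nroots` local roots and `leafdata` covering the leaf range.

     const PetscInt *degree;
     PetscInt        i, nmulti = 0;
     PetscScalar    *multirootdata;

     PetscCall(PetscSFComputeDegreeBegin(sf, &degree));
     PetscCall(PetscSFComputeDegreeEnd(sf, &degree));
     for (i = 0; i < nroots; i++) nmulti += degree[i]; // total gather space
     PetscCall(PetscMalloc1(nmulti, &multirootdata));
     PetscCall(PetscSFGatherBegin(sf, MPIU_SCALAR, leafdata, multirootdata));
     PetscCall(PetscSFGatherEnd(sf, MPIU_SCALAR, leafdata, multirootdata));
     PetscCall(PetscFree(multirootdata));
*/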
1886 
1887 /*@C
1888    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with `PetscSFScatterEnd()`
1889 
1890    Collective
1891 
1892    Input Parameters:
1893 +  sf - star forest
1894 .  unit - data type
1895 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1896 
1897    Output Parameter:
1898 .  leafdata - leaf data to be updated with personal data from each respective root
1899 
1900    Level: intermediate
1901 
1902 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterEnd()`
1903 @*/
1904 PetscErrorCode PetscSFScatterBegin(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1905 {
1906   PetscSF multi = NULL;
1907 
1908   PetscFunctionBegin;
1909   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1910   PetscCall(PetscSFSetUp(sf));
1911   PetscCall(PetscSFGetMultiSF(sf, &multi));
1912   PetscCall(PetscSFBcastBegin(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1913   PetscFunctionReturn(PETSC_SUCCESS);
1914 }
1915 
1916 /*@C
1917    PetscSFScatterEnd - ends pointwise scatter operation that was started with `PetscSFScatterBegin()`
1918 
1919    Collective
1920 
1921    Input Parameters:
1922 +  sf - star forest
1923 .  unit - data type
1924 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1925 
1926    Output Parameter:
1927 .  leafdata - leaf data to be updated with personal data from each respective root
1928 
1929    Level: intermediate
1930 
1931 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterBegin()`
1932 @*/
1933 PetscErrorCode PetscSFScatterEnd(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1934 {
1935   PetscSF multi = NULL;
1936 
1937   PetscFunctionBegin;
1938   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1939   PetscCall(PetscSFGetMultiSF(sf, &multi));
1940   PetscCall(PetscSFBcastEnd(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1941   PetscFunctionReturn(PETSC_SUCCESS);
1942 }
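/* Illustrative usage (editor's sketch, not part of the original source): scatter is the inverse of
   gather, sending one distinct unit from the multi-root buffer to each leaf. `sf`, `multirootdata`
   (sized as in the gather sketch above), and `leafdata` are assumed to exist.

     PetscCall(PetscSFScatterBegin(sf, MPIU_SCALAR, multirootdata, leafdata));
     PetscCall(PetscSFScatterEnd(sf, MPIU_SCALAR, multirootdata, leafdata));
*/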
1943 
1944 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1945 {
1946   PetscInt        i, n, nleaves;
1947   const PetscInt *ilocal = NULL;
1948   PetscHSetI      seen;
1949 
1950   PetscFunctionBegin;
1951   if (PetscDefined(USE_DEBUG)) {
1952     PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, NULL));
1953     PetscCall(PetscHSetICreate(&seen));
1954     for (i = 0; i < nleaves; i++) {
1955       const PetscInt leaf = ilocal ? ilocal[i] : i;
1956       PetscCall(PetscHSetIAdd(seen, leaf));
1957     }
1958     PetscCall(PetscHSetIGetSize(seen, &n));
1959     PetscCheck(n == nleaves, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided leaves have repeated values: all leaves must be unique");
1960     PetscCall(PetscHSetIDestroy(&seen));
1961   }
1962   PetscFunctionReturn(PETSC_SUCCESS);
1963 }
1964 
1965 /*@
1966   PetscSFCompose - Compose a new `PetscSF` by putting the second `PetscSF` under the first one in a top (roots) down (leaves) view
1967 
1968   Input Parameters:
1969 + sfA - The first `PetscSF`
1970 - sfB - The second `PetscSF`
1971 
1972   Output Parameter:
1973 . sfBA - The composite `PetscSF`
1974 
1975   Level: developer
1976 
1977   Notes:
1978   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
1979   forests, i.e. the same leaf is not connected with different roots.
1980 
1981   `sfA`'s leaf space and `sfB`'s root space might be partially overlapped. The composition builds
1982   a graph with `sfA`'s roots and `sfB`'s leaves only when there is a path between them. Unconnected
1983   nodes (roots or leaves) are not in `sfBA`. Doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on the new `PetscSF` is equivalent to doing a
1984   `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on `sfA`, then a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on `sfB`, on connected nodes.
1985 
1986 .seealso: `PetscSF`, `PetscSFComposeInverse()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`
1987 @*/
1988 PetscErrorCode PetscSFCompose(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
1989 {
1990   const PetscSFNode *remotePointsA, *remotePointsB;
1991   PetscSFNode       *remotePointsBA = NULL, *reorderedRemotePointsA = NULL, *leafdataB;
1992   const PetscInt    *localPointsA, *localPointsB;
1993   PetscInt          *localPointsBA;
1994   PetscInt           i, numRootsA, numLeavesA, numRootsB, numLeavesB, minleaf, maxleaf, numLeavesBA;
1995   PetscBool          denseB;
1996 
1997   PetscFunctionBegin;
1998   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
1999   PetscSFCheckGraphSet(sfA, 1);
2000   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
2001   PetscSFCheckGraphSet(sfB, 2);
2002   PetscCheckSameComm(sfA, 1, sfB, 2);
2003   PetscValidPointer(sfBA, 3);
2004   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
2005   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
2006 
2007   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
2008   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
2009   /* Make sure that PetscSFBcast{Begin, End}(sfB, ...) works with root data of size
2010      numRootsB; otherwise, garbage will be broadcasted.
2011      Example (comm size = 1):
2012      sfA: 0 <- (0, 0)
2013      sfB: 100 <- (0, 0)
2014           101 <- (0, 1)
2015      Here, we have remotePointsA = [(0, 0)], but for remotePointsA to be a valid target
2016      of sfB, it has to be recast as [(0, 0), (-1, -1)] so that points 100 and 101 would
2017      receive (0, 0) and (-1, -1), respectively, when PetscSFBcast(sfB, ...) is called on
2018      remotePointsA; if not recast, point 101 would receive a garbage value.             */
2019   PetscCall(PetscMalloc1(numRootsB, &reorderedRemotePointsA));
2020   for (i = 0; i < numRootsB; i++) {
2021     reorderedRemotePointsA[i].rank  = -1;
2022     reorderedRemotePointsA[i].index = -1;
2023   }
2024   for (i = 0; i < numLeavesA; i++) {
2025     PetscInt localp = localPointsA ? localPointsA[i] : i;
2026 
2027     if (localp >= numRootsB) continue;
2028     reorderedRemotePointsA[localp] = remotePointsA[i];
2029   }
2030   remotePointsA = reorderedRemotePointsA;
2031   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2032   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &leafdataB));
2033   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2034     leafdataB[i].rank  = -1;
2035     leafdataB[i].index = -1;
2036   }
2037   PetscCall(PetscSFBcastBegin(sfB, MPIU_2INT, remotePointsA, leafdataB - minleaf, MPI_REPLACE));
2038   PetscCall(PetscSFBcastEnd(sfB, MPIU_2INT, remotePointsA, leafdataB - minleaf, MPI_REPLACE));
2039   PetscCall(PetscFree(reorderedRemotePointsA));
2040 
2041   denseB = (PetscBool)!localPointsB;
2042   for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2043     if (leafdataB[localPointsB ? localPointsB[i] - minleaf : i].rank == -1) denseB = PETSC_FALSE;
2044     else numLeavesBA++;
2045   }
2046   if (denseB) {
2047     localPointsBA  = NULL;
2048     remotePointsBA = leafdataB;
2049   } else {
2050     PetscCall(PetscMalloc1(numLeavesBA, &localPointsBA));
2051     PetscCall(PetscMalloc1(numLeavesBA, &remotePointsBA));
2052     for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2053       const PetscInt l = localPointsB ? localPointsB[i] : i;
2054 
2055       if (leafdataB[l - minleaf].rank == -1) continue;
2056       remotePointsBA[numLeavesBA] = leafdataB[l - minleaf];
2057       localPointsBA[numLeavesBA]  = l;
2058       numLeavesBA++;
2059     }
2060     PetscCall(PetscFree(leafdataB));
2061   }
2062   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2063   PetscCall(PetscSFSetFromOptions(*sfBA));
2064   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2065   PetscFunctionReturn(PETSC_SUCCESS);
2066 }
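/* Illustrative usage (editor's sketch, not part of the original source): composing sfA (roots -> leaves)
   with sfB (whose roots live in sfA's leaf space) yields a single PetscSF whose broadcast is equivalent
   to broadcasting over sfA and then over sfB on connected nodes. `sfA` and `sfB` are assumed to satisfy
   the conditions stated in the notes above.

     PetscSF sfBA;

     PetscCall(PetscSFCompose(sfA, sfB, &sfBA));
     // PetscSFBcastBegin/End(sfBA, ...) ~ Bcast over sfA followed by Bcast over sfB
     PetscCall(PetscSFDestroy(&sfBA));
*/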
2067 
2068 /*@
2069   PetscSFComposeInverse - Compose a new `PetscSF` by putting the inverse of the second `PetscSF` under the first one
2070 
2071   Input Parameters:
2072 + sfA - The first `PetscSF`
2073 - sfB - The second `PetscSF`
2074 
2075   Output Parameter:
2076 . sfBA - The composite `PetscSF`.
2077 
2078   Level: developer
2079 
2080   Notes:
2081   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
2082   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
2083   second `PetscSF` must have a degree of 1, i.e., no roots have more than one leaf connected.
2084 
2085   `sfA`'s leaf space and `sfB`'s leaf space might be partially overlapped. The composition builds
2086   a graph with `sfA`'s roots and `sfB`'s roots only when there is a path between them. Unconnected
2087   roots are not in `sfBA`. Doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on the new `PetscSF` is equivalent to doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()`
2088   on `sfA`, then
2089   a `PetscSFReduceBegin()`/`PetscSFReduceEnd()` on `sfB`, on connected roots.
2090 
2091 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFCreateInverseSF()`
2092 @*/
2093 PetscErrorCode PetscSFComposeInverse(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
2094 {
2095   const PetscSFNode *remotePointsA, *remotePointsB;
2096   PetscSFNode       *remotePointsBA;
2097   const PetscInt    *localPointsA, *localPointsB;
2098   PetscSFNode       *reorderedRemotePointsA = NULL;
2099   PetscInt           i, numRootsA, numLeavesA, numLeavesBA, numRootsB, numLeavesB, minleaf, maxleaf, *localPointsBA;
2100   MPI_Op             op;
2101 #if defined(PETSC_USE_64BIT_INDICES)
2102   PetscBool iswin;
2103 #endif
2104 
2105   PetscFunctionBegin;
2106   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
2107   PetscSFCheckGraphSet(sfA, 1);
2108   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
2109   PetscSFCheckGraphSet(sfB, 2);
2110   PetscCheckSameComm(sfA, 1, sfB, 2);
2111   PetscValidPointer(sfBA, 3);
2112   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
2113   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
2114 
2115   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
2116   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
2117 
2118   /* TODO: Check roots of sfB have degree of 1 */
2119   /* Once we implement it, we can replace the MPI_MAXLOC
2120      with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
2121      We use MPI_MAXLOC only to have a deterministic output from this routine if
2122      the root condition is not met.
2123    */
2124   op = MPI_MAXLOC;
2125 #if defined(PETSC_USE_64BIT_INDICES)
2126   /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
2127   PetscCall(PetscObjectTypeCompare((PetscObject)sfB, PETSCSFWINDOW, &iswin));
2128   if (iswin) op = MPI_REPLACE;
2129 #endif
2130 
2131   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2132   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &reorderedRemotePointsA));
2133   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2134     reorderedRemotePointsA[i].rank  = -1;
2135     reorderedRemotePointsA[i].index = -1;
2136   }
2137   if (localPointsA) {
2138     for (i = 0; i < numLeavesA; i++) {
2139       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
2140       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
2141     }
2142   } else {
2143     for (i = 0; i < numLeavesA; i++) {
2144       if (i > maxleaf || i < minleaf) continue;
2145       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
2146     }
2147   }
2148 
2149   PetscCall(PetscMalloc1(numRootsB, &localPointsBA));
2150   PetscCall(PetscMalloc1(numRootsB, &remotePointsBA));
2151   for (i = 0; i < numRootsB; i++) {
2152     remotePointsBA[i].rank  = -1;
2153     remotePointsBA[i].index = -1;
2154   }
2155 
2156   PetscCall(PetscSFReduceBegin(sfB, MPIU_2INT, reorderedRemotePointsA - minleaf, remotePointsBA, op));
2157   PetscCall(PetscSFReduceEnd(sfB, MPIU_2INT, reorderedRemotePointsA - minleaf, remotePointsBA, op));
2158   PetscCall(PetscFree(reorderedRemotePointsA));
2159   for (i = 0, numLeavesBA = 0; i < numRootsB; i++) {
2160     if (remotePointsBA[i].rank == -1) continue;
2161     remotePointsBA[numLeavesBA].rank  = remotePointsBA[i].rank;
2162     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
2163     localPointsBA[numLeavesBA]        = i;
2164     numLeavesBA++;
2165   }
2166   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2167   PetscCall(PetscSFSetFromOptions(*sfBA));
2168   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2169   PetscFunctionReturn(PETSC_SUCCESS);
2170 }
2171 
2172 /*
2173   PetscSFCreateLocalSF_Private - Creates a local `PetscSF` that only has intra-process edges of the global `PetscSF`
2174 
2175   Input Parameter:
2176 . sf - The global `PetscSF`
2177 
2178   Output Parameter:
2179 . out - The local `PetscSF`
2180 
2181 .seealso: `PetscSF`, `PetscSFCreate()`
2182  */
2183 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf, PetscSF *out)
2184 {
2185   MPI_Comm           comm;
2186   PetscMPIInt        myrank;
2187   const PetscInt    *ilocal;
2188   const PetscSFNode *iremote;
2189   PetscInt           i, j, nroots, nleaves, lnleaves, *lilocal;
2190   PetscSFNode       *liremote;
2191   PetscSF            lsf;
2192 
2193   PetscFunctionBegin;
2194   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2195   if (sf->ops->CreateLocalSF) PetscUseTypeMethod(sf, CreateLocalSF, out);
2196   else {
2197     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2198     PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
2199     PetscCallMPI(MPI_Comm_rank(comm, &myrank));
2200 
2201     /* Find out local edges and build a local SF */
2202     PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
2203     for (i = lnleaves = 0; i < nleaves; i++) {
2204       if (iremote[i].rank == (PetscInt)myrank) lnleaves++;
2205     }
2206     PetscCall(PetscMalloc1(lnleaves, &lilocal));
2207     PetscCall(PetscMalloc1(lnleaves, &liremote));
2208 
2209     for (i = j = 0; i < nleaves; i++) {
2210       if (iremote[i].rank == (PetscInt)myrank) {
2211         lilocal[j]        = ilocal ? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2212         liremote[j].rank  = 0;                      /* rank in PETSC_COMM_SELF */
2213         liremote[j].index = iremote[i].index;
2214         j++;
2215       }
2216     }
2217     PetscCall(PetscSFCreate(PETSC_COMM_SELF, &lsf));
2218     PetscCall(PetscSFSetFromOptions(lsf));
2219     PetscCall(PetscSFSetGraph(lsf, nroots, lnleaves, lilocal, PETSC_OWN_POINTER, liremote, PETSC_OWN_POINTER));
2220     PetscCall(PetscSFSetUp(lsf));
2221     *out = lsf;
2222   }
2223   PetscFunctionReturn(PETSC_SUCCESS);
2224 }
2225 
2226 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2227 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata)
2228 {
2229   PetscMemType rootmtype, leafmtype;
2230 
2231   PetscFunctionBegin;
2232   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2233   PetscCall(PetscSFSetUp(sf));
2234   PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
2235   PetscCall(PetscGetMemType(rootdata, &rootmtype));
2236   PetscCall(PetscGetMemType(leafdata, &leafmtype));
2237   PetscUseTypeMethod(sf, BcastToZero, unit, rootmtype, rootdata, leafmtype, leafdata);
2238   PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
2239   PetscFunctionReturn(PETSC_SUCCESS);
2240 }
2241 
2242 /*@
2243   PetscSFConcatenate - concatenate multiple `PetscSF` into one
2244 
2245   Input Parameters:
2246 + comm - the communicator
2247 . nsfs - the number of input `PetscSF`
2248 . sfs  - the array of input `PetscSF`
2249 . rootMode - the root mode specifying how roots are handled
2250 - leafOffsets - the array of local leaf offsets, one for each input `PetscSF`, or `NULL` for contiguous storage
2251 
2252   Output Parameter:
2253 . newsf - The resulting `PetscSF`
2254 
2255   Level: advanced
2256 
2257   Notes:
2258   The communicator of all `PetscSF`s in `sfs` must be `comm`.
2259 
2260   Leaves are always concatenated locally, keeping them ordered by the input `PetscSF` index and original local order.
2261 
2262   The offsets in `leafOffsets` are added to the original leaf indices.
2263 
2264   If all input SFs use contiguous leaf storage (`ilocal` = `NULL`), `leafOffsets` can be passed as `NULL` as well.
2265   In this case, `NULL` is also passed as `ilocal` to the resulting `PetscSF`.
2266 
2267   If any input `PetscSF` has non-null `ilocal`, `leafOffsets` is needed to distinguish leaves from different input `PetscSF`s.
2268   In this case, the user is responsible for providing correct offsets so that the resulting leaves are unique (otherwise an error occurs).
2269 
2270   All root modes retain the essential connectivity condition.
2271   If two leaves of the same input `PetscSF` are connected (sharing the same root), they are also connected in the output `PetscSF`.
2272   Parameter `rootMode` controls how the input root spaces are combined.
2273   For `PETSCSF_CONCATENATE_ROOTMODE_SHARED`, the root space is considered the same for each input `PetscSF` (checked in debug mode)
2274   and is also the same in the output `PetscSF`.
2275   For `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, the input root spaces are taken as separate and joined.
2276   `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` joins the root spaces locally;
2277   roots of sfs[0], sfs[1], sfs[2], ... are joined on each rank separately, ordered by input `PetscSF` and original local index, and renumbered contiguously.
2278   `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL` joins the root spaces globally;
2279   roots of sfs[0], sfs[1], sfs[2], ... are joined globally, ordered by input `PetscSF` index and original global index, and renumbered contiguously;
2280   the original root ranks are ignored.
2281   For both `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`,
2282   the output `PetscSF`'s root layout is such that the local number of roots is a sum of the input `PetscSF`'s local numbers of roots on each rank
2283   to keep the load balancing.
2284   However, for `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, roots can move to different ranks.
2285 
2286    Example:
2287    We can use src/vec/is/sf/tests/ex18.c to compare the root modes. By running
2288 .vb
2289   make -C $PETSC_DIR/src/vec/is/sf/tests ex18
2290   for m in {local,global,shared}; do
2291     mpirun -n 2 $PETSC_DIR/src/vec/is/sf/tests/ex18 -nsfs 2 -n 2 -root_mode $m -sf_view
2292   done
2293 .ve
2294    we generate two identical `PetscSF`s sf_0 and sf_1,
2295 .vb
2296   PetscSF Object: sf_0 2 MPI processes
2297     type: basic
2298     rank #leaves #roots
2299     [ 0]       4      2
2300     [ 1]       4      2
2301     leaves      roots       roots in global numbering
2302     ( 0,  0) <- ( 0,  0)  =   0
2303     ( 0,  1) <- ( 0,  1)  =   1
2304     ( 0,  2) <- ( 1,  0)  =   2
2305     ( 0,  3) <- ( 1,  1)  =   3
2306     ( 1,  0) <- ( 0,  0)  =   0
2307     ( 1,  1) <- ( 0,  1)  =   1
2308     ( 1,  2) <- ( 1,  0)  =   2
2309     ( 1,  3) <- ( 1,  1)  =   3
2310 .ve
2311    and pass them to `PetscSFConcatenate()` along with different choices of `rootMode`, yielding different result_sf:
2312 .vb
2313   rootMode = local:
2314   PetscSF Object: result_sf 2 MPI processes
2315     type: basic
2316     rank #leaves #roots
2317     [ 0]       8      4
2318     [ 1]       8      4
2319     leaves      roots       roots in global numbering
2320     ( 0,  0) <- ( 0,  0)  =   0
2321     ( 0,  1) <- ( 0,  1)  =   1
2322     ( 0,  2) <- ( 1,  0)  =   4
2323     ( 0,  3) <- ( 1,  1)  =   5
2324     ( 0,  4) <- ( 0,  2)  =   2
2325     ( 0,  5) <- ( 0,  3)  =   3
2326     ( 0,  6) <- ( 1,  2)  =   6
2327     ( 0,  7) <- ( 1,  3)  =   7
2328     ( 1,  0) <- ( 0,  0)  =   0
2329     ( 1,  1) <- ( 0,  1)  =   1
2330     ( 1,  2) <- ( 1,  0)  =   4
2331     ( 1,  3) <- ( 1,  1)  =   5
2332     ( 1,  4) <- ( 0,  2)  =   2
2333     ( 1,  5) <- ( 0,  3)  =   3
2334     ( 1,  6) <- ( 1,  2)  =   6
2335     ( 1,  7) <- ( 1,  3)  =   7
2336 
2337   rootMode = global:
2338   PetscSF Object: result_sf 2 MPI processes
2339     type: basic
2340     rank #leaves #roots
2341     [ 0]       8      4
2342     [ 1]       8      4
2343     leaves      roots       roots in global numbering
2344     ( 0,  0) <- ( 0,  0)  =   0
2345     ( 0,  1) <- ( 0,  1)  =   1
2346     ( 0,  2) <- ( 0,  2)  =   2
2347     ( 0,  3) <- ( 0,  3)  =   3
2348     ( 0,  4) <- ( 1,  0)  =   4
2349     ( 0,  5) <- ( 1,  1)  =   5
2350     ( 0,  6) <- ( 1,  2)  =   6
2351     ( 0,  7) <- ( 1,  3)  =   7
2352     ( 1,  0) <- ( 0,  0)  =   0
2353     ( 1,  1) <- ( 0,  1)  =   1
2354     ( 1,  2) <- ( 0,  2)  =   2
2355     ( 1,  3) <- ( 0,  3)  =   3
2356     ( 1,  4) <- ( 1,  0)  =   4
2357     ( 1,  5) <- ( 1,  1)  =   5
2358     ( 1,  6) <- ( 1,  2)  =   6
2359     ( 1,  7) <- ( 1,  3)  =   7
2360 
2361   rootMode = shared:
2362   PetscSF Object: result_sf 2 MPI processes
2363     type: basic
2364     rank #leaves #roots
2365     [ 0]       8      2
2366     [ 1]       8      2
2367     leaves      roots       roots in global numbering
2368     ( 0,  0) <- ( 0,  0)  =   0
2369     ( 0,  1) <- ( 0,  1)  =   1
2370     ( 0,  2) <- ( 1,  0)  =   2
2371     ( 0,  3) <- ( 1,  1)  =   3
2372     ( 0,  4) <- ( 0,  0)  =   0
2373     ( 0,  5) <- ( 0,  1)  =   1
2374     ( 0,  6) <- ( 1,  0)  =   2
2375     ( 0,  7) <- ( 1,  1)  =   3
2376     ( 1,  0) <- ( 0,  0)  =   0
2377     ( 1,  1) <- ( 0,  1)  =   1
2378     ( 1,  2) <- ( 1,  0)  =   2
2379     ( 1,  3) <- ( 1,  1)  =   3
2380     ( 1,  4) <- ( 0,  0)  =   0
2381     ( 1,  5) <- ( 0,  1)  =   1
2382     ( 1,  6) <- ( 1,  0)  =   2
2383     ( 1,  7) <- ( 1,  1)  =   3
2384 .ve
2385 
2386 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFConcatenateRootMode`
2387 @*/
2388 PetscErrorCode PetscSFConcatenate(MPI_Comm comm, PetscInt nsfs, PetscSF sfs[], PetscSFConcatenateRootMode rootMode, PetscInt leafOffsets[], PetscSF *newsf)
2389 {
2390   PetscInt     i, s, nLeaves, nRoots;
2391   PetscInt    *leafArrayOffsets;
2392   PetscInt    *ilocal_new;
2393   PetscSFNode *iremote_new;
2394   PetscBool    all_ilocal_null = PETSC_FALSE;
2395   PetscLayout  glayout         = NULL;
2396   PetscInt    *gremote         = NULL;
2397   PetscMPIInt  rank, size;
2398 
2399   PetscFunctionBegin;
2400   if (PetscDefined(USE_DEBUG)) {
2401     PetscSF dummy; /* just to have a PetscObject on comm for input validation */
2402 
2403     PetscCall(PetscSFCreate(comm, &dummy));
2404     PetscValidLogicalCollectiveInt(dummy, nsfs, 2);
2405     PetscValidPointer(sfs, 3);
2406     for (i = 0; i < nsfs; i++) {
2407       PetscValidHeaderSpecific(sfs[i], PETSCSF_CLASSID, 3);
2408       PetscCheckSameComm(dummy, 1, sfs[i], 3);
2409     }
2410     PetscValidLogicalCollectiveEnum(dummy, rootMode, 4);
2411     if (leafOffsets) PetscValidIntPointer(leafOffsets, 5);
2412     PetscValidPointer(newsf, 6);
2413     PetscCall(PetscSFDestroy(&dummy));
2414   }
2415   if (!nsfs) {
2416     PetscCall(PetscSFCreate(comm, newsf));
2417     PetscCall(PetscSFSetGraph(*newsf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
2418     PetscFunctionReturn(PETSC_SUCCESS);
2419   }
2420   PetscCallMPI(MPI_Comm_rank(comm, &rank));
2421   PetscCallMPI(MPI_Comm_size(comm, &size));
2422 
2423   /* Calculate leaf array offsets */
2424   PetscCall(PetscMalloc1(nsfs + 1, &leafArrayOffsets));
2425   leafArrayOffsets[0] = 0;
2426   for (s = 0; s < nsfs; s++) {
2427     PetscInt nl;
2428 
2429     PetscCall(PetscSFGetGraph(sfs[s], NULL, &nl, NULL, NULL));
2430     leafArrayOffsets[s + 1] = leafArrayOffsets[s] + nl;
2431   }
2432   nLeaves = leafArrayOffsets[nsfs];
2433 
2434   /* Calculate number of roots */
2435   switch (rootMode) {
2436   case PETSCSF_CONCATENATE_ROOTMODE_SHARED: {
2437     PetscCall(PetscSFGetGraph(sfs[0], &nRoots, NULL, NULL, NULL));
2438     if (PetscDefined(USE_DEBUG)) {
2439       for (s = 1; s < nsfs; s++) {
2440         PetscInt nr;
2441 
2442         PetscCall(PetscSFGetGraph(sfs[s], &nr, NULL, NULL, NULL));
2443         PetscCheck(nr == nRoots, comm, PETSC_ERR_ARG_SIZ, "rootMode = %s but sfs[%" PetscInt_FMT "] has a different number of roots (%" PetscInt_FMT ") than sfs[0] (%" PetscInt_FMT ")", PetscSFConcatenateRootModes[rootMode], s, nr, nRoots);
2444       }
2445     }
2446   } break;
2447   case PETSCSF_CONCATENATE_ROOTMODE_GLOBAL: {
2448     /* Calculate also global layout in this case */
2449     PetscInt    *nls;
2450     PetscLayout *lts;
2451     PetscInt   **inds;
2452     PetscInt     j;
2453     PetscInt     rootOffset = 0;
2454 
2455     PetscCall(PetscCalloc3(nsfs, &lts, nsfs, &nls, nsfs, &inds));
2456     PetscCall(PetscLayoutCreate(comm, &glayout));
2457     glayout->bs = 1;
2458     glayout->n  = 0;
2459     glayout->N  = 0;
2460     for (s = 0; s < nsfs; s++) {
2461       PetscCall(PetscSFGetGraphLayout(sfs[s], &lts[s], &nls[s], NULL, &inds[s]));
2462       glayout->n += lts[s]->n;
2463       glayout->N += lts[s]->N;
2464     }
2465     PetscCall(PetscLayoutSetUp(glayout));
2466     PetscCall(PetscMalloc1(nLeaves, &gremote));
2467     for (s = 0, j = 0; s < nsfs; s++) {
2468       for (i = 0; i < nls[s]; i++, j++) gremote[j] = inds[s][i] + rootOffset;
2469       rootOffset += lts[s]->N;
2470       PetscCall(PetscLayoutDestroy(&lts[s]));
2471       PetscCall(PetscFree(inds[s]));
2472     }
2473     PetscCall(PetscFree3(lts, nls, inds));
2474     nRoots = glayout->N;
2475   } break;
2476   case PETSCSF_CONCATENATE_ROOTMODE_LOCAL:
2477     /* nRoots calculated later in this case */
2478     break;
2479   default:
2480     SETERRQ(comm, PETSC_ERR_ARG_WRONG, "Invalid PetscSFConcatenateRootMode %d", rootMode);
2481   }
2482 
2483   if (!leafOffsets) {
2484     all_ilocal_null = PETSC_TRUE;
2485     for (s = 0; s < nsfs; s++) {
2486       const PetscInt *ilocal;
2487 
2488       PetscCall(PetscSFGetGraph(sfs[s], NULL, NULL, &ilocal, NULL));
2489       if (ilocal) {
2490         all_ilocal_null = PETSC_FALSE;
2491         break;
2492       }
2493     }
2494     PetscCheck(all_ilocal_null, PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "leafOffsets can be passed as NULL only if all SFs have ilocal = NULL");
2495   }
2496 
2497   /* Renumber and concatenate local leaves */
2498   ilocal_new = NULL;
2499   if (!all_ilocal_null) {
2500     PetscCall(PetscMalloc1(nLeaves, &ilocal_new));
2501     for (i = 0; i < nLeaves; i++) ilocal_new[i] = -1;
2502     for (s = 0; s < nsfs; s++) {
2503       const PetscInt *ilocal;
2504       PetscInt       *ilocal_l = &ilocal_new[leafArrayOffsets[s]];
2505       PetscInt        i, nleaves_l;
2506 
2507       PetscCall(PetscSFGetGraph(sfs[s], NULL, &nleaves_l, &ilocal, NULL));
2508       for (i = 0; i < nleaves_l; i++) ilocal_l[i] = (ilocal ? ilocal[i] : i) + leafOffsets[s];
2509     }
2510   }
2511 
2512   /* Renumber and concatenate remote roots */
2513   if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL || rootMode == PETSCSF_CONCATENATE_ROOTMODE_SHARED) {
2514     PetscInt rootOffset = 0;
2515 
2516     PetscCall(PetscMalloc1(nLeaves, &iremote_new));
2517     for (i = 0; i < nLeaves; i++) {
2518       iremote_new[i].rank  = -1;
2519       iremote_new[i].index = -1;
2520     }
2521     for (s = 0; s < nsfs; s++) {
2522       PetscInt           i, nl, nr;
2523       PetscSF            tmp_sf;
2524       const PetscSFNode *iremote;
2525       PetscSFNode       *tmp_rootdata;
2526       PetscSFNode       *tmp_leafdata = &iremote_new[leafArrayOffsets[s]];
2527 
2528       PetscCall(PetscSFGetGraph(sfs[s], &nr, &nl, NULL, &iremote));
2529       PetscCall(PetscSFCreate(comm, &tmp_sf));
2530       /* create helper SF with contiguous leaves */
2531       PetscCall(PetscSFSetGraph(tmp_sf, nr, nl, NULL, PETSC_USE_POINTER, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
2532       PetscCall(PetscSFSetUp(tmp_sf));
2533       PetscCall(PetscMalloc1(nr, &tmp_rootdata));
2534       if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) {
2535         for (i = 0; i < nr; i++) {
2536           tmp_rootdata[i].index = i + rootOffset;
2537           tmp_rootdata[i].rank  = (PetscInt)rank;
2538         }
2539         rootOffset += nr;
2540       } else {
2541         for (i = 0; i < nr; i++) {
2542           tmp_rootdata[i].index = i;
2543           tmp_rootdata[i].rank  = (PetscInt)rank;
2544         }
2545       }
2546       PetscCall(PetscSFBcastBegin(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
2547       PetscCall(PetscSFBcastEnd(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
2548       PetscCall(PetscSFDestroy(&tmp_sf));
2549       PetscCall(PetscFree(tmp_rootdata));
2550     }
2551     if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) nRoots = rootOffset; // else nRoots already calculated above
2552 
2553     /* Build the new SF */
2554     PetscCall(PetscSFCreate(comm, newsf));
2555     PetscCall(PetscSFSetGraph(*newsf, nRoots, nLeaves, ilocal_new, PETSC_OWN_POINTER, iremote_new, PETSC_OWN_POINTER));
2556   } else {
2557     /* Build the new SF */
2558     PetscCall(PetscSFCreate(comm, newsf));
2559     PetscCall(PetscSFSetGraphLayout(*newsf, glayout, nLeaves, ilocal_new, PETSC_OWN_POINTER, gremote));
2560   }
2561   PetscCall(PetscSFSetUp(*newsf));
2562   PetscCall(PetscSFViewFromOptions(*newsf, NULL, "-sf_concat_view"));
2563   PetscCall(PetscLayoutDestroy(&glayout));
2564   PetscCall(PetscFree(gremote));
2565   PetscCall(PetscFree(leafArrayOffsets));
2566   PetscFunctionReturn(PETSC_SUCCESS);
2567 }
2568