xref: /petsc/src/vec/is/sf/interface/sf.c (revision 9140fee14acbea959c6ed74f4836a1a89f708038)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petsc/private/viewerimpl.h>
4 #include <petsc/private/hashmapi.h>
5 
6 #if defined(PETSC_HAVE_CUDA)
7   #include <cuda_runtime.h>
8   #include <petscdevice_cuda.h>
9 #endif
10 
11 #if defined(PETSC_HAVE_HIP)
12   #include <hip/hip_runtime.h>
13 #endif
14 
15 #if defined(PETSC_CLANG_STATIC_ANALYZER)
16 extern void PetscSFCheckGraphSet(PetscSF, int);
17 #else
18   #if defined(PETSC_USE_DEBUG)
19     #define PetscSFCheckGraphSet(sf, arg) PetscCheck((sf)->graphset, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %d \"%s\" before %s()", (arg), #sf, PETSC_FUNCTION_NAME)
20   #else
21     #define PetscSFCheckGraphSet(sf, arg) \
22       do { \
23       } while (0)
24   #endif
25 #endif
26 
/* String tables used by PETSc's enum option/viewer machinery; the trailing entries are the
   enum type name and value prefix per the usual PETSc *Types[] convention, terminated by NULL. */
const char *const PetscSFDuplicateOptions[]     = {"CONFONLY", "RANKS", "GRAPH", "PetscSFDuplicateOption", "PETSCSF_DUPLICATE_", NULL};
const char *const PetscSFConcatenateRootModes[] = {"local", "shared", "global", "PetscSFConcatenateRootMode", "PETSCSF_CONCATENATE_ROOTMODE_", NULL};
29 
30 /*@
31   PetscSFCreate - create a star forest communication context
32 
33   Collective
34 
35   Input Parameter:
36 . comm - communicator on which the star forest will operate
37 
38   Output Parameter:
39 . sf - new star forest context
40 
41   Options Database Key:
42 + -sf_type basic                 - Use MPI persistent Isend/Irecv for communication (Default)
43 . -sf_type window                - Use MPI-3 one-sided window for communication
44 . -sf_type neighbor              - Use MPI-3 neighborhood collectives for communication
45 - -sf_neighbor_persistent <bool> - If true, use MPI-4 persistent neighborhood collectives for communication (used along with -sf_type neighbor)
46 
47   Level: intermediate
48 
49   Note:
50   When one knows the communication graph is one of the predefined graphs, such as `MPI_Alltoall()`, `MPI_Allgatherv()`,
51   `MPI_Gatherv()`, one can create a `PetscSF` and then set its graph with `PetscSFSetGraphWithPattern()`. These special
52   `SF`s are optimized and they have better performance than the general `SF`s.
53 
54 .seealso: `PetscSF`, `PetscSFSetType`, `PetscSFSetGraph()`, `PetscSFSetGraphWithPattern()`, `PetscSFDestroy()`
55 @*/
/* Create an empty PetscSF on comm; all graph-related fields start in the "unset" state. */
PetscErrorCode PetscSFCreate(MPI_Comm comm, PetscSF *sf)
{
  PetscSF b;

  PetscFunctionBegin;
  PetscAssertPointer(sf, 2);
  PetscCall(PetscSFInitializePackage());

  PetscCall(PetscHeaderCreate(b, PETSCSF_CLASSID, "PetscSF", "Star Forest", "PetscSF", comm, PetscSFDestroy, PetscSFView));

  /* Sentinels: -1 counts and inverted min/max leaf bounds mean "graph not set yet";
     PetscSFSetGraph()/PetscSFSetGraphWithPattern() fill in real values */
  b->nroots    = -1;
  b->nleaves   = -1;
  b->minleaf   = PETSC_MAX_INT;
  b->maxleaf   = PETSC_MIN_INT;
  b->nranks    = -1;
  b->rankorder = PETSC_TRUE; /* deterministic gathers by default; see PetscSFSetRankOrder() */
  b->ingroup   = MPI_GROUP_NULL;
  b->outgroup  = MPI_GROUP_NULL;
  b->graphset  = PETSC_FALSE;
#if defined(PETSC_HAVE_DEVICE)
  b->use_gpu_aware_mpi    = use_gpu_aware_mpi; /* inherits the library-wide flag (presumably set at PetscInitialize time -- confirm) */
  b->use_stream_aware_mpi = PETSC_FALSE;
  b->unknown_input_stream = PETSC_FALSE;
  #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
  b->backend = PETSCSF_BACKEND_KOKKOS;
  #elif defined(PETSC_HAVE_CUDA)
  b->backend = PETSCSF_BACKEND_CUDA;
  #elif defined(PETSC_HAVE_HIP)
  b->backend = PETSCSF_BACKEND_HIP;
  #endif

  #if defined(PETSC_HAVE_NVSHMEM)
  b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
  b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem", &b->use_nvshmem, NULL));
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_nvshmem_get", &b->use_nvshmem_get, NULL));
  #endif
#endif
  /* VecScatter bookkeeping: -1 sizes mean "not configured"; unit defaults to PetscScalar */
  b->vscat.from_n = -1;
  b->vscat.to_n   = -1;
  b->vscat.unit   = MPIU_SCALAR;
  *sf             = b;
  PetscFunctionReturn(PETSC_SUCCESS);
}
100 
101 /*@
102   PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
103 
104   Collective
105 
106   Input Parameter:
107 . sf - star forest
108 
109   Level: advanced
110 
111 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetGraph()`, `PetscSFDestroy()`
112 @*/
/* Free all graph-dependent state so the PetscSF can be given a new graph; the type is kept. */
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscTryTypeMethod(sf, Reset); /* let the implementation release its own data first */
  /* Restore the "graph not set" sentinels used by PetscSFCreate() */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  /* mine/remote above may alias mine_alloc/remote_alloc; only the _alloc pointers are owned */
  PetscCall(PetscFree(sf->mine_alloc));
  PetscCall(PetscFree(sf->remote_alloc));
  sf->nranks = -1;
  PetscCall(PetscFree4(sf->ranks, sf->roffset, sf->rmine, sf->rremote));
  sf->degreeknown = PETSC_FALSE;
  PetscCall(PetscFree(sf->degree));
  if (sf->ingroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->ingroup));
  if (sf->outgroup != MPI_GROUP_NULL) PetscCallMPI(MPI_Group_free(&sf->outgroup));
  /* Break the back-reference before destroying the multi-SF to avoid touching freed memory */
  if (sf->multi) sf->multi->multi = NULL;
  PetscCall(PetscSFDestroy(&sf->multi));
  PetscCall(PetscLayoutDestroy(&sf->map));

#if defined(PETSC_HAVE_DEVICE)
  /* Device-side copies of leaf indices (one per direction, presumably -- confirm against sfimpl.h) */
  for (PetscInt i = 0; i < 2; i++) PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, sf->rmine_d[i]));
#endif

  sf->setupcalled = PETSC_FALSE; /* force PetscSFSetUp() to run again for the next graph */
  PetscFunctionReturn(PETSC_SUCCESS);
}
144 
145 /*@C
146   PetscSFSetType - Set the `PetscSF` communication implementation
147 
148   Collective
149 
150   Input Parameters:
151 + sf   - the `PetscSF` context
152 - type - a known method
153 .vb
154     PETSCSFWINDOW - MPI-2/3 one-sided
155     PETSCSFBASIC - basic implementation using MPI-1 two-sided
156 .ve
157 
158   Options Database Key:
159 . -sf_type <type> - Sets the method; for example `basic` or `window` use -help for a list of available methods
160 
161   Level: intermediate
162 
163   Notes:
164   See `PetscSFType` for possible values
165 
166 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`
167 @*/
168 PetscErrorCode PetscSFSetType(PetscSF sf, PetscSFType type)
169 {
170   PetscBool match;
171   PetscErrorCode (*r)(PetscSF);
172 
173   PetscFunctionBegin;
174   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
175   PetscAssertPointer(type, 2);
176 
177   PetscCall(PetscObjectTypeCompare((PetscObject)sf, type, &match));
178   if (match) PetscFunctionReturn(PETSC_SUCCESS);
179 
180   PetscCall(PetscFunctionListFind(PetscSFList, type, &r));
181   PetscCheck(r, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_UNKNOWN_TYPE, "Unable to find requested PetscSF type %s", type);
182   /* Destroy the previous PetscSF implementation context */
183   PetscTryTypeMethod(sf, Destroy);
184   PetscCall(PetscMemzero(sf->ops, sizeof(*sf->ops)));
185   PetscCall(PetscObjectChangeTypeName((PetscObject)sf, type));
186   PetscCall((*r)(sf));
187   PetscFunctionReturn(PETSC_SUCCESS);
188 }
189 
190 /*@C
191   PetscSFGetType - Get the `PetscSF` communication implementation
192 
193   Not Collective
194 
195   Input Parameter:
196 . sf - the `PetscSF` context
197 
198   Output Parameter:
199 . type - the `PetscSF` type name
200 
201   Level: intermediate
202 
203 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetType()`, `PetscSFCreate()`
204 @*/
/* Return the type name string stored on the object header (NULL if no type set yet). */
PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(type, 2);
  *type = ((PetscObject)sf)->type_name; /* borrowed pointer; caller must not free it */
  PetscFunctionReturn(PETSC_SUCCESS);
}
213 
214 /*@C
215   PetscSFDestroy - destroy a star forest
216 
217   Collective
218 
219   Input Parameter:
220 . sf - address of star forest
221 
222   Level: intermediate
223 
224 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFReset()`
225 @*/
/* Drop one reference; the last reference frees all resources. *sf is set to NULL either way. */
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(PETSC_SUCCESS); /* destroying NULL is a no-op, like free(NULL) */
  PetscValidHeaderSpecific((*sf), PETSCSF_CLASSID, 1);
  /* Reference counting: only the last holder performs the actual teardown */
  if (--((PetscObject)(*sf))->refct > 0) {
    *sf = NULL;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCall(PetscSFReset(*sf));
  PetscTryTypeMethod((*sf), Destroy);
  PetscCall(PetscSFDestroy(&(*sf)->vscat.lsf)); /* lsf is built on demand (see PetscSFDuplicate()) */
  /* bs > 1 means vscat.unit was MPI_Type_dup'ed (see PetscSFDuplicate()) and is owned here */
  if ((*sf)->vscat.bs > 1) PetscCallMPI(MPI_Type_free(&(*sf)->vscat.unit));
#if defined(PETSC_HAVE_MPIX_STREAM)
  /* Release the stream-aware MPI resources created in PetscSFSetFromOptions() */
  if ((*sf)->use_stream_aware_mpi) {
    PetscCallMPI(MPIX_Stream_free(&(*sf)->mpi_stream));
    PetscCallMPI(MPI_Comm_free(&(*sf)->stream_comm));
  }
#endif
  PetscCall(PetscHeaderDestroy(sf)); /* frees the object and sets *sf = NULL */
  PetscFunctionReturn(PETSC_SUCCESS);
}
248 
/* Debug-build-only validation of the graph: every edge's remote rank must lie in [0, size),
   every remote root index and local leaf location must be nonnegative. No-op otherwise. */
static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
{
  PetscInt           i, nleaves;
  PetscMPIInt        size;
  const PetscInt    *ilocal;
  const PetscSFNode *iremote;

  PetscFunctionBegin;
  if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, &iremote));
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
  for (i = 0; i < nleaves; i++) {
    const PetscInt rank   = iremote[i].rank;
    const PetscInt remote = iremote[i].index;
    const PetscInt leaf   = ilocal ? ilocal[i] : i; /* NULL ilocal means contiguous leaves [0, nleaves) */
    PetscCheck(rank >= 0 && rank < size, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided rank (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be in [0, %d)", rank, i, size);
    PetscCheck(remote >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided index (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be >= 0", remote, i);
    PetscCheck(leaf >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided location (%" PetscInt_FMT ") for leaf %" PetscInt_FMT " is invalid, should be >= 0", leaf, i);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
270 
271 /*@
272   PetscSFSetUp - set up communication structures for a `PetscSF`, after this is done it may be used to perform communication
273 
274   Collective
275 
276   Input Parameter:
277 . sf - star forest communication object
278 
279   Level: beginner
280 
281 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetFromOptions()`, `PetscSFSetType()`
282 @*/
/* Build the communication data structures; runs once (PetscSFReset() clears setupcalled). */
PetscErrorCode PetscSFSetUp(PetscSF sf)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  if (sf->setupcalled) PetscFunctionReturn(PETSC_SUCCESS); /* idempotent */
  PetscCall(PetscLogEventBegin(PETSCSF_SetUp, sf, 0, 0, 0));
  PetscCall(PetscSFCheckGraphValid_Private(sf)); /* no-op unless PETSc was configured with debugging */
  if (!((PetscObject)sf)->type_name) PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); /* Zero all sf->ops */
  PetscTryTypeMethod(sf, SetUp);
  /* Install backend-specific device memory hooks matching the backend chosen in
     PetscSFCreate()/PetscSFSetFromOptions() */
#if defined(PETSC_HAVE_CUDA)
  if (sf->backend == PETSCSF_BACKEND_CUDA) {
    sf->ops->Malloc = PetscSFMalloc_CUDA;
    sf->ops->Free   = PetscSFFree_CUDA;
  }
#endif
#if defined(PETSC_HAVE_HIP)
  if (sf->backend == PETSCSF_BACKEND_HIP) {
    sf->ops->Malloc = PetscSFMalloc_HIP;
    sf->ops->Free   = PetscSFFree_HIP;
  }
#endif

#if defined(PETSC_HAVE_KOKKOS)
  if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
    sf->ops->Malloc = PetscSFMalloc_Kokkos;
    sf->ops->Free   = PetscSFFree_Kokkos;
  }
#endif
  PetscCall(PetscLogEventEnd(PETSCSF_SetUp, sf, 0, 0, 0));
  sf->setupcalled = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
316 
317 /*@
318   PetscSFSetFromOptions - set `PetscSF` options using the options database
319 
320   Logically Collective
321 
322   Input Parameter:
323 . sf - star forest
324 
325   Options Database Keys:
326 + -sf_type                                                                                                         - implementation type, see `PetscSFSetType()`
327 . -sf_rank_order                                                                                                   - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
328 . -sf_use_default_stream                                                                                           - Assume callers of `PetscSF` computed the input root/leafdata with the default CUDA stream. `PetscSF` will also
329                             use the default stream to process data. Therefore, no stream synchronization is needed between `PetscSF` and its caller (default: true).
330                             If true, this option only works with `-use_gpu_aware_mpi 1`.
331 . -sf_use_stream_aware_mpi                                                                                         - Assume the underlying MPI is CUDA-stream aware and `PetscSF` won't sync streams for send/recv buffers passed to MPI (default: false).
332                                If true, this option only works with `-use_gpu_aware_mpi 1`.
333 
334 - -sf_backend cuda | hip | kokkos - Select the device backend SF uses. Currently `PetscSF` has these backends: cuda, hip and kokkos.
335                               On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
336                               the only backend available is kokkos.
337 
338   Level: intermediate
339 
340 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetType()`
341 @*/
/* Process the options database for this PetscSF: type, rank ordering, and device/stream settings. */
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType deft;
  char        type[256];
  PetscBool   flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscObjectOptionsBegin((PetscObject)sf);
  /* Default to the already-set type if any, otherwise to the basic implementation */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  PetscCall(PetscOptionsFList("-sf_type", "PetscSF implementation type", "PetscSFSetType", PetscSFList, deft, type, sizeof(type), &flg));
  PetscCall(PetscSFSetType(sf, flg ? type : deft));
  PetscCall(PetscOptionsBool("-sf_rank_order", "sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise", "PetscSFSetRankOrder", sf->rankorder, &sf->rankorder, NULL));
#if defined(PETSC_HAVE_DEVICE)
  {
    char      backendstr[32] = {0};
    PetscBool isCuda = PETSC_FALSE, isHip = PETSC_FALSE, isKokkos = PETSC_FALSE, set;
    /* Change the defaults set in PetscSFCreate() with command line options */
    PetscCall(PetscOptionsBool("-sf_unknown_input_stream", "SF root/leafdata is computed on arbitrary streams unknown to SF", "PetscSFSetFromOptions", sf->unknown_input_stream, &sf->unknown_input_stream, NULL));
    PetscCall(PetscOptionsBool("-sf_use_stream_aware_mpi", "Assume the underlying MPI is cuda-stream aware", "PetscSFSetFromOptions", sf->use_stream_aware_mpi, &sf->use_stream_aware_mpi, NULL));
    PetscCall(PetscOptionsString("-sf_backend", "Select the device backend SF uses", "PetscSFSetFromOptions", NULL, backendstr, sizeof(backendstr), &set));
    /* Case-insensitive match of the requested backend name */
    PetscCall(PetscStrcasecmp("cuda", backendstr, &isCuda));
    PetscCall(PetscStrcasecmp("kokkos", backendstr, &isKokkos));
    PetscCall(PetscStrcasecmp("hip", backendstr, &isHip));
  #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
    if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
    else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
    else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
    else PetscCheck(!set, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
  #elif defined(PETSC_HAVE_KOKKOS)
    /* Kokkos-only build: either no -sf_backend option, or it must say kokkos */
    PetscCheck(!set || isKokkos, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "-sf_backend %s is not supported. You can only choose kokkos", backendstr);
  #endif

  #if defined(PETSC_HAVE_CUDA) && defined(PETSC_HAVE_MPIX_STREAM)
    if (sf->use_stream_aware_mpi) {
      MPI_Info info;

      /* Bind PETSc's default CUDA stream to an MPIX stream and derive a stream-attached
         communicator; both are freed in PetscSFDestroy() */
      PetscCallMPI(MPI_Info_create(&info));
      PetscCallMPI(MPI_Info_set(info, "type", "cudaStream_t"));
      PetscCallMPI(MPIX_Info_set_hex(info, "value", &PetscDefaultCudaStream, sizeof(PetscDefaultCudaStream)));
      PetscCallMPI(MPIX_Stream_create(info, &sf->mpi_stream));
      PetscCallMPI(MPI_Info_free(&info));
      PetscCallMPI(MPIX_Stream_comm_create(PetscObjectComm((PetscObject)sf), sf->mpi_stream, &sf->stream_comm));
    }
  #endif
  }
#endif
  PetscTryTypeMethod(sf, SetFromOptions, PetscOptionsObject);
  PetscOptionsEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
393 
394 /*@
395   PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
396 
397   Logically Collective
398 
399   Input Parameters:
400 + sf  - star forest
401 - flg - `PETSC_TRUE` to sort, `PETSC_FALSE` to skip sorting (lower setup cost, but non-deterministic)
402 
403   Level: advanced
404 
405 .seealso: `PetscSF`, `PetscSFType`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`
406 @*/
/* Set whether multi-points are sorted by rank; must happen before the multi-SF is built. */
PetscErrorCode PetscSFSetRankOrder(PetscSF sf, PetscBool flg)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveBool(sf, flg, 2);
  /* Once sf->multi exists (created by gather/scatter), the ordering can no longer change */
  PetscCheck(!sf->multi, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(PETSC_SUCCESS);
}
416 
417 /*@C
418   PetscSFSetGraph - Set a parallel star forest
419 
420   Collective
421 
422   Input Parameters:
423 + sf         - star forest
424 . nroots     - number of root vertices on the current process (these are possible targets for other process to attach leaves)
425 . nleaves    - number of leaf vertices on the current process, each of these references a root on any process
426 . ilocal     - locations of leaves in leafdata buffers, pass `NULL` for contiguous storage (locations must be >= 0, enforced
427 during setup in debug mode)
428 . localmode  - copy mode for `ilocal`
429 . iremote    - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
430 during setup in debug mode)
431 - remotemode - copy mode for `iremote`
432 
433   Level: intermediate
434 
435   Notes:
436   Leaf indices in `ilocal` must be unique, otherwise an error occurs.
437 
438   Input arrays `ilocal` and `iremote` follow the `PetscCopyMode` semantics.
439   In particular, if `localmode` or `remotemode` is `PETSC_OWN_POINTER` or `PETSC_USE_POINTER`,
440   PETSc might modify the respective array;
441   if `PETSC_USE_POINTER`, the user must delete the array after `PetscSFDestroy()`.
442   Only if `PETSC_COPY_VALUES` is used, the respective array is guaranteed to stay intact and a const array can be passed (but a cast to non-const is needed).
443 
444   Fortran Notes:
445   In Fortran you must use `PETSC_COPY_VALUES` for `localmode` and `remotemode`.
446 
447   Developer Notes:
448   We sort leaves to check for duplicates and contiguousness and to find minleaf/maxleaf.
449   This also allows to compare leaf sets of two `PetscSF`s easily.
450 
451 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
452 @*/
/* Install the (nroots, nleaves, ilocal, iremote) graph, honoring PetscCopyMode ownership. */
PetscErrorCode PetscSFSetGraph(PetscSF sf, PetscInt nroots, PetscInt nleaves, PetscInt *ilocal, PetscCopyMode localmode, PetscSFNode *iremote, PetscCopyMode remotemode)
{
  PetscBool unique, contiguous;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  if (nleaves > 0 && ilocal) PetscAssertPointer(ilocal, 4);
  if (nleaves > 0) PetscAssertPointer(iremote, 6);
  PetscCheck(nroots >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nroots %" PetscInt_FMT ", cannot be negative", nroots);
  PetscCheck(nleaves >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nleaves %" PetscInt_FMT ", cannot be negative", nleaves);
  /* enums may be handled as unsigned by some compilers, NVHPC for example, the int cast
   * below is to prevent NVHPC from warning about meaningless comparison of unsigned with zero */
  PetscCheck((int)localmode >= PETSC_COPY_VALUES && localmode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong localmode %d", localmode);
  PetscCheck((int)remotemode >= PETSC_COPY_VALUES && remotemode <= PETSC_USE_POINTER, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong remotemode %d", remotemode);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    PetscCall(PetscSFReset(sf));
  }

  PetscCall(PetscLogEventBegin(PETSCSF_SetGraph, sf, 0, 0, 0));

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  /* With PETSC_COPY_VALUES we work on private copies, leaving the caller's arrays untouched */
  if (localmode == PETSC_COPY_VALUES && ilocal) {
    PetscInt *tlocal = NULL;

    PetscCall(PetscMalloc1(nleaves, &tlocal));
    PetscCall(PetscArraycpy(tlocal, ilocal, nleaves));
    ilocal = tlocal;
  }
  if (remotemode == PETSC_COPY_VALUES) {
    PetscSFNode *tremote = NULL;

    PetscCall(PetscMalloc1(nleaves, &tremote));
    PetscCall(PetscArraycpy(tremote, iremote, nleaves));
    iremote = tremote;
  }

  if (nleaves && ilocal) {
    PetscSFNode work;

    /* Sort leaves by location (dragging iremote along) to get min/max leaf, detect
       duplicates, and test for contiguity; this modifies the arrays in the
       OWN_POINTER/USE_POINTER modes, as the function documentation warns */
    PetscCall(PetscSortIntWithDataArray(nleaves, ilocal, iremote, sizeof(PetscSFNode), &work));
    PetscCall(PetscSortedCheckDupsInt(nleaves, ilocal, &unique));
    unique = PetscNot(unique); /* flip "has dups" into "is unique" */
    PetscCheck(sf->allow_multi_leaves || unique, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Input ilocal has duplicate entries which is not allowed for this PetscSF");
    sf->minleaf = ilocal[0];
    sf->maxleaf = ilocal[nleaves - 1];
    contiguous  = (PetscBool)(unique && ilocal[0] == 0 && ilocal[nleaves - 1] == nleaves - 1);
  } else {
    /* NULL ilocal already means leaves are contiguous [0, nleaves) */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
    unique      = PETSC_TRUE;
    contiguous  = PETSC_TRUE;
  }

  /* A contiguous identity map needs no ilocal array at all; drop or free it */
  if (contiguous) {
    if (localmode == PETSC_USE_POINTER) {
      ilocal = NULL;
    } else {
      PetscCall(PetscFree(ilocal));
    }
  }
  sf->mine = ilocal;
  /* The _alloc fields record ownership: NULL when the caller retains it (PETSC_USE_POINTER) */
  if (localmode == PETSC_USE_POINTER) {
    sf->mine_alloc = NULL;
  } else {
    sf->mine_alloc = ilocal;
  }
  sf->remote = iremote;
  if (remotemode == PETSC_USE_POINTER) {
    sf->remote_alloc = NULL;
  } else {
    sf->remote_alloc = iremote;
  }
  PetscCall(PetscLogEventEnd(PETSCSF_SetGraph, sf, 0, 0, 0));
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
532 
533 /*@
534   PetscSFSetGraphWithPattern - Sets the graph of a `PetscSF` with a specific pattern
535 
536   Collective
537 
538   Input Parameters:
539 + sf      - The `PetscSF`
540 . map     - Layout of roots over all processes (insignificant when pattern is `PETSCSF_PATTERN_ALLTOALL`)
541 - pattern - One of `PETSCSF_PATTERN_ALLGATHER`, `PETSCSF_PATTERN_GATHER`, `PETSCSF_PATTERN_ALLTOALL`
542 
543   Level: intermediate
544 
545   Notes:
546   It is easier to explain `PetscSFPattern` using vectors. Suppose we have an MPI vector `x` and its `PetscLayout` is `map`.
547   `n` and `N` are the local and global sizes of `x` respectively.
548 
549   With `PETSCSF_PATTERN_ALLGATHER`, the routine creates a graph that if one does `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on it, it will copy `x` to
550   sequential vectors `y` on all MPI processes.
551 
552   With `PETSCSF_PATTERN_GATHER`, the routine creates a graph that if one does `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on it, it will copy `x` to a
553   sequential vector `y` on rank 0.
554 
555   In above cases, entries of `x` are roots and entries of `y` are leaves.
556 
557   With `PETSCSF_PATTERN_ALLTOALL`, map is insignificant. Suppose NP is size of `sf`'s communicator. The routine
558   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
559   of rank j. Here 0 <=i,j<NP. It is a kind of `MPI_Alltoall()` with sendcount/recvcount being 1. Note that it does
560 not mean one cannot send multiple items. One just needs to create a new MPI datatype for the multiple data
561   items with `MPI_Type_contiguous` and use that as the <unit> argument in SF routines.
562 
563   In this case, roots and leaves are symmetric.
564 
565 .seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFGetGraph()`
566  @*/
/* Configure the PetscSF as one of the optimized patterned graphs (allgather/gather/alltoall). */
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf, PetscLayout map, PetscSFPattern pattern)
{
  MPI_Comm    comm;
  PetscInt    n, N, res[2];
  PetscMPIInt rank, size;
  PetscSFType type;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  if (pattern != PETSCSF_PATTERN_ALLTOALL) PetscAssertPointer(map, 2); /* map is ignored for alltoall */
  PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
  PetscCheck(pattern >= PETSCSF_PATTERN_ALLGATHER && pattern <= PETSCSF_PATTERN_ALLTOALL, comm, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported PetscSFPattern %d", pattern);
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    type = PETSCSFALLTOALL;
    /* Build an internal layout with `size` roots per rank; the user map plays no role here */
    PetscCall(PetscLayoutCreate(comm, &sf->map));
    PetscCall(PetscLayoutSetLocalSize(sf->map, size));
    PetscCall(PetscLayoutSetSize(sf->map, ((PetscInt)size) * size)); /* cast so the product is computed in PetscInt */
    PetscCall(PetscLayoutSetUp(sf->map));
  } else {
    PetscCall(PetscLayoutGetLocalSize(map, &n));
    PetscCall(PetscLayoutGetSize(map, &N));
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    PetscCall(MPIU_Allreduce(MPI_IN_PLACE, res, 2, MPIU_INT, MPI_MAX, comm));
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER : PETSCSFGATHER;
    } else {
      /* Unequal local sizes need the "v" (varying-count) variants */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    PetscCall(PetscLayoutReference(map, &sf->map));
  }
  PetscCall(PetscSFSetType(sf, type));

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    sf->nleaves = N;
    sf->nroots  = n;
    sf->nranks  = size;
    sf->minleaf = 0;
    sf->maxleaf = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 receives data, hence only rank 0 has leaves */
    sf->nleaves = rank ? 0 : N;
    sf->nroots  = n;
    sf->nranks  = rank ? 0 : size;
    sf->minleaf = 0;
    sf->maxleaf = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* Every rank has one leaf and one root per peer */
    sf->nleaves = size;
    sf->nroots  = size;
    sf->nranks  = size;
    sf->minleaf = 0;
    sf->maxleaf = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
633 
634 /*@
635   PetscSFCreateInverseSF - given a `PetscSF` in which all vertices have degree 1, creates the inverse map
636 
637   Collective
638 
639   Input Parameter:
640 . sf - star forest to invert
641 
642   Output Parameter:
643 . isf - inverse of `sf`
644 
645   Level: advanced
646 
647   Notes:
648   All roots must have degree 1.
649 
650   The local space may be a permutation, but cannot be sparse.
651 
652 .seealso: `PetscSF`, `PetscSFType`, `PetscSFSetGraph()`
653 @*/
654 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf, PetscSF *isf)
655 {
656   PetscMPIInt     rank;
657   PetscInt        i, nroots, nleaves, maxlocal, count, *newilocal;
658   const PetscInt *ilocal;
659   PetscSFNode    *roots, *leaves;
660 
661   PetscFunctionBegin;
662   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
663   PetscSFCheckGraphSet(sf, 1);
664   PetscAssertPointer(isf, 2);
665 
666   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, NULL));
667   maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
668 
669   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
670   PetscCall(PetscMalloc2(nroots, &roots, maxlocal, &leaves));
671   for (i = 0; i < maxlocal; i++) {
672     leaves[i].rank  = rank;
673     leaves[i].index = i;
674   }
675   for (i = 0; i < nroots; i++) {
676     roots[i].rank  = -1;
677     roots[i].index = -1;
678   }
679   PetscCall(PetscSFReduceBegin(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
680   PetscCall(PetscSFReduceEnd(sf, MPIU_2INT, leaves, roots, MPI_REPLACE));
681 
682   /* Check whether our leaves are sparse */
683   for (i = 0, count = 0; i < nroots; i++)
684     if (roots[i].rank >= 0) count++;
685   if (count == nroots) newilocal = NULL;
686   else { /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */ PetscCall(PetscMalloc1(count, &newilocal));
687     for (i = 0, count = 0; i < nroots; i++) {
688       if (roots[i].rank >= 0) {
689         newilocal[count]   = i;
690         roots[count].rank  = roots[i].rank;
691         roots[count].index = roots[i].index;
692         count++;
693       }
694     }
695   }
696 
697   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, isf));
698   PetscCall(PetscSFSetGraph(*isf, maxlocal, count, newilocal, PETSC_OWN_POINTER, roots, PETSC_COPY_VALUES));
699   PetscCall(PetscFree2(roots, leaves));
700   PetscFunctionReturn(PETSC_SUCCESS);
701 }
702 
703 /*@
704   PetscSFDuplicate - duplicate a `PetscSF`, optionally preserving rank connectivity and graph
705 
706   Collective
707 
708   Input Parameters:
709 + sf  - communication object to duplicate
710 - opt - `PETSCSF_DUPLICATE_CONFONLY`, `PETSCSF_DUPLICATE_RANKS`, or `PETSCSF_DUPLICATE_GRAPH` (see `PetscSFDuplicateOption`)
711 
712   Output Parameter:
713 . newsf - new communication object
714 
715   Level: beginner
716 
717 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFSetType()`, `PetscSFSetGraph()`
718 @*/
/* Create a new PetscSF copying configuration (and optionally the graph) from sf. */
PetscErrorCode PetscSFDuplicate(PetscSF sf, PetscSFDuplicateOption opt, PetscSF *newsf)
{
  PetscSFType  type;
  MPI_Datatype dtype = MPIU_SCALAR;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveEnum(sf, opt, 2);
  PetscAssertPointer(newsf, 3);
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sf), newsf));
  PetscCall(PetscSFGetType(sf, &type));
  if (type) PetscCall(PetscSFSetType(*newsf, type));
  (*newsf)->allow_multi_leaves = sf->allow_multi_leaves; /* Dup this flag earlier since PetscSFSetGraph() below checks on this flag */
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf, 1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graph: deep-copy the edge lists into the new SF */
      PetscInt           nroots, nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
      PetscCall(PetscSFSetGraph(*newsf, nroots, nleaves, (PetscInt *)ilocal, PETSC_COPY_VALUES, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
    } else {
      /* Patterned graph: rebuild from the stored layout and pattern */
      PetscCall(PetscSFSetGraphWithPattern(*newsf, sf->map, sf->pattern));
    }
  }
  /* Since oldtype is committed, so is newtype, according to MPI */
  if (sf->vscat.bs > 1) PetscCallMPI(MPI_Type_dup(sf->vscat.unit, &dtype)); /* owned copy; freed in PetscSFDestroy() */
  (*newsf)->vscat.bs     = sf->vscat.bs;
  (*newsf)->vscat.unit   = dtype;
  (*newsf)->vscat.to_n   = sf->vscat.to_n;
  (*newsf)->vscat.from_n = sf->vscat.from_n;
  /* Do not copy lsf. Build it on demand since it is rarely used */

#if defined(PETSC_HAVE_DEVICE)
  (*newsf)->backend              = sf->backend;
  (*newsf)->unknown_input_stream = sf->unknown_input_stream;
  (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
  (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
#endif
  PetscTryTypeMethod(sf, Duplicate, opt, *newsf);
  /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
  PetscFunctionReturn(PETSC_SUCCESS);
}
762 
763 /*@C
764   PetscSFGetGraph - Get the graph specifying a parallel star forest
765 
766   Not Collective
767 
768   Input Parameter:
769 . sf - star forest
770 
771   Output Parameters:
772 + nroots  - number of root vertices on the current process (these are possible targets for other process to attach leaves)
773 . nleaves - number of leaf vertices on the current process, each of these references a root on any process
774 . ilocal  - locations of leaves in leafdata buffers (if returned value is `NULL`, it means leaves are in contiguous storage)
775 - iremote - remote locations of root vertices for each leaf on the current process
776 
777   Level: intermediate
778 
779   Notes:
780   We are not currently requiring that the graph is set, thus returning `nroots` = -1 if it has not been set yet
781 
782   The returned `ilocal` and `iremote` might contain values in different order than the input ones in `PetscSFSetGraph()`
783 
784   Fortran Notes:
785   The returned `iremote` array is a copy and must be deallocated after use. Consequently, if you
786   want to update the graph, you must call `PetscSFSetGraph()` after modifying the `iremote` array.
787 
788   To check for a `NULL` `ilocal` use
789 $      if (loc(ilocal) == loc(PETSC_NULL_INTEGER)) then
790 
791 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`
792 @*/
793 PetscErrorCode PetscSFGetGraph(PetscSF sf, PetscInt *nroots, PetscInt *nleaves, const PetscInt **ilocal, const PetscSFNode **iremote)
794 {
795   PetscFunctionBegin;
796   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
797   if (sf->ops->GetGraph) {
798     PetscCall((sf->ops->GetGraph)(sf, nroots, nleaves, ilocal, iremote));
799   } else {
800     if (nroots) *nroots = sf->nroots;
801     if (nleaves) *nleaves = sf->nleaves;
802     if (ilocal) *ilocal = sf->mine;
803     if (iremote) *iremote = sf->remote;
804   }
805   PetscFunctionReturn(PETSC_SUCCESS);
806 }
807 
808 /*@
809   PetscSFGetLeafRange - Get the active leaf ranges
810 
811   Not Collective
812 
813   Input Parameter:
814 . sf - star forest
815 
816   Output Parameters:
817 + minleaf - minimum active leaf on this process. Returns 0 if there are no leaves.
818 - maxleaf - maximum active leaf on this process. Returns -1 if there are no leaves.
819 
820   Level: developer
821 
822 .seealso: `PetscSF`, `PetscSFType`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
823 @*/
PetscErrorCode PetscSFGetLeafRange(PetscSF sf, PetscInt *minleaf, PetscInt *maxleaf)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  /* Bounds are read from cached fields (O(1)); either output pointer may be NULL */
  if (minleaf) *minleaf = sf->minleaf;
  if (maxleaf) *maxleaf = sf->maxleaf;
  PetscFunctionReturn(PETSC_SUCCESS);
}
833 
834 /*@C
835   PetscSFViewFromOptions - View a `PetscSF` based on arguments in the options database
836 
837   Collective
838 
839   Input Parameters:
840 + A    - the star forest
841 . obj  - Optional object that provides the prefix for the option names
842 - name - command line option
843 
844   Level: intermediate
845 
846   Note:
847   See `PetscObjectViewFromOptions()` for possible `PetscViewer` and `PetscViewerFormat`
848 
849 .seealso: `PetscSF`, `PetscSFView`, `PetscObjectViewFromOptions()`, `PetscSFCreate()`
850 @*/
PetscErrorCode PetscSFViewFromOptions(PetscSF A, PetscObject obj, const char name[])
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, PETSCSF_CLASSID, 1);
  /* Thin wrapper: option parsing, viewer creation, and format handling all happen in PetscObjectViewFromOptions() */
  PetscCall(PetscObjectViewFromOptions((PetscObject)A, obj, name));
  PetscFunctionReturn(PETSC_SUCCESS);
}
858 
859 /*@C
860   PetscSFView - view a star forest
861 
862   Collective
863 
864   Input Parameters:
865 + sf     - star forest
866 - viewer - viewer to display graph, for example `PETSC_VIEWER_STDOUT_WORLD`
867 
868   Level: beginner
869 
870 .seealso: `PetscSF`, `PetscViewer`, `PetscSFCreate()`, `PetscSFSetGraph()`
871 @*/
PetscErrorCode PetscSFView(PetscSF sf, PetscViewer viewer)
{
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* Default to stdout on the SF's communicator when no viewer is supplied */
  if (!viewer) PetscCall(PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf), &viewer));
  PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
  PetscCheckSameComm(sf, 1, viewer, 2);
  /* Set up first so the rank tables (sf->ranks, sf->roffset, sf->rmine, sf->rremote) printed below exist */
  if (sf->graphset) PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
  if (iascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) {
    PetscMPIInt rank;
    PetscInt    ii, i, j;

    PetscCall(PetscObjectPrintClassNamePrefixType((PetscObject)sf, viewer));
    PetscCall(PetscViewerASCIIPushTab(viewer));
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* Without a graph there is nothing more to print; pop the tab before the early return */
      if (!sf->graphset) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "PetscSFSetGraph() has not been called yet\n"));
        PetscCall(PetscViewerASCIIPopTab(viewer));
        PetscFunctionReturn(PETSC_SUCCESS);
      }
      PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
      /* Synchronized mode collects per-process output so it appears ordered by rank */
      PetscCall(PetscViewerASCIIPushSynchronized(viewer));
      PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Number of roots=%" PetscInt_FMT ", leaves=%" PetscInt_FMT ", remote ranks=%" PetscInt_FMT "\n", rank, sf->nroots, sf->nleaves, sf->nranks));
      /* One line per leaf: local index <- (remote rank, remote root index); NULL sf->mine means contiguous leaves */
      for (i = 0; i < sf->nleaves; i++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %" PetscInt_FMT " <- (%" PetscInt_FMT ",%" PetscInt_FMT ")\n", rank, sf->mine ? sf->mine[i] : i, sf->remote[i].rank, sf->remote[i].index));
      PetscCall(PetscViewerFlush(viewer));
      PetscCall(PetscViewerGetFormat(viewer, &format));
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        /* Detailed mode additionally lists every edge, grouped by remote rank in ascending rank order */
        PetscMPIInt *tmpranks, *perm;
        PetscCall(PetscMalloc2(sf->nranks, &tmpranks, sf->nranks, &perm));
        PetscCall(PetscArraycpy(tmpranks, sf->ranks, sf->nranks));
        for (i = 0; i < sf->nranks; i++) perm[i] = i;
        /* Sort a scratch copy of the ranks, carrying the permutation so the original arrays can be indexed */
        PetscCall(PetscSortMPIIntWithArray(sf->nranks, tmpranks, perm));
        PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Roots referenced by my leaves, by rank\n", rank));
        for (ii = 0; ii < sf->nranks; ii++) {
          i = perm[ii];
          PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] %d: %" PetscInt_FMT " edges\n", rank, sf->ranks[i], sf->roffset[i + 1] - sf->roffset[i]));
          for (j = sf->roffset[i]; j < sf->roffset[i + 1]; j++) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d]    %" PetscInt_FMT " <- %" PetscInt_FMT "\n", rank, sf->rmine[j], sf->rremote[j]));
        }
        PetscCall(PetscFree2(tmpranks, perm));
      }
      PetscCall(PetscViewerFlush(viewer));
      PetscCall(PetscViewerASCIIPopSynchronized(viewer));
    }
    PetscCall(PetscViewerASCIIPopTab(viewer));
  }
  /* Give the implementation a chance to print type-specific information */
  PetscTryTypeMethod(sf, View, viewer);
  PetscFunctionReturn(PETSC_SUCCESS);
}
924 
925 /*@C
926   PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
927 
928   Not Collective
929 
930   Input Parameter:
931 . sf - star forest
932 
933   Output Parameters:
934 + nranks  - number of ranks referenced by local part
935 . ranks   - [`nranks`] array of ranks
936 . roffset - [`nranks`+1] offset in `rmine`/`rremote` for each rank
937 . rmine   - [`roffset`[`nranks`]] concatenated array holding local indices referencing each remote rank
938 - rremote - [`roffset`[`nranks`]] concatenated array holding remote indices referenced for each remote rank
939 
940   Level: developer
941 
942 .seealso: `PetscSF`, `PetscSFGetLeafRanks()`
943 @*/
944 PetscErrorCode PetscSFGetRootRanks(PetscSF sf, PetscInt *nranks, const PetscMPIInt **ranks, const PetscInt **roffset, const PetscInt **rmine, const PetscInt **rremote)
945 {
946   PetscFunctionBegin;
947   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
948   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
949   if (sf->ops->GetRootRanks) {
950     PetscCall((sf->ops->GetRootRanks)(sf, nranks, ranks, roffset, rmine, rremote));
951   } else {
952     /* The generic implementation */
953     if (nranks) *nranks = sf->nranks;
954     if (ranks) *ranks = sf->ranks;
955     if (roffset) *roffset = sf->roffset;
956     if (rmine) *rmine = sf->rmine;
957     if (rremote) *rremote = sf->rremote;
958   }
959   PetscFunctionReturn(PETSC_SUCCESS);
960 }
961 
962 /*@C
963   PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
964 
965   Not Collective
966 
967   Input Parameter:
968 . sf - star forest
969 
970   Output Parameters:
971 + niranks  - number of leaf ranks referencing roots on this process
972 . iranks   - [`niranks`] array of ranks
973 . ioffset  - [`niranks`+1] offset in `irootloc` for each rank
974 - irootloc - [`ioffset`[`niranks`]] concatenated array holding local indices of roots referenced by each leaf rank
975 
976   Level: developer
977 
978 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
979 @*/
980 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf, PetscInt *niranks, const PetscMPIInt **iranks, const PetscInt **ioffset, const PetscInt **irootloc)
981 {
982   PetscFunctionBegin;
983   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
984   PetscCheck(sf->setupcalled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUp() before obtaining ranks");
985   if (sf->ops->GetLeafRanks) {
986     PetscCall((sf->ops->GetLeafRanks)(sf, niranks, iranks, ioffset, irootloc));
987   } else {
988     PetscSFType type;
989     PetscCall(PetscSFGetType(sf, &type));
990     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
991   }
992   PetscFunctionReturn(PETSC_SUCCESS);
993 }
994 
995 static PetscBool InList(PetscMPIInt needle, PetscMPIInt n, const PetscMPIInt *list)
996 {
997   PetscInt i;
998   for (i = 0; i < n; i++) {
999     if (needle == list[i]) return PETSC_TRUE;
1000   }
1001   return PETSC_FALSE;
1002 }
1003 
1004 /*@C
1005   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by `PetscSF` implementations.
1006 
1007   Collective
1008 
1009   Input Parameters:
1010 + sf     - `PetscSF` to set up; `PetscSFSetGraph()` must have been called
1011 - dgroup - `MPI_Group` of ranks to be distinguished (e.g., for self or shared memory exchange)
1012 
1013   Level: developer
1014 
1015 .seealso: `PetscSF`, `PetscSFGetRootRanks()`
1016 @*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf, MPI_Group dgroup)
{
  PetscHMapI    table;
  PetscHashIter pos;
  PetscMPIInt   size, groupsize, *groupranks;
  PetscInt     *rcount, *ranks;
  PetscInt      i, irank = -1, orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)sf), &size));
  /* Count, per remote rank, how many of our leaves reference that rank (hash table keyed by rank) */
  PetscCall(PetscHMapICreateWithSize(10, &table));
  for (i = 0; i < sf->nleaves; i++) {
    /* Log 1-based rank */
    PetscCall(PetscHMapISetWithMode(table, sf->remote[i].rank + 1, 1, ADD_VALUES));
  }
  PetscCall(PetscHMapIGetSize(table, &sf->nranks));
  PetscCall(PetscMalloc4(sf->nranks, &sf->ranks, sf->nranks + 1, &sf->roffset, sf->nleaves, &sf->rmine, sf->nleaves, &sf->rremote));
  PetscCall(PetscMalloc2(sf->nranks, &rcount, sf->nranks, &ranks));
  /* Drain the hash table into parallel arrays: ranks[] (connected remote ranks) and rcount[] (edge counts) */
  PetscHashIterBegin(table, pos);
  for (i = 0; i < sf->nranks; i++) {
    PetscHashIterGetKey(table, pos, ranks[i]);
    PetscHashIterGetVal(table, pos, rcount[i]);
    PetscHashIterNext(table, pos);
    ranks[i]--; /* Convert back to 0-based */
  }
  PetscCall(PetscHMapIDestroy(&table));

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group    group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate the distinguished group's members into rank numbers of sf's communicator */
    PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
    PetscCallMPI(MPI_Group_size(dgroup, &groupsize));
    PetscCall(PetscMalloc1(groupsize, &dgroupranks));
    PetscCall(PetscMalloc1(groupsize, &groupranks));
    for (i = 0; i < groupsize; i++) dgroupranks[i] = i;
    if (groupsize) PetscCallMPI(MPI_Group_translate_ranks(dgroup, groupsize, dgroupranks, group, groupranks));
    PetscCallMPI(MPI_Group_free(&group));
    PetscCall(PetscFree(dgroupranks));
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  /* Two-pointer partition: i scans backward for a distinguished rank, sf->ndranks scans forward for a
     non-distinguished one, and the pair is swapped (rcount[] kept in step with ranks[]) */
  for (sf->ndranks = 0, i = sf->nranks; sf->ndranks < i;) {
    for (i--; sf->ndranks < i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i], groupsize, groupranks)) break;
    }
    for (; sf->ndranks <= i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks], groupsize, groupranks)) break;
    }
    if (sf->ndranks < i) { /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank, tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  PetscCall(PetscFree(groupranks));
  /* Sort each partition segment by rank, keeping rcount[] aligned */
  PetscCall(PetscSortIntWithArray(sf->ndranks, ranks, rcount));
  if (rcount) PetscCall(PetscSortIntWithArray(sf->nranks - sf->ndranks, ranks + sf->ndranks, rcount + sf->ndranks));
  /* Build CSR-style offsets into rmine[]/rremote[]; rcount[] is reset to serve as a per-rank fill cursor */
  sf->roffset[0] = 0;
  for (i = 0; i < sf->nranks; i++) {
    PetscCall(PetscMPIIntCast(ranks[i], sf->ranks + i));
    sf->roffset[i + 1] = sf->roffset[i] + rcount[i];
    rcount[i]          = 0;
  }
  /* Fill rmine[] (local leaf indices) and rremote[] (remote root indices) grouped by rank; orank caches
     the last rank looked up so consecutive leaves to the same rank skip the binary search */
  for (i = 0, irank = -1, orank = -1; i < sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks */
      PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->ndranks, sf->ranks, &irank));
      if (irank < 0) {
        PetscCall(PetscFindMPIInt(sf->remote[i].rank, sf->nranks - sf->ndranks, sf->ranks + sf->ndranks, &irank));
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    PetscCheck(irank >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Could not find rank %" PetscInt_FMT " in array", sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  PetscCall(PetscFree2(rcount, ranks));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1108 
1109 /*@C
1110   PetscSFGetGroups - gets incoming and outgoing process groups
1111 
1112   Collective
1113 
1114   Input Parameter:
1115 . sf - star forest
1116 
1117   Output Parameters:
1118 + incoming - group of origin processes for incoming edges (leaves that reference my roots)
1119 - outgoing - group of destination processes for outgoing edges (roots that I reference)
1120 
1121   Level: developer
1122 
1123 .seealso: `PetscSF`, `PetscSFGetWindow()`, `PetscSFRestoreWindow()`
1124 @*/
PetscErrorCode PetscSFGetGroups(PetscSF sf, MPI_Group *incoming, MPI_Group *outgoing)
{
  MPI_Group group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  PetscCheck(sf->nranks >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFSetUpRanks() before obtaining groups");
  /* Both groups are built lazily on first request and cached on the sf */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt        i;
    const PetscInt *indegree;
    PetscMPIInt     rank, *outranks, *inranks;
    PetscSFNode    *remote;
    PetscSF         bgcount;

    /* Compute the number of incoming ranks */
    /* Helper sf: each connected remote rank gets one leaf pointing at that rank's root 0,
       so the in-degree of our single root equals the number of ranks whose leaves reference us */
    PetscCall(PetscMalloc1(sf->nranks, &remote));
    for (i = 0; i < sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, &bgcount));
    PetscCall(PetscSFSetGraph(bgcount, 1, sf->nranks, NULL, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
    PetscCall(PetscSFComputeDegreeBegin(bgcount, &indegree));
    PetscCall(PetscSFComputeDegreeEnd(bgcount, &indegree));
    /* Enumerate the incoming ranks */
    /* Every process gathers the rank ids of its referencing processes into inranks[] */
    PetscCall(PetscMalloc2(indegree[0], &inranks, sf->nranks, &outranks));
    PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
    for (i = 0; i < sf->nranks; i++) outranks[i] = rank;
    PetscCall(PetscSFGatherBegin(bgcount, MPI_INT, outranks, inranks));
    PetscCall(PetscSFGatherEnd(bgcount, MPI_INT, outranks, inranks));
    PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
    PetscCallMPI(MPI_Group_incl(group, indegree[0], inranks, &sf->ingroup));
    PetscCallMPI(MPI_Group_free(&group));
    PetscCall(PetscFree2(inranks, outranks));
    PetscCall(PetscSFDestroy(&bgcount));
  }
  *incoming = sf->ingroup;

  /* The outgoing group is simply the ranks our leaves reference, already available in sf->ranks */
  if (sf->outgroup == MPI_GROUP_NULL) {
    PetscCallMPI(MPI_Comm_group(PetscObjectComm((PetscObject)sf), &group));
    PetscCallMPI(MPI_Group_incl(group, sf->nranks, sf->ranks, &sf->outgroup));
    PetscCallMPI(MPI_Group_free(&group));
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1170 
1171 /*@
1172   PetscSFGetMultiSF - gets the inner `PetscSF` implementing gathers and scatters
1173 
1174   Collective
1175 
1176   Input Parameter:
1177 . sf - star forest that may contain roots with 0 or with more than 1 vertex
1178 
1179   Output Parameter:
1180 . multi - star forest with split roots, such that each root has degree exactly 1
1181 
1182   Level: developer
1183 
1184   Note:
1185   In most cases, users should use `PetscSFGatherBegin()` and `PetscSFScatterBegin()` instead of manipulating multi
1186   directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1187   edge, it is a candidate for future optimization that might involve its removal.
1188 
1189 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGatherBegin()`, `PetscSFScatterBegin()`, `PetscSFComputeMultiRootOriginalNumbering()`
1190 @*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf, PetscSF *multi)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(multi, 2);
  if (sf->nroots < 0) { /* Graph has not been set yet; why do we need this? */
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
    *multi           = sf->multi;
    sf->multi->multi = sf->multi; /* multi is its own multi-sf */
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* Build the multi-sf lazily and cache it on the sf */
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt        i, *inoffset, *outones, *outoffset, maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf + 1; /* TODO: We should use PetscSFGetLeafRange() */
    PetscCall(PetscSFComputeDegreeBegin(sf, &indegree));
    PetscCall(PetscSFComputeDegreeEnd(sf, &indegree));
    PetscCall(PetscMalloc3(sf->nroots + 1, &inoffset, maxlocal, &outones, maxlocal, &outoffset));
    /* inoffset[] = prefix sum of root degrees: where each root's split copies start in the multi-root space */
    inoffset[0] = 0;
    for (i = 0; i < sf->nroots; i++) inoffset[i + 1] = inoffset[i] + indegree[i];
    for (i = 0; i < maxlocal; i++) outones[i] = 1;
    /* Fetch-and-add of 1 per leaf: each leaf atomically claims a distinct slot (outoffset) among its root's copies */
    PetscCall(PetscSFFetchAndOpBegin(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
    PetscCall(PetscSFFetchAndOpEnd(sf, MPIU_INT, inoffset, outones, outoffset, MPI_SUM));
    for (i = 0; i < sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
    if (PetscDefined(USE_DEBUG)) {                               /* Check that the expected number of increments occurred */
      for (i = 0; i < sf->nroots; i++) PetscCheck(inoffset[i] + indegree[i] == inoffset[i + 1], PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incorrect result after PetscSFFetchAndOp");
    }
    /* The multi graph keeps each leaf's rank but retargets its root index to the claimed slot */
    PetscCall(PetscMalloc1(sf->nleaves, &remote));
    for (i = 0; i < sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &sf->multi));
    sf->multi->multi = sf->multi;
    PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, remote, PETSC_OWN_POINTER));
    if (sf->rankorder) { /* Sort the ranks */
      PetscMPIInt  rank;
      PetscInt    *inranks, *newoffset, *outranks, *newoutoffset, *tmpoffset, maxdegree;
      PetscSFNode *newremote;
      PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));
      for (i = 0, maxdegree = 0; i < sf->nroots; i++) maxdegree = PetscMax(maxdegree, indegree[i]);
      PetscCall(PetscMalloc5(sf->multi->nroots, &inranks, sf->multi->nroots, &newoffset, maxlocal, &outranks, maxlocal, &newoutoffset, maxdegree, &tmpoffset));
      /* Deposit each leaf's owning rank at its multi-root slot so roots can see who claimed which slot */
      for (i = 0; i < maxlocal; i++) outranks[i] = rank;
      PetscCall(PetscSFReduceBegin(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(sf->multi, MPIU_INT, outranks, inranks, MPI_REPLACE));
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i = 0; i < sf->nroots; i++) {
        PetscInt j;
        for (j = 0; j < indegree[i]; j++) tmpoffset[j] = j;
        if (inranks) PetscCall(PetscSortIntWithArray(indegree[i], inranks + inoffset[i], tmpoffset));
        for (j = 0; j < indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Broadcast the rank-sorted slot numbers back to the leaves and rebuild the graph with them */
      PetscCall(PetscSFBcastBegin(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(sf->multi, MPIU_INT, newoffset, newoutoffset, MPI_REPLACE));
      PetscCall(PetscMalloc1(sf->nleaves, &newremote));
      for (i = 0; i < sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      PetscCall(PetscSFSetGraph(sf->multi, inoffset[sf->nroots], sf->nleaves, sf->mine, PETSC_COPY_VALUES, newremote, PETSC_OWN_POINTER));
      PetscCall(PetscFree5(inranks, newoffset, outranks, newoutoffset, tmpoffset));
    }
    PetscCall(PetscFree3(inoffset, outones, outoffset));
  }
  *multi = sf->multi;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1259 
1260 /*@C
1261   PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots of a `PetscSF`, does not remap indices
1262 
1263   Collective
1264 
1265   Input Parameters:
1266 + sf        - original star forest
1267 . nselected - number of selected roots on this process
1268 - selected  - indices of the selected roots on this process
1269 
1270   Output Parameter:
1271 . esf - new star forest
1272 
1273   Level: advanced
1274 
1275   Note:
1276   To use the new `PetscSF`, it may be necessary to know the indices of the leaves that are still participating. This can
  be done by calling `PetscSFGetGraph()`.
1278 
1279 .seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1280 @*/
1281 PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *esf)
1282 {
1283   PetscInt           i, j, n, nroots, nleaves, esf_nleaves, *new_ilocal, minleaf, maxleaf, maxlocal;
1284   const PetscInt    *ilocal;
1285   signed char       *rootdata, *leafdata, *leafmem;
1286   const PetscSFNode *iremote;
1287   PetscSFNode       *new_iremote;
1288   MPI_Comm           comm;
1289 
1290   PetscFunctionBegin;
1291   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1292   PetscSFCheckGraphSet(sf, 1);
1293   if (nselected) PetscAssertPointer(selected, 3);
1294   PetscAssertPointer(esf, 4);
1295 
1296   PetscCall(PetscSFSetUp(sf));
1297   PetscCall(PetscLogEventBegin(PETSCSF_EmbedSF, sf, 0, 0, 0));
1298   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1299   PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
1300 
1301   if (PetscDefined(USE_DEBUG)) { /* Error out if selected[] has dups or  out of range indices */
1302     PetscBool dups;
1303     PetscCall(PetscCheckDupsInt(nselected, selected, &dups));
1304     PetscCheck(!dups, comm, PETSC_ERR_ARG_WRONG, "selected[] has dups");
1305     for (i = 0; i < nselected; i++) PetscCheck(selected[i] >= 0 && selected[i] < nroots, comm, PETSC_ERR_ARG_OUTOFRANGE, "selected root indice %" PetscInt_FMT " is out of [0,%" PetscInt_FMT ")", selected[i], nroots);
1306   }
1307 
1308   if (sf->ops->CreateEmbeddedRootSF) PetscUseTypeMethod(sf, CreateEmbeddedRootSF, nselected, selected, esf);
1309   else {
1310     /* A generic version of creating embedded sf */
1311     PetscCall(PetscSFGetLeafRange(sf, &minleaf, &maxleaf));
1312     maxlocal = maxleaf - minleaf + 1;
1313     PetscCall(PetscCalloc2(nroots, &rootdata, maxlocal, &leafmem));
1314     leafdata = leafmem - minleaf;
1315     /* Tag selected roots and bcast to leaves */
1316     for (i = 0; i < nselected; i++) rootdata[selected[i]] = 1;
1317     PetscCall(PetscSFBcastBegin(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
1318     PetscCall(PetscSFBcastEnd(sf, MPI_SIGNED_CHAR, rootdata, leafdata, MPI_REPLACE));
1319 
1320     /* Build esf with leaves that are still connected */
1321     esf_nleaves = 0;
1322     for (i = 0; i < nleaves; i++) {
1323       j = ilocal ? ilocal[i] : i;
1324       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1325          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1326       */
1327       esf_nleaves += (leafdata[j] ? 1 : 0);
1328     }
1329     PetscCall(PetscMalloc1(esf_nleaves, &new_ilocal));
1330     PetscCall(PetscMalloc1(esf_nleaves, &new_iremote));
1331     for (i = n = 0; i < nleaves; i++) {
1332       j = ilocal ? ilocal[i] : i;
1333       if (leafdata[j]) {
1334         new_ilocal[n]        = j;
1335         new_iremote[n].rank  = iremote[i].rank;
1336         new_iremote[n].index = iremote[i].index;
1337         ++n;
1338       }
1339     }
1340     PetscCall(PetscSFCreate(comm, esf));
1341     PetscCall(PetscSFSetFromOptions(*esf));
1342     PetscCall(PetscSFSetGraph(*esf, nroots, esf_nleaves, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1343     PetscCall(PetscFree2(rootdata, leafmem));
1344   }
1345   PetscCall(PetscLogEventEnd(PETSCSF_EmbedSF, sf, 0, 0, 0));
1346   PetscFunctionReturn(PETSC_SUCCESS);
1347 }
1348 
1349 /*@C
1350   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves of a `PetscSF`, does not remap indices
1351 
1352   Collective
1353 
1354   Input Parameters:
1355 + sf        - original star forest
1356 . nselected - number of selected leaves on this process
1357 - selected  - indices of the selected leaves on this process
1358 
1359   Output Parameter:
1360 . newsf - new star forest
1361 
1362   Level: advanced
1363 
1364 .seealso: `PetscSF`, `PetscSFCreateEmbeddedRootSF()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
1365 @*/
1366 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf, PetscInt nselected, const PetscInt *selected, PetscSF *newsf)
1367 {
1368   const PetscSFNode *iremote;
1369   PetscSFNode       *new_iremote;
1370   const PetscInt    *ilocal;
1371   PetscInt           i, nroots, *leaves, *new_ilocal;
1372   MPI_Comm           comm;
1373 
1374   PetscFunctionBegin;
1375   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1376   PetscSFCheckGraphSet(sf, 1);
1377   if (nselected) PetscAssertPointer(selected, 3);
1378   PetscAssertPointer(newsf, 4);
1379 
1380   /* Uniq selected[] and put results in leaves[] */
1381   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
1382   PetscCall(PetscMalloc1(nselected, &leaves));
1383   PetscCall(PetscArraycpy(leaves, selected, nselected));
1384   PetscCall(PetscSortedRemoveDupsInt(&nselected, leaves));
1385   PetscCheck(!nselected || !(leaves[0] < 0 || leaves[nselected - 1] >= sf->nleaves), comm, PETSC_ERR_ARG_OUTOFRANGE, "Min/Max leaf indices %" PetscInt_FMT "/%" PetscInt_FMT " are not in [0,%" PetscInt_FMT ")", leaves[0], leaves[nselected - 1], sf->nleaves);
1386 
1387   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1388   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) PetscUseTypeMethod(sf, CreateEmbeddedLeafSF, nselected, leaves, newsf);
1389   else {
1390     PetscCall(PetscSFGetGraph(sf, &nroots, NULL, &ilocal, &iremote));
1391     PetscCall(PetscMalloc1(nselected, &new_ilocal));
1392     PetscCall(PetscMalloc1(nselected, &new_iremote));
1393     for (i = 0; i < nselected; ++i) {
1394       const PetscInt l     = leaves[i];
1395       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1396       new_iremote[i].rank  = iremote[l].rank;
1397       new_iremote[i].index = iremote[l].index;
1398     }
1399     PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, newsf));
1400     PetscCall(PetscSFSetGraph(*newsf, nroots, nselected, new_ilocal, PETSC_OWN_POINTER, new_iremote, PETSC_OWN_POINTER));
1401   }
1402   PetscCall(PetscFree(leaves));
1403   PetscFunctionReturn(PETSC_SUCCESS);
1404 }
1405 
/*@C
  PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to `PetscSFBcastEnd()`

  Collective

  Input Parameters:
+ sf       - star forest on which to communicate
. unit     - data type associated with each node
. rootdata - buffer to broadcast
- op       - operation to use for reduction

  Output Parameter:
. leafdata - buffer to be reduced with values from each leaf's respective root

  Level: intermediate

  Note:
  When PETSc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
  are host pointers or device pointers, which may incur a noticeable cost. If you already knew the info, you should
  use `PetscSFBcastWithMemTypeBegin()` instead.

.seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastWithMemTypeBegin()`
@*/
PetscErrorCode PetscSFBcastBegin(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
{
  PetscMemType rootmtype, leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  /* NOTE(review): event logging is suppressed when sf->vscat.logging is set, presumably because an owning VecScatter logs its own events — confirm against the vscat code */
  if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
  /* Query whether the user buffers live on host or device; this is the cost PetscSFBcastWithMemTypeBegin() lets callers avoid */
  PetscCall(PetscGetMemType(rootdata, &rootmtype));
  PetscCall(PetscGetMemType(leafdata, &leafmtype));
  PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
  if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1443 
/*@C
  PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call
  to `PetscSFBcastEnd()`

  Collective

  Input Parameters:
+ sf        - star forest on which to communicate
. unit      - data type associated with each node
. rootmtype - memory type of rootdata
. rootdata  - buffer to broadcast
. leafmtype - memory type of leafdata
- op        - operation to use for reduction

  Output Parameter:
. leafdata - buffer to be reduced with values from each leaf's respective root

  Level: intermediate

.seealso: `PetscSF`, `PetscSFBcastEnd()`, `PetscSFBcastBegin()`
@*/
PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
  /* Caller supplies the memory types, so the PetscGetMemType() queries done by PetscSFBcastBegin() are skipped */
  PetscUseTypeMethod(sf, BcastBegin, unit, rootmtype, rootdata, leafmtype, leafdata, op);
  if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1475 
/*@C
  PetscSFBcastEnd - end a broadcast and reduce operation started with `PetscSFBcastBegin()` or `PetscSFBcastWithMemTypeBegin()`

  Collective

  Input Parameters:
+ sf       - star forest
. unit     - data type
. rootdata - buffer to broadcast
- op       - operation to use for reduction

  Output Parameter:
. leafdata - buffer to be reduced with values from each leaf's respective root

  Level: intermediate

.seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFReduceEnd()`
@*/
PetscErrorCode PetscSFBcastEnd(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* No PetscSFSetUp() here: the matching Begin call already set the SF up */
  if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_BcastEnd, sf, 0, 0, 0));
  /* NOTE(review): rootdata/leafdata are presumably the same buffers passed to the Begin call — confirm with implementations */
  PetscUseTypeMethod(sf, BcastEnd, unit, rootdata, leafdata, op);
  if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_BcastEnd, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1503 
1504 /*@C
1505   PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to `PetscSFReduceEnd()`
1506 
1507   Collective
1508 
1509   Input Parameters:
1510 + sf       - star forest
1511 . unit     - data type
1512 . leafdata - values to reduce
1513 - op       - reduction operation
1514 
1515   Output Parameter:
1516 . rootdata - result of reduction of values from all leaves of each root
1517 
1518   Level: intermediate
1519 
1520   Note:
1521   When PETSc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1522   are host pointers or device pointers, which may incur a noticeable cost. If you already knew the info, you should
1523   use `PetscSFReduceWithMemTypeBegin()` instead.
1524 
1525 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceWithMemTypeBegin()`, `PetscSFReduceEnd()`
1526 @*/
1527 PetscErrorCode PetscSFReduceBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
1528 {
1529   PetscMemType rootmtype, leafmtype;
1530 
1531   PetscFunctionBegin;
1532   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1533   PetscCall(PetscSFSetUp(sf));
1534   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1535   PetscCall(PetscGetMemType(rootdata, &rootmtype));
1536   PetscCall(PetscGetMemType(leafdata, &leafmtype));
1537   PetscCall((sf->ops->ReduceBegin)(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1538   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1539   PetscFunctionReturn(PETSC_SUCCESS);
1540 }
1541 
1542 /*@C
1543   PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to `PetscSFReduceEnd()`
1544 
1545   Collective
1546 
1547   Input Parameters:
1548 + sf        - star forest
1549 . unit      - data type
1550 . leafmtype - memory type of leafdata
1551 . leafdata  - values to reduce
1552 . rootmtype - memory type of rootdata
1553 - op        - reduction operation
1554 
1555   Output Parameter:
1556 . rootdata - result of reduction of values from all leaves of each root
1557 
1558   Level: intermediate
1559 
1560 .seealso: `PetscSF`, `PetscSFBcastBegin()`, `PetscSFReduceBegin()`, `PetscSFReduceEnd()`
1561 @*/
1562 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
1563 {
1564   PetscFunctionBegin;
1565   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1566   PetscCall(PetscSFSetUp(sf));
1567   if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1568   PetscCall((sf->ops->ReduceBegin)(sf, unit, leafmtype, leafdata, rootmtype, rootdata, op));
1569   if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceBegin, sf, 0, 0, 0));
1570   PetscFunctionReturn(PETSC_SUCCESS);
1571 }
1572 
/*@C
  PetscSFReduceEnd - end a reduction operation started with `PetscSFReduceBegin()` or `PetscSFReduceWithMemTypeBegin()`

  Collective

  Input Parameters:
+ sf       - star forest
. unit     - data type
. leafdata - values to reduce
- op       - reduction operation

  Output Parameter:
. rootdata - result of reduction of values from all leaves of each root

  Level: intermediate

.seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFBcastEnd()`, `PetscSFReduceBegin()`, `PetscSFReduceWithMemTypeBegin()`
@*/
PetscErrorCode PetscSFReduceEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* No PetscSFSetUp() here: the matching Begin call already set the SF up */
  if (!sf->vscat.logging) PetscCall(PetscLogEventBegin(PETSCSF_ReduceEnd, sf, 0, 0, 0));
  PetscUseTypeMethod(sf, ReduceEnd, unit, leafdata, rootdata, op);
  if (!sf->vscat.logging) PetscCall(PetscLogEventEnd(PETSCSF_ReduceEnd, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1600 
/*@C
  PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value,
  to be completed with `PetscSFFetchAndOpEnd()`

  Collective

  Input Parameters:
+ sf       - star forest
. unit     - data type
. leafdata - leaf values to use in reduction
- op       - operation to use for reduction

  Output Parameters:
+ rootdata   - root values to be updated, input state is seen by first process to perform an update
- leafupdate - state at each leaf's respective root immediately prior to my atomic update

  Level: advanced

  Note:
  The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
  might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
  not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
  integers.

.seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`
@*/
PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscMemType rootmtype, leafmtype, leafupdatemtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
  /* Determine host/device location of all three user buffers */
  PetscCall(PetscGetMemType(rootdata, &rootmtype));
  PetscCall(PetscGetMemType(leafdata, &leafmtype));
  PetscCall(PetscGetMemType(leafupdate, &leafupdatemtype));
  /* Implementations receive a single leaf memory type, so leafdata and leafupdate must live in the same space */
  PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
  PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
  PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1643 
/*@C
  PetscSFFetchAndOpWithMemTypeBegin - begin operation with explicit memory types that fetches values from root and updates atomically by
  applying operation using my leaf value, to be completed with `PetscSFFetchAndOpEnd()`

  Collective

  Input Parameters:
+ sf              - star forest
. unit            - data type
. rootmtype       - memory type of rootdata
. leafmtype       - memory type of leafdata
. leafdata        - leaf values to use in reduction
. leafupdatemtype - memory type of leafupdate
- op              - operation to use for reduction

  Output Parameters:
+ rootdata   - root values to be updated, input state is seen by first process to perform an update
- leafupdate - state at each leaf's respective root immediately prior to my atomic update

  Level: advanced

  Note:
  See `PetscSFFetchAndOpBegin()` for more details.

.seealso: `PetscSF`, `PetscSFFetchAndOpBegin()`, `PetscSFComputeDegreeBegin()`, `PetscSFReduceBegin()`, `PetscSFSetGraph()`, `PetscSFFetchAndOpEnd()`
@*/
PetscErrorCode PetscSFFetchAndOpWithMemTypeBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, PetscMemType leafupdatemtype, void *leafupdate, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
  /* Implementations receive a single leaf memory type, so leafdata and leafupdate must live in the same space */
  PetscCheck(leafmtype == leafupdatemtype, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for leafdata and leafupdate in different memory types");
  PetscUseTypeMethod(sf, FetchAndOpBegin, unit, rootmtype, rootdata, leafmtype, leafdata, leafupdate, op);
  PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpBegin, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1681 
/*@C
  PetscSFFetchAndOpEnd - end operation started in matching call to `PetscSFFetchAndOpBegin()` or `PetscSFFetchAndOpWithMemTypeBegin()`
  to fetch values from roots and update atomically by applying operation using my leaf value

  Collective

  Input Parameters:
+ sf       - star forest
. unit     - data type
. leafdata - leaf values to use in reduction
- op       - operation to use for reduction

  Output Parameters:
+ rootdata   - root values to be updated, input state is seen by first process to perform an update
- leafupdate - state at each leaf's respective root immediately prior to my atomic update

  Level: advanced

.seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFReduceEnd()`, `PetscSFSetGraph()`, `PetscSFFetchAndOpBegin()`, `PetscSFFetchAndOpWithMemTypeBegin()`
@*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  /* No PetscSFSetUp() here: the matching Begin call already set the SF up */
  PetscCall(PetscLogEventBegin(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
  PetscUseTypeMethod(sf, FetchAndOpEnd, unit, rootdata, leafdata, leafupdate, op);
  PetscCall(PetscLogEventEnd(PETSCSF_FetchAndOpEnd, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1711 
1712 /*@C
1713   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with `PetscSFComputeDegreeEnd()`
1714 
1715   Collective
1716 
1717   Input Parameter:
1718 . sf - star forest
1719 
1720   Output Parameter:
1721 . degree - degree of each root vertex
1722 
1723   Level: advanced
1724 
1725   Note:
1726   The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence there is no need to call `PetscFree()` on it.
1727 
1728 .seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeEnd()`
1729 @*/
1730 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf, const PetscInt **degree)
1731 {
1732   PetscFunctionBegin;
1733   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1734   PetscSFCheckGraphSet(sf, 1);
1735   PetscAssertPointer(degree, 2);
1736   if (!sf->degreeknown) {
1737     PetscInt i, nroots = sf->nroots, maxlocal;
1738     PetscCheck(!sf->degree, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1739     maxlocal = sf->maxleaf - sf->minleaf + 1;
1740     PetscCall(PetscMalloc1(nroots, &sf->degree));
1741     PetscCall(PetscMalloc1(PetscMax(maxlocal, 1), &sf->degreetmp)); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1742     for (i = 0; i < nroots; i++) sf->degree[i] = 0;
1743     for (i = 0; i < maxlocal; i++) sf->degreetmp[i] = 1;
1744     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
1745   }
1746   *degree = NULL;
1747   PetscFunctionReturn(PETSC_SUCCESS);
1748 }
1749 
/*@C
  PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with `PetscSFComputeDegreeBegin()`

  Collective

  Input Parameter:
. sf - star forest

  Output Parameter:
. degree - degree of each root vertex

  Level: developer

  Note:
  The returned array is owned by `PetscSF` and automatically freed by `PetscSFDestroy()`. Hence there is no need to call `PetscFree()` on it.

.seealso: `PetscSF`, `PetscSFGatherBegin()`, `PetscSFComputeDegreeBegin()`
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf, const PetscInt **degree)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscSFCheckGraphSet(sf, 1);
  PetscAssertPointer(degree, 2);
  if (!sf->degreeknown) {
    PetscCheck(sf->degreetmp, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    PetscCall(PetscSFReduceEnd(sf, MPIU_INT, sf->degreetmp - sf->minleaf, sf->degree, MPI_SUM));
    PetscCall(PetscFree(sf->degreetmp));
    /* Cache the result: subsequent Begin/End pairs skip the communication and reuse sf->degree */
    sf->degreeknown = PETSC_TRUE;
  }
  *degree = sf->degree;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1783 
1784 /*@C
1785   PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-`PetscSF` returned by `PetscSFGetMultiSF()`).
1786   Each multi-root is assigned index of the corresponding original root.
1787 
1788   Collective
1789 
1790   Input Parameters:
1791 + sf     - star forest
1792 - degree - degree of each root vertex, computed with `PetscSFComputeDegreeBegin()`/`PetscSFComputeDegreeEnd()`
1793 
1794   Output Parameters:
1795 + nMultiRoots             - (optional) number of multi-roots (roots of multi-`PetscSF`)
1796 - multiRootsOrigNumbering - original indices of multi-roots; length of this array is `nMultiRoots`
1797 
1798   Level: developer
1799 
1800   Note:
1801   The returned array `multiRootsOrigNumbering` is newly allocated and should be destroyed with `PetscFree()` when no longer needed.
1802 
1803 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFComputeDegreeEnd()`, `PetscSFGetMultiSF()`
1804 @*/
1805 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1806 {
1807   PetscSF  msf;
1808   PetscInt i, j, k, nroots, nmroots;
1809 
1810   PetscFunctionBegin;
1811   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1812   PetscCall(PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL));
1813   if (nroots) PetscAssertPointer(degree, 2);
1814   if (nMultiRoots) PetscAssertPointer(nMultiRoots, 3);
1815   PetscAssertPointer(multiRootsOrigNumbering, 4);
1816   PetscCall(PetscSFGetMultiSF(sf, &msf));
1817   PetscCall(PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL));
1818   PetscCall(PetscMalloc1(nmroots, multiRootsOrigNumbering));
1819   for (i = 0, j = 0, k = 0; i < nroots; i++) {
1820     if (!degree[i]) continue;
1821     for (j = 0; j < degree[i]; j++, k++) (*multiRootsOrigNumbering)[k] = i;
1822   }
1823   PetscCheck(k == nmroots, PETSC_COMM_SELF, PETSC_ERR_PLIB, "sanity check fail");
1824   if (nMultiRoots) *nMultiRoots = nmroots;
1825   PetscFunctionReturn(PETSC_SUCCESS);
1826 }
1827 
1828 /*@C
1829   PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with `PetscSFGatherEnd()`
1830 
1831   Collective
1832 
1833   Input Parameters:
1834 + sf       - star forest
1835 . unit     - data type
1836 - leafdata - leaf data to gather to roots
1837 
1838   Output Parameter:
1839 . multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1840 
1841   Level: intermediate
1842 
1843 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterBegin()`
1844 @*/
1845 PetscErrorCode PetscSFGatherBegin(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1846 {
1847   PetscSF multi = NULL;
1848 
1849   PetscFunctionBegin;
1850   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1851   PetscCall(PetscSFSetUp(sf));
1852   PetscCall(PetscSFGetMultiSF(sf, &multi));
1853   PetscCall(PetscSFReduceBegin(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1854   PetscFunctionReturn(PETSC_SUCCESS);
1855 }
1856 
1857 /*@C
1858   PetscSFGatherEnd - ends pointwise gather operation that was started with `PetscSFGatherBegin()`
1859 
1860   Collective
1861 
1862   Input Parameters:
1863 + sf       - star forest
1864 . unit     - data type
1865 - leafdata - leaf data to gather to roots
1866 
1867   Output Parameter:
1868 . multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1869 
1870   Level: intermediate
1871 
1872 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterEnd()`
1873 @*/
1874 PetscErrorCode PetscSFGatherEnd(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *multirootdata)
1875 {
1876   PetscSF multi = NULL;
1877 
1878   PetscFunctionBegin;
1879   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1880   PetscCall(PetscSFGetMultiSF(sf, &multi));
1881   PetscCall(PetscSFReduceEnd(multi, unit, leafdata, multirootdata, MPI_REPLACE));
1882   PetscFunctionReturn(PETSC_SUCCESS);
1883 }
1884 
1885 /*@C
1886   PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with `PetscSFScatterEnd()`
1887 
1888   Collective
1889 
1890   Input Parameters:
1891 + sf            - star forest
1892 . unit          - data type
1893 - multirootdata - root buffer to send to each leaf, one unit of data per leaf
1894 
1895   Output Parameter:
1896 . leafdata - leaf data to be update with personal data from each respective root
1897 
1898   Level: intermediate
1899 
1900 .seealso: `PetscSF`, `PetscSFComputeDegreeBegin()`, `PetscSFScatterEnd()`
1901 @*/
1902 PetscErrorCode PetscSFScatterBegin(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1903 {
1904   PetscSF multi = NULL;
1905 
1906   PetscFunctionBegin;
1907   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1908   PetscCall(PetscSFSetUp(sf));
1909   PetscCall(PetscSFGetMultiSF(sf, &multi));
1910   PetscCall(PetscSFBcastBegin(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1911   PetscFunctionReturn(PETSC_SUCCESS);
1912 }
1913 
1914 /*@C
1915   PetscSFScatterEnd - ends pointwise scatter operation that was started with `PetscSFScatterBegin()`
1916 
1917   Collective
1918 
1919   Input Parameters:
1920 + sf            - star forest
1921 . unit          - data type
1922 - multirootdata - root buffer to send to each leaf, one unit of data per leaf
1923 
1924   Output Parameter:
1925 . leafdata - leaf data to be update with personal data from each respective root
1926 
1927   Level: intermediate
1928 
1929 .seealso: `PetscSF`, `PetscSFComputeDegreeEnd()`, `PetscSFScatterBegin()`
1930 @*/
1931 PetscErrorCode PetscSFScatterEnd(PetscSF sf, MPI_Datatype unit, const void *multirootdata, void *leafdata)
1932 {
1933   PetscSF multi = NULL;
1934 
1935   PetscFunctionBegin;
1936   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
1937   PetscCall(PetscSFGetMultiSF(sf, &multi));
1938   PetscCall(PetscSFBcastEnd(multi, unit, multirootdata, leafdata, MPI_REPLACE));
1939   PetscFunctionReturn(PETSC_SUCCESS);
1940 }
1941 
1942 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1943 {
1944   PetscInt        i, n, nleaves;
1945   const PetscInt *ilocal = NULL;
1946   PetscHSetI      seen;
1947 
1948   PetscFunctionBegin;
1949   if (PetscDefined(USE_DEBUG)) {
1950     PetscCall(PetscSFGetGraph(sf, NULL, &nleaves, &ilocal, NULL));
1951     PetscCall(PetscHSetICreate(&seen));
1952     for (i = 0; i < nleaves; i++) {
1953       const PetscInt leaf = ilocal ? ilocal[i] : i;
1954       PetscCall(PetscHSetIAdd(seen, leaf));
1955     }
1956     PetscCall(PetscHSetIGetSize(seen, &n));
1957     PetscCheck(n == nleaves, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Provided leaves have repeated values: all leaves must be unique");
1958     PetscCall(PetscHSetIDestroy(&seen));
1959   }
1960   PetscFunctionReturn(PETSC_SUCCESS);
1961 }
1962 
1963 /*@
1964   PetscSFCompose - Compose a new `PetscSF` by putting the second `PetscSF` under the first one in a top (roots) down (leaves) view
1965 
1966   Input Parameters:
1967 + sfA - The first `PetscSF`
1968 - sfB - The second `PetscSF`
1969 
1970   Output Parameter:
1971 . sfBA - The composite `PetscSF`
1972 
1973   Level: developer
1974 
1975   Notes:
1976   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
1977   forests, i.e. the same leaf is not connected with different roots.
1978 
1979   `sfA`'s leaf space and `sfB`'s root space might be partially overlapped. The composition builds
1980   a graph with `sfA`'s roots and `sfB`'s leaves only when there is a path between them. Unconnected
1981   nodes (roots or leaves) are not in `sfBA`. Doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on the new `PetscSF` is equivalent to doing a
1982   `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on `sfA`, then a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on `sfB`, on connected nodes.
1983 
1984 .seealso: `PetscSF`, `PetscSFComposeInverse()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`
1985 @*/
1986 PetscErrorCode PetscSFCompose(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
1987 {
1988   const PetscSFNode *remotePointsA, *remotePointsB;
1989   PetscSFNode       *remotePointsBA = NULL, *reorderedRemotePointsA = NULL, *leafdataB;
1990   const PetscInt    *localPointsA, *localPointsB;
1991   PetscInt          *localPointsBA;
1992   PetscInt           i, numRootsA, numLeavesA, numRootsB, numLeavesB, minleaf, maxleaf, numLeavesBA;
1993   PetscBool          denseB;
1994 
1995   PetscFunctionBegin;
1996   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
1997   PetscSFCheckGraphSet(sfA, 1);
1998   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
1999   PetscSFCheckGraphSet(sfB, 2);
2000   PetscCheckSameComm(sfA, 1, sfB, 2);
2001   PetscAssertPointer(sfBA, 3);
2002   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
2003   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
2004 
2005   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
2006   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
2007   /* Make sure that PetscSFBcast{Begin, End}(sfB, ...) works with root data of size
2008      numRootsB; otherwise, garbage will be broadcasted.
2009      Example (comm size = 1):
2010      sfA: 0 <- (0, 0)
2011      sfB: 100 <- (0, 0)
2012           101 <- (0, 1)
2013      Here, we have remotePointsA = [(0, 0)], but for remotePointsA to be a valid tartget
2014      of sfB, it has to be recasted as [(0, 0), (-1, -1)] so that points 100 and 101 would
2015      receive (0, 0) and (-1, -1), respectively, when PetscSFBcast(sfB, ...) is called on
2016      remotePointsA; if not recasted, point 101 would receive a garbage value.             */
2017   PetscCall(PetscMalloc1(numRootsB, &reorderedRemotePointsA));
2018   for (i = 0; i < numRootsB; i++) {
2019     reorderedRemotePointsA[i].rank  = -1;
2020     reorderedRemotePointsA[i].index = -1;
2021   }
2022   for (i = 0; i < numLeavesA; i++) {
2023     PetscInt localp = localPointsA ? localPointsA[i] : i;
2024 
2025     if (localp >= numRootsB) continue;
2026     reorderedRemotePointsA[localp] = remotePointsA[i];
2027   }
2028   remotePointsA = reorderedRemotePointsA;
2029   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2030   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &leafdataB));
2031   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2032     leafdataB[i].rank  = -1;
2033     leafdataB[i].index = -1;
2034   }
2035   PetscCall(PetscSFBcastBegin(sfB, MPIU_2INT, remotePointsA, leafdataB - minleaf, MPI_REPLACE));
2036   PetscCall(PetscSFBcastEnd(sfB, MPIU_2INT, remotePointsA, leafdataB - minleaf, MPI_REPLACE));
2037   PetscCall(PetscFree(reorderedRemotePointsA));
2038 
2039   denseB = (PetscBool)!localPointsB;
2040   for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2041     if (leafdataB[localPointsB ? localPointsB[i] - minleaf : i].rank == -1) denseB = PETSC_FALSE;
2042     else numLeavesBA++;
2043   }
2044   if (denseB) {
2045     localPointsBA  = NULL;
2046     remotePointsBA = leafdataB;
2047   } else {
2048     PetscCall(PetscMalloc1(numLeavesBA, &localPointsBA));
2049     PetscCall(PetscMalloc1(numLeavesBA, &remotePointsBA));
2050     for (i = 0, numLeavesBA = 0; i < numLeavesB; i++) {
2051       const PetscInt l = localPointsB ? localPointsB[i] : i;
2052 
2053       if (leafdataB[l - minleaf].rank == -1) continue;
2054       remotePointsBA[numLeavesBA] = leafdataB[l - minleaf];
2055       localPointsBA[numLeavesBA]  = l;
2056       numLeavesBA++;
2057     }
2058     PetscCall(PetscFree(leafdataB));
2059   }
2060   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2061   PetscCall(PetscSFSetFromOptions(*sfBA));
2062   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2063   PetscFunctionReturn(PETSC_SUCCESS);
2064 }
2065 
2066 /*@
2067   PetscSFComposeInverse - Compose a new `PetscSF` by putting the inverse of the second `PetscSF` under the first one
2068 
2069   Input Parameters:
2070 + sfA - The first `PetscSF`
2071 - sfB - The second `PetscSF`
2072 
2073   Output Parameter:
2074 . sfBA - The composite `PetscSF`.
2075 
2076   Level: developer
2077 
2078   Notes:
2079   Currently, the two `PetscSF`s must be defined on congruent communicators and they must be true star
2080   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
2081   second `PetscSF` must have a degree of 1, i.e., no roots have more than one leaf connected.
2082 
2083   `sfA`'s leaf space and `sfB`'s leaf space might be partially overlapped. The composition builds
2084   a graph with `sfA`'s roots and `sfB`'s roots only when there is a path between them. Unconnected
2085   roots are not in `sfBA`. Doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()` on the new `PetscSF` is equivalent to doing a `PetscSFBcastBegin()`/`PetscSFBcastEnd()`
2086   on `sfA`, then
2087   a `PetscSFReduceBegin()`/`PetscSFReduceEnd()` on `sfB`, on connected roots.
2088 
2089 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFCreateInverseSF()`
2090 @*/
2091 PetscErrorCode PetscSFComposeInverse(PetscSF sfA, PetscSF sfB, PetscSF *sfBA)
2092 {
2093   const PetscSFNode *remotePointsA, *remotePointsB;
2094   PetscSFNode       *remotePointsBA;
2095   const PetscInt    *localPointsA, *localPointsB;
2096   PetscSFNode       *reorderedRemotePointsA = NULL;
2097   PetscInt           i, numRootsA, numLeavesA, numLeavesBA, numRootsB, numLeavesB, minleaf, maxleaf, *localPointsBA;
2098   MPI_Op             op;
2099 #if defined(PETSC_USE_64BIT_INDICES)
2100   PetscBool iswin;
2101 #endif
2102 
2103   PetscFunctionBegin;
2104   PetscValidHeaderSpecific(sfA, PETSCSF_CLASSID, 1);
2105   PetscSFCheckGraphSet(sfA, 1);
2106   PetscValidHeaderSpecific(sfB, PETSCSF_CLASSID, 2);
2107   PetscSFCheckGraphSet(sfB, 2);
2108   PetscCheckSameComm(sfA, 1, sfB, 2);
2109   PetscAssertPointer(sfBA, 3);
2110   PetscCall(PetscSFCheckLeavesUnique_Private(sfA));
2111   PetscCall(PetscSFCheckLeavesUnique_Private(sfB));
2112 
2113   PetscCall(PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA));
2114   PetscCall(PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB));
2115 
2116   /* TODO: Check roots of sfB have degree of 1 */
2117   /* Once we implement it, we can replace the MPI_MAXLOC
2118      with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
2119      We use MPI_MAXLOC only to have a deterministic output from this routine if
2120      the root condition is not meet.
2121    */
2122   op = MPI_MAXLOC;
2123 #if defined(PETSC_USE_64BIT_INDICES)
2124   /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
2125   PetscCall(PetscObjectTypeCompare((PetscObject)sfB, PETSCSFWINDOW, &iswin));
2126   if (iswin) op = MPI_REPLACE;
2127 #endif
2128 
2129   PetscCall(PetscSFGetLeafRange(sfB, &minleaf, &maxleaf));
2130   PetscCall(PetscMalloc1(maxleaf - minleaf + 1, &reorderedRemotePointsA));
2131   for (i = 0; i < maxleaf - minleaf + 1; i++) {
2132     reorderedRemotePointsA[i].rank  = -1;
2133     reorderedRemotePointsA[i].index = -1;
2134   }
2135   if (localPointsA) {
2136     for (i = 0; i < numLeavesA; i++) {
2137       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
2138       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
2139     }
2140   } else {
2141     for (i = 0; i < numLeavesA; i++) {
2142       if (i > maxleaf || i < minleaf) continue;
2143       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
2144     }
2145   }
2146 
2147   PetscCall(PetscMalloc1(numRootsB, &localPointsBA));
2148   PetscCall(PetscMalloc1(numRootsB, &remotePointsBA));
2149   for (i = 0; i < numRootsB; i++) {
2150     remotePointsBA[i].rank  = -1;
2151     remotePointsBA[i].index = -1;
2152   }
2153 
2154   PetscCall(PetscSFReduceBegin(sfB, MPIU_2INT, reorderedRemotePointsA - minleaf, remotePointsBA, op));
2155   PetscCall(PetscSFReduceEnd(sfB, MPIU_2INT, reorderedRemotePointsA - minleaf, remotePointsBA, op));
2156   PetscCall(PetscFree(reorderedRemotePointsA));
2157   for (i = 0, numLeavesBA = 0; i < numRootsB; i++) {
2158     if (remotePointsBA[i].rank == -1) continue;
2159     remotePointsBA[numLeavesBA].rank  = remotePointsBA[i].rank;
2160     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
2161     localPointsBA[numLeavesBA]        = i;
2162     numLeavesBA++;
2163   }
2164   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfA), sfBA));
2165   PetscCall(PetscSFSetFromOptions(*sfBA));
2166   PetscCall(PetscSFSetGraph(*sfBA, numRootsA, numLeavesBA, localPointsBA, PETSC_OWN_POINTER, remotePointsBA, PETSC_OWN_POINTER));
2167   PetscFunctionReturn(PETSC_SUCCESS);
2168 }
2169 
2170 /*
2171   PetscSFCreateLocalSF_Private - Creates a local `PetscSF` that only has intra-process edges of the global `PetscSF`
2172 
2173   Input Parameter:
2174 . sf - The global `PetscSF`
2175 
2176   Output Parameter:
2177 . out - The local `PetscSF`
2178 
2179 .seealso: `PetscSF`, `PetscSFCreate()`
2180  */
2181 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf, PetscSF *out)
2182 {
2183   MPI_Comm           comm;
2184   PetscMPIInt        myrank;
2185   const PetscInt    *ilocal;
2186   const PetscSFNode *iremote;
2187   PetscInt           i, j, nroots, nleaves, lnleaves, *lilocal;
2188   PetscSFNode       *liremote;
2189   PetscSF            lsf;
2190 
2191   PetscFunctionBegin;
2192   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
2193   if (sf->ops->CreateLocalSF) PetscUseTypeMethod(sf, CreateLocalSF, out);
2194   else {
2195     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2196     PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
2197     PetscCallMPI(MPI_Comm_rank(comm, &myrank));
2198 
2199     /* Find out local edges and build a local SF */
2200     PetscCall(PetscSFGetGraph(sf, &nroots, &nleaves, &ilocal, &iremote));
2201     for (i = lnleaves = 0; i < nleaves; i++) {
2202       if (iremote[i].rank == (PetscInt)myrank) lnleaves++;
2203     }
2204     PetscCall(PetscMalloc1(lnleaves, &lilocal));
2205     PetscCall(PetscMalloc1(lnleaves, &liremote));
2206 
2207     for (i = j = 0; i < nleaves; i++) {
2208       if (iremote[i].rank == (PetscInt)myrank) {
2209         lilocal[j]        = ilocal ? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2210         liremote[j].rank  = 0;                      /* rank in PETSC_COMM_SELF */
2211         liremote[j].index = iremote[i].index;
2212         j++;
2213       }
2214     }
2215     PetscCall(PetscSFCreate(PETSC_COMM_SELF, &lsf));
2216     PetscCall(PetscSFSetFromOptions(lsf));
2217     PetscCall(PetscSFSetGraph(lsf, nroots, lnleaves, lilocal, PETSC_OWN_POINTER, liremote, PETSC_OWN_POINTER));
2218     PetscCall(PetscSFSetUp(lsf));
2219     *out = lsf;
2220   }
2221   PetscFunctionReturn(PETSC_SUCCESS);
2222 }
2223 
/* Similar to PetscSFBcast, but only Bcast to leaves on rank 0.
   Unlike the public PetscSFBcastBegin/End pair, this routine is blocking: the
   type-specific BcastToZero method completes before it returns. */
PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata)
{
  PetscMemType rootmtype, leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscSFSetUp(sf)); /* ensure communication data structures exist before dispatch */
  PetscCall(PetscLogEventBegin(PETSCSF_BcastBegin, sf, 0, 0, 0));
  /* Detect whether root/leaf buffers live on host or device so the backend can pick the right path */
  PetscCall(PetscGetMemType(rootdata, &rootmtype));
  PetscCall(PetscGetMemType(leafdata, &leafmtype));
  PetscUseTypeMethod(sf, BcastToZero, unit, rootmtype, rootdata, leafmtype, leafdata);
  /* NOTE(review): the End call also logs PETSCSF_BcastBegin (not PETSCSF_BcastEnd);
     presumably deliberate since this private operation runs to completion inside one
     event bracket — confirm against the logging conventions used elsewhere in this file */
  PetscCall(PetscLogEventEnd(PETSCSF_BcastBegin, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2239 
2240 /*@
2241   PetscSFConcatenate - concatenate multiple `PetscSF` into one
2242 
2243   Input Parameters:
2244 + comm        - the communicator
2245 . nsfs        - the number of input `PetscSF`
2246 . sfs         - the array of input `PetscSF`
2247 . rootMode    - the root mode specifying how roots are handled
2248 - leafOffsets - the array of local leaf offsets, one for each input `PetscSF`, or `NULL` for contiguous storage
2249 
2250   Output Parameter:
2251 . newsf - The resulting `PetscSF`
2252 
2253   Level: advanced
2254 
2255   Notes:
2256   The communicator of all `PetscSF`s in `sfs` must be comm.
2257 
2258   Leaves are always concatenated locally, keeping them ordered by the input `PetscSF` index and original local order.
2259 
2260   The offsets in `leafOffsets` are added to the original leaf indices.
2261 
2262   If all input SFs use contiguous leaf storage (`ilocal` = `NULL`), `leafOffsets` can be passed as `NULL` as well.
2263   In this case, `NULL` is also passed as `ilocal` to the resulting `PetscSF`.
2264 
2265   If any input `PetscSF` has non-null `ilocal`, `leafOffsets` is needed to distinguish leaves from different input `PetscSF`s.
  In this case, the user is responsible for providing correct offsets so that the resulting leaves are unique (otherwise an error occurs).
2267 
2268   All root modes retain the essential connectivity condition.
2269   If two leaves of the same input `PetscSF` are connected (sharing the same root), they are also connected in the output `PetscSF`.
2270   Parameter `rootMode` controls how the input root spaces are combined.
2271   For `PETSCSF_CONCATENATE_ROOTMODE_SHARED`, the root space is considered the same for each input `PetscSF` (checked in debug mode)
2272   and is also the same in the output `PetscSF`.
2273   For `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, the input root spaces are taken as separate and joined.
2274   `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` joins the root spaces locally;
2275   roots of sfs[0], sfs[1], sfs[2], ... are joined on each rank separately, ordered by input `PetscSF` and original local index, and renumbered contiguously.
2276   `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL` joins the root spaces globally;
2277   roots of sfs[0], sfs[1], sfs[2], ... are joined globally, ordered by input `PetscSF` index and original global index, and renumbered contiguously;
2278   the original root ranks are ignored.
2279   For both `PETSCSF_CONCATENATE_ROOTMODE_LOCAL` and `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`,
2280   the output `PetscSF`'s root layout is such that the local number of roots is a sum of the input `PetscSF`'s local numbers of roots on each rank
2281   to keep the load balancing.
2282   However, for `PETSCSF_CONCATENATE_ROOTMODE_GLOBAL`, roots can move to different ranks.
2283 
2284   Example:
2285   We can use src/vec/is/sf/tests/ex18.c to compare the root modes. By running
2286 .vb
2287   make -C $PETSC_DIR/src/vec/is/sf/tests ex18
2288   for m in {local,global,shared}; do
2289     mpirun -n 2 $PETSC_DIR/src/vec/is/sf/tests/ex18 -nsfs 2 -n 2 -root_mode $m -sf_view
2290   done
2291 .ve
2292   we generate two identical `PetscSF`s sf_0 and sf_1,
2293 .vb
2294   PetscSF Object: sf_0 2 MPI processes
2295     type: basic
2296     rank #leaves #roots
2297     [ 0]       4      2
2298     [ 1]       4      2
2299     leaves      roots       roots in global numbering
2300     ( 0,  0) <- ( 0,  0)  =   0
2301     ( 0,  1) <- ( 0,  1)  =   1
2302     ( 0,  2) <- ( 1,  0)  =   2
2303     ( 0,  3) <- ( 1,  1)  =   3
2304     ( 1,  0) <- ( 0,  0)  =   0
2305     ( 1,  1) <- ( 0,  1)  =   1
2306     ( 1,  2) <- ( 1,  0)  =   2
2307     ( 1,  3) <- ( 1,  1)  =   3
2308 .ve
2309   and pass them to `PetscSFConcatenate()` along with different choices of `rootMode`, yielding different result_sf\:
2310 .vb
2311   rootMode = local:
2312   PetscSF Object: result_sf 2 MPI processes
2313     type: basic
2314     rank #leaves #roots
2315     [ 0]       8      4
2316     [ 1]       8      4
2317     leaves      roots       roots in global numbering
2318     ( 0,  0) <- ( 0,  0)  =   0
2319     ( 0,  1) <- ( 0,  1)  =   1
2320     ( 0,  2) <- ( 1,  0)  =   4
2321     ( 0,  3) <- ( 1,  1)  =   5
2322     ( 0,  4) <- ( 0,  2)  =   2
2323     ( 0,  5) <- ( 0,  3)  =   3
2324     ( 0,  6) <- ( 1,  2)  =   6
2325     ( 0,  7) <- ( 1,  3)  =   7
2326     ( 1,  0) <- ( 0,  0)  =   0
2327     ( 1,  1) <- ( 0,  1)  =   1
2328     ( 1,  2) <- ( 1,  0)  =   4
2329     ( 1,  3) <- ( 1,  1)  =   5
2330     ( 1,  4) <- ( 0,  2)  =   2
2331     ( 1,  5) <- ( 0,  3)  =   3
2332     ( 1,  6) <- ( 1,  2)  =   6
2333     ( 1,  7) <- ( 1,  3)  =   7
2334 
2335   rootMode = global:
2336   PetscSF Object: result_sf 2 MPI processes
2337     type: basic
2338     rank #leaves #roots
2339     [ 0]       8      4
2340     [ 1]       8      4
2341     leaves      roots       roots in global numbering
2342     ( 0,  0) <- ( 0,  0)  =   0
2343     ( 0,  1) <- ( 0,  1)  =   1
2344     ( 0,  2) <- ( 0,  2)  =   2
2345     ( 0,  3) <- ( 0,  3)  =   3
2346     ( 0,  4) <- ( 1,  0)  =   4
2347     ( 0,  5) <- ( 1,  1)  =   5
2348     ( 0,  6) <- ( 1,  2)  =   6
2349     ( 0,  7) <- ( 1,  3)  =   7
2350     ( 1,  0) <- ( 0,  0)  =   0
2351     ( 1,  1) <- ( 0,  1)  =   1
2352     ( 1,  2) <- ( 0,  2)  =   2
2353     ( 1,  3) <- ( 0,  3)  =   3
2354     ( 1,  4) <- ( 1,  0)  =   4
2355     ( 1,  5) <- ( 1,  1)  =   5
2356     ( 1,  6) <- ( 1,  2)  =   6
2357     ( 1,  7) <- ( 1,  3)  =   7
2358 
2359   rootMode = shared:
2360   PetscSF Object: result_sf 2 MPI processes
2361     type: basic
2362     rank #leaves #roots
2363     [ 0]       8      2
2364     [ 1]       8      2
2365     leaves      roots       roots in global numbering
2366     ( 0,  0) <- ( 0,  0)  =   0
2367     ( 0,  1) <- ( 0,  1)  =   1
2368     ( 0,  2) <- ( 1,  0)  =   2
2369     ( 0,  3) <- ( 1,  1)  =   3
2370     ( 0,  4) <- ( 0,  0)  =   0
2371     ( 0,  5) <- ( 0,  1)  =   1
2372     ( 0,  6) <- ( 1,  0)  =   2
2373     ( 0,  7) <- ( 1,  1)  =   3
2374     ( 1,  0) <- ( 0,  0)  =   0
2375     ( 1,  1) <- ( 0,  1)  =   1
2376     ( 1,  2) <- ( 1,  0)  =   2
2377     ( 1,  3) <- ( 1,  1)  =   3
2378     ( 1,  4) <- ( 0,  0)  =   0
2379     ( 1,  5) <- ( 0,  1)  =   1
2380     ( 1,  6) <- ( 1,  0)  =   2
2381     ( 1,  7) <- ( 1,  1)  =   3
2382 .ve
2383 
2384 .seealso: `PetscSF`, `PetscSFCompose()`, `PetscSFGetGraph()`, `PetscSFSetGraph()`, `PetscSFConcatenateRootMode`
2385 @*/
PetscErrorCode PetscSFConcatenate(MPI_Comm comm, PetscInt nsfs, PetscSF sfs[], PetscSFConcatenateRootMode rootMode, PetscInt leafOffsets[], PetscSF *newsf)
{
  PetscInt     i, s, nLeaves, nRoots;
  PetscInt    *leafArrayOffsets; /* prefix sums of the input SFs' leaf counts; leafArrayOffsets[s] is where sfs[s]'s leaves start in the concatenated arrays */
  PetscInt    *ilocal_new;       /* concatenated leaf indices for the output SF, or NULL for contiguous storage */
  PetscSFNode *iremote_new;      /* concatenated remote roots (used for LOCAL/SHARED root modes) */
  PetscBool    all_ilocal_null = PETSC_FALSE;
  PetscLayout  glayout         = NULL; /* combined root layout (GLOBAL root mode only) */
  PetscInt    *gremote         = NULL; /* global root indices per leaf (GLOBAL root mode only) */
  PetscMPIInt  rank, size;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscSF dummy; /* just to have a PetscObject on comm for input validation */

    PetscCall(PetscSFCreate(comm, &dummy));
    PetscValidLogicalCollectiveInt(dummy, nsfs, 2);
    PetscAssertPointer(sfs, 3);
    for (i = 0; i < nsfs; i++) {
      PetscValidHeaderSpecific(sfs[i], PETSCSF_CLASSID, 3);
      PetscCheckSameComm(dummy, 1, sfs[i], 3);
    }
    PetscValidLogicalCollectiveEnum(dummy, rootMode, 4);
    if (leafOffsets) PetscAssertPointer(leafOffsets, 5);
    PetscAssertPointer(newsf, 6);
    PetscCall(PetscSFDestroy(&dummy));
  }
  /* Trivial case: concatenating zero SFs yields an empty SF */
  if (!nsfs) {
    PetscCall(PetscSFCreate(comm, newsf));
    PetscCall(PetscSFSetGraph(*newsf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));

  /* Calculate leaf array offsets */
  PetscCall(PetscMalloc1(nsfs + 1, &leafArrayOffsets));
  leafArrayOffsets[0] = 0;
  for (s = 0; s < nsfs; s++) {
    PetscInt nl;

    PetscCall(PetscSFGetGraph(sfs[s], NULL, &nl, NULL, NULL));
    leafArrayOffsets[s + 1] = leafArrayOffsets[s] + nl;
  }
  nLeaves = leafArrayOffsets[nsfs];

  /* Calculate number of roots */
  switch (rootMode) {
  case PETSCSF_CONCATENATE_ROOTMODE_SHARED: {
    /* All inputs share one root space; take sfs[0]'s count and (in debug) check the others agree */
    PetscCall(PetscSFGetGraph(sfs[0], &nRoots, NULL, NULL, NULL));
    if (PetscDefined(USE_DEBUG)) {
      for (s = 1; s < nsfs; s++) {
        PetscInt nr;

        PetscCall(PetscSFGetGraph(sfs[s], &nr, NULL, NULL, NULL));
        PetscCheck(nr == nRoots, comm, PETSC_ERR_ARG_SIZ, "rootMode = %s but sfs[%" PetscInt_FMT "] has a different number of roots (%" PetscInt_FMT ") than sfs[0] (%" PetscInt_FMT ")", PetscSFConcatenateRootModes[rootMode], s, nr, nRoots);
      }
    }
  } break;
  case PETSCSF_CONCATENATE_ROOTMODE_GLOBAL: {
    /* Calculate also global layout in this case */
    PetscInt    *nls;
    PetscLayout *lts;
    PetscInt   **inds;
    PetscInt     j;
    PetscInt     rootOffset = 0;

    PetscCall(PetscCalloc3(nsfs, &lts, nsfs, &nls, nsfs, &inds));
    PetscCall(PetscLayoutCreate(comm, &glayout));
    /* Combined layout: local/global sizes are the sums of the input layouts' sizes */
    glayout->bs = 1;
    glayout->n  = 0;
    glayout->N  = 0;
    for (s = 0; s < nsfs; s++) {
      PetscCall(PetscSFGetGraphLayout(sfs[s], &lts[s], &nls[s], NULL, &inds[s]));
      glayout->n += lts[s]->n;
      glayout->N += lts[s]->N;
    }
    PetscCall(PetscLayoutSetUp(glayout));
    /* Shift each input SF's global root indices by the global sizes of the preceding SFs */
    PetscCall(PetscMalloc1(nLeaves, &gremote));
    for (s = 0, j = 0; s < nsfs; s++) {
      for (i = 0; i < nls[s]; i++, j++) gremote[j] = inds[s][i] + rootOffset;
      rootOffset += lts[s]->N;
      PetscCall(PetscLayoutDestroy(&lts[s]));
      PetscCall(PetscFree(inds[s]));
    }
    PetscCall(PetscFree3(lts, nls, inds));
    nRoots = glayout->N;
  } break;
  case PETSCSF_CONCATENATE_ROOTMODE_LOCAL:
    /* nRoots calculated later in this case */
    break;
  default:
    SETERRQ(comm, PETSC_ERR_ARG_WRONG, "Invalid PetscSFConcatenateRootMode %d", rootMode);
  }

  /* leafOffsets may be omitted only when every input SF uses contiguous leaf storage */
  if (!leafOffsets) {
    all_ilocal_null = PETSC_TRUE;
    for (s = 0; s < nsfs; s++) {
      const PetscInt *ilocal;

      PetscCall(PetscSFGetGraph(sfs[s], NULL, NULL, &ilocal, NULL));
      if (ilocal) {
        all_ilocal_null = PETSC_FALSE;
        break;
      }
    }
    PetscCheck(all_ilocal_null, PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "leafOffsets can be passed as NULL only if all SFs have ilocal = NULL");
  }

  /* Renumber and concatenate local leaves */
  ilocal_new = NULL; /* stays NULL (contiguous storage) when all inputs have ilocal = NULL */
  if (!all_ilocal_null) {
    PetscCall(PetscMalloc1(nLeaves, &ilocal_new));
    for (i = 0; i < nLeaves; i++) ilocal_new[i] = -1;
    for (s = 0; s < nsfs; s++) {
      const PetscInt *ilocal;
      PetscInt       *ilocal_l = &ilocal_new[leafArrayOffsets[s]];
      PetscInt        i, nleaves_l;

      /* Shift each input SF's leaf indices by the caller-provided offset */
      PetscCall(PetscSFGetGraph(sfs[s], NULL, &nleaves_l, &ilocal, NULL));
      for (i = 0; i < nleaves_l; i++) ilocal_l[i] = (ilocal ? ilocal[i] : i) + leafOffsets[s];
    }
  }

  /* Renumber and concatenate remote roots */
  if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL || rootMode == PETSCSF_CONCATENATE_ROOTMODE_SHARED) {
    PetscInt rootOffset = 0;

    PetscCall(PetscMalloc1(nLeaves, &iremote_new));
    for (i = 0; i < nLeaves; i++) {
      iremote_new[i].rank  = -1;
      iremote_new[i].index = -1;
    }
    for (s = 0; s < nsfs; s++) {
      PetscInt           i, nl, nr;
      PetscSF            tmp_sf;
      const PetscSFNode *iremote;
      PetscSFNode       *tmp_rootdata;
      PetscSFNode       *tmp_leafdata = &iremote_new[leafArrayOffsets[s]];

      PetscCall(PetscSFGetGraph(sfs[s], &nr, &nl, NULL, &iremote));
      PetscCall(PetscSFCreate(comm, &tmp_sf));
      /* create helper SF with contiguous leaves */
      PetscCall(PetscSFSetGraph(tmp_sf, nr, nl, NULL, PETSC_USE_POINTER, (PetscSFNode *)iremote, PETSC_COPY_VALUES));
      PetscCall(PetscSFSetUp(tmp_sf));
      /* Each root advertises its (possibly renumbered) identity; the bcast delivers
         the new (rank, index) pair of the owning root straight into iremote_new */
      PetscCall(PetscMalloc1(nr, &tmp_rootdata));
      if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) {
        /* LOCAL: root spaces are stacked per rank, so shift indices by the running local offset */
        for (i = 0; i < nr; i++) {
          tmp_rootdata[i].index = i + rootOffset;
          tmp_rootdata[i].rank  = (PetscInt)rank;
        }
        rootOffset += nr;
      } else {
        /* SHARED: root space is the same for every input SF, keep original indices */
        for (i = 0; i < nr; i++) {
          tmp_rootdata[i].index = i;
          tmp_rootdata[i].rank  = (PetscInt)rank;
        }
      }
      PetscCall(PetscSFBcastBegin(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(tmp_sf, MPIU_2INT, tmp_rootdata, tmp_leafdata, MPI_REPLACE));
      PetscCall(PetscSFDestroy(&tmp_sf));
      PetscCall(PetscFree(tmp_rootdata));
    }
    if (rootMode == PETSCSF_CONCATENATE_ROOTMODE_LOCAL) nRoots = rootOffset; // else nRoots already calculated above

    /* Build the new SF */
    PetscCall(PetscSFCreate(comm, newsf));
    PetscCall(PetscSFSetGraph(*newsf, nRoots, nLeaves, ilocal_new, PETSC_OWN_POINTER, iremote_new, PETSC_OWN_POINTER));
  } else {
    /* Build the new SF */
    PetscCall(PetscSFCreate(comm, newsf));
    PetscCall(PetscSFSetGraphLayout(*newsf, glayout, nLeaves, ilocal_new, PETSC_OWN_POINTER, gremote));
  }
  PetscCall(PetscSFSetUp(*newsf));
  PetscCall(PetscSFViewFromOptions(*newsf, NULL, "-sf_concat_view"));
  PetscCall(PetscLayoutDestroy(&glayout));
  PetscCall(PetscFree(gremote));
  PetscCall(PetscFree(leafArrayOffsets));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2566