xref: /petsc/src/vec/is/sf/interface/sf.c (revision bfcc362789b558bcb4d7977f28a2c9d2d6233b89)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6   #include <cuda_runtime.h>
7 #endif
8 
9 #if defined(PETSC_HAVE_HIP)
10   #include <hip/hip_runtime.h>
11 #endif
12 
#if defined(PETSC_USE_DEBUG)
/* Debug builds: raise an error if the star forest's graph has not been set via
   PetscSFSetGraph()/PetscSFSetGraphWithPattern(); arg is the position of sf in
   the caller's argument list, reported in the error message */
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
/* Optimized builds: the check compiles away to nothing */
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif

/* Human-readable names for PetscSFDuplicateOption; the trailing entries follow
   the PETSc enum-string convention (enum type name, value prefix, NULL) */
const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",NULL};
23 
/* Classify a pointer as host or device memory.

   data - address to classify (may be NULL, which classifies as host)
   type - output; PETSC_MEMTYPE_HOST unless a device runtime positively
          identifies the address as device memory

   Queries the CUDA and/or HIP runtime only when the corresponding runtime has
   been initialized, so this is safe to call before device setup. */
PETSC_STATIC_INLINE PetscErrorCode PetscGetMemType(const void *data,PetscMemType *type)
{
  PetscFunctionBegin;
  PetscValidPointer(type,2);
  *type = PETSC_MEMTYPE_HOST; /* default: host, unless proven otherwise below */
#if defined(PETSC_HAVE_CUDA)
  if (PetscCUDAInitialized && data) {
    cudaError_t                  cerr;
    struct cudaPointerAttributes attr;
    enum cudaMemoryType          mtype;
    cerr = cudaPointerGetAttributes(&attr,data); /* Do not check error since before CUDA 11.0, passing a host pointer returns cudaErrorInvalidValue */
    cudaGetLastError(); /* Reset the last error */
    /* CUDA 10 renamed cudaPointerAttributes.memoryType to .type */
    #if (CUDART_VERSION < 10000)
      mtype = attr.memoryType;
    #else
      mtype = attr.type;
    #endif
    if (cerr == cudaSuccess && mtype == cudaMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif

#if defined(PETSC_HAVE_HIP)
  if (PetscHIPInitialized && data) {
    hipError_t                   cerr;
    struct hipPointerAttribute_t attr;
    enum hipMemoryType           mtype;
    cerr = hipPointerGetAttributes(&attr,data); /* error deliberately not checked; mirrors the CUDA branch above */
    hipGetLastError(); /* Reset the last error */
    mtype = attr.memoryType;
    if (cerr == hipSuccess && mtype == hipMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif
  PetscFunctionReturn(0);
}
58 
59 /*@
60    PetscSFCreate - create a star forest communication context
61 
62    Collective
63 
64    Input Arguments:
65 .  comm - communicator on which the star forest will operate
66 
67    Output Arguments:
68 .  sf - new star forest context
69 
70    Options Database Keys:
71 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
72 .  -sf_type window    -Use MPI-3 one-sided window for communication
73 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
74 
75    Level: intermediate
76 
77    Notes:
   When one knows the communication graph is one of the predefined graphs, such as MPI_Alltoall, MPI_Allgatherv,
79    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
80    SFs are optimized and they have better performance than general SFs.
81 
82 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
83 @*/
84 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
85 {
86   PetscErrorCode ierr;
87   PetscSF        b;
88 
89   PetscFunctionBegin;
90   PetscValidPointer(sf,2);
91   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
92 
93   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
94 
95   b->nroots    = -1;
96   b->nleaves   = -1;
97   b->minleaf   = PETSC_MAX_INT;
98   b->maxleaf   = PETSC_MIN_INT;
99   b->nranks    = -1;
100   b->rankorder = PETSC_TRUE;
101   b->ingroup   = MPI_GROUP_NULL;
102   b->outgroup  = MPI_GROUP_NULL;
103   b->graphset  = PETSC_FALSE;
104 #if defined(PETSC_HAVE_DEVICE)
105   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
106   b->use_stream_aware_mpi = PETSC_FALSE;
107   b->use_default_stream   = PETSC_TRUE; /* The assumption is true for PETSc internal use of SF */
108   /* Set the default */
109   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
110     b->backend = PETSCSF_BACKEND_KOKKOS;
111   #elif defined(PETSC_HAVE_CUDA)
112     b->backend = PETSCSF_BACKEND_CUDA;
113   #elif defined(PETSC_HAVE_HIP)
114     b->backend = PETSCSF_BACKEND_HIP;
115   #endif
116 #endif
117   b->vscat.from_n = -1;
118   b->vscat.to_n   = -1;
119   b->vscat.unit   = MPIU_SCALAR;
120  *sf = b;
121   PetscFunctionReturn(0);
122 }
123 
124 /*@
125    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
126 
127    Collective
128 
129    Input Arguments:
130 .  sf - star forest
131 
132    Level: advanced
133 
134 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
135 @*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the type-specific implementation release its communication state first */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Return the graph-size fields to their PetscSFCreate() defaults */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  /* Only the *_alloc pointers are owned; mine/remote may alias user arrays
     (PETSC_USE_POINTER in PetscSFSetGraph), so they were just NULLed above */
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
  /* free both device-side copies of rmine */
  for (PetscInt i=0; i<2; i++) {ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,sf->rmine_d[i]);CHKERRQ(ierr);}
#endif
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRMPI(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRMPI(ierr);}
  /* NOTE(review): presumably the multi-SF holds a back-pointer to itself that
     must be cleared before it is destroyed — confirm against sfimpl.h */
  if (sf->multi) sf->multi->multi = NULL;
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
167 
168 /*@C
169    PetscSFSetType - Set the PetscSF communication implementation
170 
171    Collective on PetscSF
172 
173    Input Parameters:
174 +  sf - the PetscSF context
175 -  type - a known method
176 
177    Options Database Key:
178 .  -sf_type <type> - Sets the method; use -help for a list
179    of available methods (for instance, window, basic, neighbor)
180 
181    Notes:
182    See "include/petscsf.h" for available methods (for instance)
183 +    PETSCSFWINDOW - MPI-2/3 one-sided
184 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
185 
186   Level: intermediate
187 
188 .seealso: PetscSFType, PetscSFCreate()
189 @*/
190 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
191 {
192   PetscErrorCode ierr,(*r)(PetscSF);
193   PetscBool      match;
194 
195   PetscFunctionBegin;
196   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
197   PetscValidCharPointer(type,2);
198 
199   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
200   if (match) PetscFunctionReturn(0);
201 
202   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
203   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
204   /* Destroy the previous PetscSF implementation context */
205   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
206   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
207   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
208   ierr = (*r)(sf);CHKERRQ(ierr);
209   PetscFunctionReturn(0);
210 }
211 
212 /*@C
213   PetscSFGetType - Get the PetscSF communication implementation
214 
215   Not Collective
216 
217   Input Parameter:
218 . sf  - the PetscSF context
219 
220   Output Parameter:
221 . type - the PetscSF type name
222 
223   Level: intermediate
224 
225 .seealso: PetscSFSetType(), PetscSFCreate()
226 @*/
227 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
228 {
229   PetscFunctionBegin;
230   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
231   PetscValidPointer(type,2);
232   *type = ((PetscObject)sf)->type_name;
233   PetscFunctionReturn(0);
234 }
235 
236 /*@
237    PetscSFDestroy - destroy star forest
238 
239    Collective
240 
241    Input Arguments:
242 .  sf - address of star forest
243 
244    Level: intermediate
245 
246 .seealso: PetscSFCreate(), PetscSFReset()
247 @*/
248 PetscErrorCode PetscSFDestroy(PetscSF *sf)
249 {
250   PetscErrorCode ierr;
251 
252   PetscFunctionBegin;
253   if (!*sf) PetscFunctionReturn(0);
254   PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
255   if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
256   ierr = PetscSFReset(*sf);CHKERRQ(ierr);
257   if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
258   ierr = PetscSFDestroy(&(*sf)->vscat.lsf);CHKERRQ(ierr);
259   if ((*sf)->vscat.bs > 1) {ierr = MPI_Type_free(&(*sf)->vscat.unit);CHKERRQ(ierr);}
260   ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
261   PetscFunctionReturn(0);
262 }
263 
264 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
265 {
266   PetscInt           i, nleaves;
267   PetscMPIInt        size;
268   const PetscInt    *ilocal;
269   const PetscSFNode *iremote;
270   PetscErrorCode     ierr;
271 
272   PetscFunctionBegin;
273   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
274   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
275   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
276   for (i = 0; i < nleaves; i++) {
277     const PetscInt rank = iremote[i].rank;
278     const PetscInt remote = iremote[i].index;
279     const PetscInt leaf = ilocal ? ilocal[i] : i;
280     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
281     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
282     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
283   }
284   PetscFunctionReturn(0);
285 }
286 
287 
288 
289 
290 /*@
291    PetscSFSetUp - set up communication structures
292 
293    Collective
294 
295    Input Arguments:
296 .  sf - star forest communication object
297 
298    Level: beginner
299 
300 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
301 @*/
302 PetscErrorCode PetscSFSetUp(PetscSF sf)
303 {
304   PetscErrorCode ierr;
305 
306   PetscFunctionBegin;
307   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
308   PetscSFCheckGraphSet(sf,1);
309   if (sf->setupcalled) PetscFunctionReturn(0);
310   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
311   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
312   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);} /* Zero all sf->ops */
313   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
314 #if defined(PETSC_HAVE_CUDA)
315   if (sf->backend == PETSCSF_BACKEND_CUDA) {
316     sf->ops->Malloc = PetscSFMalloc_Cuda;
317     sf->ops->Free   = PetscSFFree_Cuda;
318   }
319 #endif
320 #if defined(PETSC_HAVE_HIP)
321   if (sf->backend == PETSCSF_BACKEND_HIP) {
322     sf->ops->Malloc = PetscSFMalloc_HIP;
323     sf->ops->Free   = PetscSFFree_HIP;
324   }
325 #endif
326 
327 #
328 #if defined(PETSC_HAVE_KOKKOS)
329   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
330     sf->ops->Malloc = PetscSFMalloc_Kokkos;
331     sf->ops->Free   = PetscSFFree_Kokkos;
332   }
333 #endif
334   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
335   sf->setupcalled = PETSC_TRUE;
336   PetscFunctionReturn(0);
337 }
338 
339 /*@
340    PetscSFSetFromOptions - set PetscSF options using the options database
341 
342    Logically Collective
343 
344    Input Arguments:
345 .  sf - star forest
346 
347    Options Database Keys:
348 +  -sf_type               - implementation type, see PetscSFSetType()
349 .  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
350 .  -sf_use_default_stream - Assume callers of SF computed the input root/leafdata with the default cuda stream. SF will also
351                             use the default stream to process data. Therefore, no stream synchronization is needed between SF and its caller (default: true).
352                             If true, this option only works with -use_gpu_aware_mpi 1.
353 .  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
354                                If true, this option only works with -use_gpu_aware_mpi 1.
355 
356 -  -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently SF has these backends: cuda, hip and Kokkos.
357                               On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
358                               the only available is kokkos.
359 
360    Level: intermediate
361 @*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* NOTE(review): PetscObjectOptionsBegin appears to declare PetscOptionsObject,
     used near the end of this function — confirm against petscoptions.h */
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Keep the current type as the default if one is already set */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
  {
    char        backendstr[32] = {0};
    PetscBool   isCuda = PETSC_FALSE,isHip = PETSC_FALSE,isKokkos = PETSC_FALSE,set;
    /* Change the defaults set in PetscSFCreate() with command line options */
    ierr = PetscOptionsBool("-sf_use_default_stream","Assume SF's input and output root/leafdata is computed on the default stream","PetscSFSetFromOptions",sf->use_default_stream,&sf->use_default_stream,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-sf_use_stream_aware_mpi","Assume the underlying MPI is cuda-stream aware","PetscSFSetFromOptions",sf->use_stream_aware_mpi,&sf->use_stream_aware_mpi,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsString("-sf_backend","Select the device backend SF uses","PetscSFSetFromOptions",NULL,backendstr,sizeof(backendstr),&set);CHKERRQ(ierr);
    /* Case-insensitive match of the requested backend name */
    ierr = PetscStrcasecmp("cuda",backendstr,&isCuda);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("kokkos",backendstr,&isKokkos);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("hip",backendstr,&isHip);CHKERRQ(ierr);
  #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
    if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
    else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
    else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
    else if (set) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
  #elif defined(PETSC_HAVE_KOKKOS)
    /* Without CUDA/HIP, kokkos is the only selectable backend */
    if (set && !isKokkos) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You can only choose kokkos", backendstr);
  #endif
  }
#endif
  /* Give the implementation a chance to process its own options */
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
401 
402 /*@
403    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
404 
405    Logically Collective
406 
407    Input Arguments:
408 +  sf - star forest
409 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
410 
411    Level: advanced
412 
413 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
414 @*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveBool(sf,flg,2);
  /* The multi-SF is built lazily using the current rankorder setting, so the
     setting cannot change once it exists */
  if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(0);
}
424 
425 /*@
426    PetscSFSetGraph - Set a parallel star forest
427 
428    Collective
429 
430    Input Arguments:
431 +  sf - star forest
432 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
433 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
434 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
435 during setup in debug mode)
436 .  localmode - copy mode for ilocal
437 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
438 during setup in debug mode)
439 -  remotemode - copy mode for iremote
440 
441    Level: intermediate
442 
443    Notes:
444     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
445 
446    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
447    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
448    needed
449 
450    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
451    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
452 
453 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
454 @*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    /* Scan the leaf locations: record the active range and detect the identity
       permutation, which encodes contiguous storage and lets us drop ilocal */
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* We will not keep ilocal, so if ownership was transferred to us we must
         free it here to avoid a leak */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* Contiguous (or empty) leaf space: leaves occupy [0,nleaves) */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  /* Take ownership of / copy / alias the leaf index array per localmode */
  if (ilocal) {
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      /* mine_alloc stays NULL so PetscSFReset() will not free the user's array */
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same three-way ownership handling for the remote (root) endpoints */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
538 
539 /*@
540   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
541 
542   Collective
543 
544   Input Parameters:
545 + sf      - The PetscSF
546 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
547 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
548 
549   Notes:
550   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
551   n and N are local and global sizes of x respectively.
552 
553   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
554   sequential vectors y on all ranks.
555 
556   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
557   sequential vector y on rank 0.
558 
559   In above cases, entries of x are roots and entries of y are leaves.
560 
561   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
562   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
563   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one can not send multiple items. One just needs to create a new MPI datatype for the multiple data
565   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
566 
567   In this case, roots and leaves are symmetric.
568 
569   Level: intermediate
570  @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* The user's map is ignored; build the implied layout: size roots per rank */
    type = PETSCSFALLTOALL;
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    /* cast before multiplying to avoid PetscMPIInt overflow for large size */
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    /* max(n) == -max(-n) == min(n) iff every rank has the same n */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      /* uneven local sizes require the "v" (variable-count) variants */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* only rank 0 has leaves (it receives everything) */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
636 
637 /*@
638    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
639 
640    Collective
641 
   Input Arguments:
.  sf - star forest to invert

   Output Arguments:
.  isf - inverse of sf

   Level: advanced
649 
650    Notes:
651    All roots must have degree 1.
652 
653    The local space may be a permutation, but cannot be sparse.
654 
655 .seealso: PetscSFSetGraph()
656 @*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Tag every leaf slot with its own (rank,index) so a reduce tells each root
     who points at it */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* (-1,-1) marks roots that receive no leaf, detected after the reduce */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  /* With all roots having degree 1 (precondition), MPIU_REPLACE deposits each
     leaf's identity at its root */
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* dense: contiguous leaf storage in the inverse */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* newilocal ownership transfers to *isf; roots is copied, so both
     roots and leaves are freed below */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
706 
707 /*@
708    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
709 
710    Collective
711 
712    Input Arguments:
713 +  sf - communication object to duplicate
714 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
715 
716    Output Arguments:
717 .  newsf - new communication object
718 
719    Level: beginner
720 
721 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
722 @*/
723 PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
724 {
725   PetscSFType    type;
726   PetscErrorCode ierr;
727   MPI_Datatype   dtype=MPIU_SCALAR;
728 
729   PetscFunctionBegin;
730   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
731   PetscValidLogicalCollectiveEnum(sf,opt,2);
732   PetscValidPointer(newsf,3);
733   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
734   ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
735   if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
736   if (opt == PETSCSF_DUPLICATE_GRAPH) {
737     PetscSFCheckGraphSet(sf,1);
738     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
739       PetscInt          nroots,nleaves;
740       const PetscInt    *ilocal;
741       const PetscSFNode *iremote;
742       ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
743       ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
744     } else {
745       ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
746     }
747   }
748   /* Since oldtype is committed, so is newtype, according to MPI */
749   if (sf->vscat.bs > 1) {ierr = MPI_Type_dup(sf->vscat.unit,&dtype);CHKERRQ(ierr);}
750   (*newsf)->vscat.bs     = sf->vscat.bs;
751   (*newsf)->vscat.unit   = dtype;
752   (*newsf)->vscat.to_n   = sf->vscat.to_n;
753   (*newsf)->vscat.from_n = sf->vscat.from_n;
754   /* Do not copy lsf. Build it on demand since it is rarely used */
755 
756 #if defined(PETSC_HAVE_DEVICE)
757   (*newsf)->backend              = sf->backend;
758   (*newsf)->use_default_stream   = sf->use_default_stream;
759   (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
760   (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
761 #endif
762   if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
763   /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
764   PetscFunctionReturn(0);
765 }
766 
767 /*@C
768    PetscSFGetGraph - Get the graph specifying a parallel star forest
769 
770    Not Collective
771 
772    Input Arguments:
773 .  sf - star forest
774 
775    Output Arguments:
776 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
777 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
778 .  ilocal - locations of leaves in leafdata buffers
779 -  iremote - remote locations of root vertices for each leaf on the current process
780 
781    Notes:
782    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
783 
784    When called from Fortran, the returned iremote array is a copy and must be deallocated after use. Consequently, if you
785    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
786 
787    Level: intermediate
788 
789 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
790 @*/
791 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
792 {
793   PetscErrorCode ierr;
794 
795   PetscFunctionBegin;
796   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
797   if (sf->ops->GetGraph) {
798     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
799   } else {
800     if (nroots) *nroots = sf->nroots;
801     if (nleaves) *nleaves = sf->nleaves;
802     if (ilocal) *ilocal = sf->mine;
803     if (iremote) *iremote = sf->remote;
804   }
805   PetscFunctionReturn(0);
806 }
807 
808 /*@
809    PetscSFGetLeafRange - Get the active leaf ranges
810 
811    Not Collective
812 
813    Input Arguments:
814 .  sf - star forest
815 
816    Output Arguments:
817 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
818 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
819 
820    Level: developer
821 
822 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
823 @*/
824 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
825 {
826   PetscFunctionBegin;
827   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
828   PetscSFCheckGraphSet(sf,1);
829   if (minleaf) *minleaf = sf->minleaf;
830   if (maxleaf) *maxleaf = sf->maxleaf;
831   PetscFunctionReturn(0);
832 }
833 
834 /*@C
835    PetscSFViewFromOptions - View from Options
836 
837    Collective on PetscSF
838 
839    Input Parameters:
840 +  A - the star forest
841 .  obj - Optional object
842 -  name - command line option
843 
844    Level: intermediate
845 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
846 @*/
847 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
848 {
849   PetscErrorCode ierr;
850 
851   PetscFunctionBegin;
852   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
853   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
854   PetscFunctionReturn(0);
855 }
856 
/*@C
   PetscSFView - view a star forest

   Collective

   Input Arguments:
+  sf - star forest
-  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD

   Level: beginner

.seealso: PetscSFCreate(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* A NULL viewer defaults to stdout on the SF's communicator */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Set up first so that the rank/offset arrays printed under INFO_DETAIL below are populated */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    /* Let the implementation print its type-specific information first */
    if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      /* Synchronized printing keeps the per-rank output grouped in rank order */
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        /* NULL sf->mine means the leaves are the contiguous range [0,nleaves) */
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Sort a copy of the ranks so output is in increasing rank order; perm maps sorted slots back to original slots */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          /* roffset[] is a CSR-style offset array into rmine[]/rremote[] */
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
927 
928 /*@C
929    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
930 
931    Not Collective
932 
933    Input Arguments:
934 .  sf - star forest
935 
936    Output Arguments:
937 +  nranks - number of ranks referenced by local part
938 .  ranks - array of ranks
939 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
940 .  rmine - concatenated array holding local indices referencing each remote rank
941 -  rremote - concatenated array holding remote indices referenced for each remote rank
942 
943    Level: developer
944 
945 .seealso: PetscSFGetLeafRanks()
946 @*/
947 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
948 {
949   PetscErrorCode ierr;
950 
951   PetscFunctionBegin;
952   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
953   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
954   if (sf->ops->GetRootRanks) {
955     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
956   } else {
957     /* The generic implementation */
958     if (nranks)  *nranks  = sf->nranks;
959     if (ranks)   *ranks   = sf->ranks;
960     if (roffset) *roffset = sf->roffset;
961     if (rmine)   *rmine   = sf->rmine;
962     if (rremote) *rremote = sf->rremote;
963   }
964   PetscFunctionReturn(0);
965 }
966 
967 /*@C
968    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
969 
970    Not Collective
971 
972    Input Arguments:
973 .  sf - star forest
974 
975    Output Arguments:
976 +  niranks - number of leaf ranks referencing roots on this process
977 .  iranks - array of ranks
978 .  ioffset - offset in irootloc for each rank (length niranks+1)
979 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
980 
981    Level: developer
982 
983 .seealso: PetscSFGetRootRanks()
984 @*/
985 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
986 {
987   PetscErrorCode ierr;
988 
989   PetscFunctionBegin;
990   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
991   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
992   if (sf->ops->GetLeafRanks) {
993     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
994   } else {
995     PetscSFType type;
996     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
997     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
998   }
999   PetscFunctionReturn(0);
1000 }
1001 
1002 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
1003   PetscInt i;
1004   for (i=0; i<n; i++) {
1005     if (needle == list[i]) return PETSC_TRUE;
1006   }
1007   return PETSC_FALSE;
1008 }
1009 
/*@C
   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.

   Collective

   Input Arguments:
+  sf - PetscSF to set up; PetscSFSetGraph() must have been called
-  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)

   Level: developer

.seealso: PetscSFGetRootRanks()
@*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
  /* Count the number of leaves referencing each remote rank via a hash table */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  /* Flatten the table into parallel arrays of (rank, leaf count) */
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate dgroup's members into rank numbers of the SF's communicator */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRMPI(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRMPI(ierr);}
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  /* Two-pointer partition: i scans down from the top, sf->ndranks scans up; mismatched pairs are swapped */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i;) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for (; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition independently so each is in increasing rank order */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Build CSR-style offsets; rcount[] is zeroed so it can be reused below as per-rank fill cursors */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Bucket each leaf's (local index, remote index) into the slot range of its remote rank */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks; distinguished block first, then the rest */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1113 
/*@C
   PetscSFGetGroups - gets incoming and outgoing process groups

   Collective

   Input Argument:
.  sf - star forest

   Output Arguments:
+  incoming - group of origin processes for incoming edges (leaves that reference my roots)
-  outgoing - group of destination processes for outgoing edges (roots that I reference)

   Level: developer

.seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
@*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  /* Both groups are computed lazily and cached on the SF; the incoming group requires communication */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks */
    /* Auxiliary SF: one local root, one leaf per connected remote rank, each pointing at that rank's root 0 */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    /* indegree[0] is then the number of ranks whose leaves reference this process */
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
    /* Enumerate the incoming ranks */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
    /* Each process sends its own rank; the gather collects the ranks of everyone referencing us */
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) {
    /* Outgoing ranks are exactly sf->ranks, already cached by PetscSFSetUpRanks(); no communication needed */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1176 
/*@
   PetscSFGetMultiSF - gets the inner SF implemeting gathers and scatters

   Collective

   Input Argument:
.  sf - star forest that may contain roots with 0 or with more than 1 vertex

   Output Arguments:
.  multi - star forest with split roots, such that each root has degree exactly 1

   Level: developer

   Notes:

   In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
   directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
   edge, it is a candidate for future optimization that might involve its removal.

.seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
@*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    sf->multi->multi = sf->multi; /* the multi-SF is its own multi-SF */
    PetscFunctionReturn(0);
  }
  /* The multi-SF is built lazily and cached */
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    /* inoffset[] is the prefix sum of root degrees: the slot range each root's split copies occupy */
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    /* Fetch-and-add of 1 hands each leaf a distinct slot number within its root's range */
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
    if (PetscDefined(USE_DEBUG)) { /* Check that the expected number of increments occurred */
      for (i=0; i<sf->nroots; i++) {
        if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
      }
    }
    /* Each leaf now targets its private slot in the split-root space */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    sf->multi->multi = sf->multi;
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      /* Collect, at each multi-root slot, the rank of the leaf that owns it */
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Tell each leaf its rank-sorted slot, then rebuild the graph with the new targets */
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1270 
1271 /*@C
1272    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1273 
1274    Collective
1275 
1276    Input Arguments:
1277 +  sf - original star forest
1278 .  nselected  - number of selected roots on this process
1279 -  selected   - indices of the selected roots on this process
1280 
1281    Output Arguments:
1282 .  esf - new star forest
1283 
1284    Level: advanced
1285 
1286    Note:
1287    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1288    be done by calling PetscSFGetGraph().
1289 
1290 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1291 @*/
1292 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *esf)
1293 {
1294   PetscInt          i,j,n,nroots,nleaves,esf_nleaves,*new_ilocal,minleaf,maxleaf,maxlocal;
1295   const PetscInt    *ilocal;
1296   signed char       *rootdata,*leafdata,*leafmem;
1297   const PetscSFNode *iremote;
1298   PetscSFNode       *new_iremote;
1299   MPI_Comm          comm;
1300   PetscErrorCode    ierr;
1301 
1302   PetscFunctionBegin;
1303   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1304   PetscSFCheckGraphSet(sf,1);
1305   if (nselected) PetscValidPointer(selected,3);
1306   PetscValidPointer(esf,4);
1307 
1308   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1309   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1310   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1311   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1312 
1313   if (PetscDefined(USE_DEBUG)) {  /* Error out if selected[] has dups or  out of range indices */
1314     PetscBool dups;
1315     ierr = PetscCheckDupsInt(nselected,selected,&dups);CHKERRQ(ierr);
1316     if (dups) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"selected[] has dups");
1317     for (i=0; i<nselected; i++)
1318       if (selected[i] < 0 || selected[i] >= nroots) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"selected root indice %D is out of [0,%D)",selected[i],nroots);
1319   }
1320 
1321   if (sf->ops->CreateEmbeddedSF) {
1322     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,selected,esf);CHKERRQ(ierr);
1323   } else {
1324     /* A generic version of creating embedded sf */
1325     ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
1326     maxlocal = maxleaf - minleaf + 1;
1327     ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafmem);CHKERRQ(ierr);
1328     leafdata = leafmem - minleaf;
1329     /* Tag selected roots and bcast to leaves */
1330     for (i=0; i<nselected; i++) rootdata[selected[i]] = 1;
1331     ierr = PetscSFBcastBegin(sf,MPI_SIGNED_CHAR,rootdata,leafdata);CHKERRQ(ierr);
1332     ierr = PetscSFBcastEnd(sf,MPI_SIGNED_CHAR,rootdata,leafdata);CHKERRQ(ierr);
1333 
1334     /* Build esf with leaves that are still connected */
1335     esf_nleaves = 0;
1336     for (i=0; i<nleaves; i++) {
1337       j = ilocal ? ilocal[i] : i;
1338       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1339          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1340       */
1341       esf_nleaves += (leafdata[j] ? 1 : 0);
1342     }
1343     ierr = PetscMalloc1(esf_nleaves,&new_ilocal);CHKERRQ(ierr);
1344     ierr = PetscMalloc1(esf_nleaves,&new_iremote);CHKERRQ(ierr);
1345     for (i=n=0; i<nleaves; i++) {
1346       j = ilocal ? ilocal[i] : i;
1347       if (leafdata[j]) {
1348         new_ilocal[n]        = j;
1349         new_iremote[n].rank  = iremote[i].rank;
1350         new_iremote[n].index = iremote[i].index;
1351         ++n;
1352       }
1353     }
1354     ierr = PetscSFCreate(comm,esf);CHKERRQ(ierr);
1355     ierr = PetscSFSetFromOptions(*esf);CHKERRQ(ierr);
1356     ierr = PetscSFSetGraph(*esf,nroots,esf_nleaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1357     ierr = PetscFree2(rootdata,leafmem);CHKERRQ(ierr);
1358   }
1359   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1360   PetscFunctionReturn(0);
1361 }
1362 
1363 /*@C
1364   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1365 
1366   Collective
1367 
1368   Input Arguments:
1369 + sf - original star forest
1370 . nselected  - number of selected leaves on this process
1371 - selected   - indices of the selected leaves on this process
1372 
1373   Output Arguments:
1374 .  newsf - new star forest
1375 
1376   Level: advanced
1377 
1378 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1379 @*/
1380 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1381 {
1382   const PetscSFNode *iremote;
1383   PetscSFNode       *new_iremote;
1384   const PetscInt    *ilocal;
1385   PetscInt          i,nroots,*leaves,*new_ilocal;
1386   MPI_Comm          comm;
1387   PetscErrorCode    ierr;
1388 
1389   PetscFunctionBegin;
1390   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1391   PetscSFCheckGraphSet(sf,1);
1392   if (nselected) PetscValidPointer(selected,3);
1393   PetscValidPointer(newsf,4);
1394 
1395   /* Uniq selected[] and put results in leaves[] */
1396   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1397   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1398   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1399   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1400   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1401 
1402   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1403   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1404     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1405   } else {
1406     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1407     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1408     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1409     for (i=0; i<nselected; ++i) {
1410       const PetscInt l     = leaves[i];
1411       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1412       new_iremote[i].rank  = iremote[l].rank;
1413       new_iremote[i].index = iremote[l].index;
1414     }
1415     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1416     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1417   }
1418   ierr = PetscFree(leaves);CHKERRQ(ierr);
1419   PetscFunctionReturn(0);
1420 }
1421 
1422 /*@C
1423    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1424 
1425    Collective on PetscSF
1426 
1427    Input Arguments:
1428 +  sf - star forest on which to communicate
1429 .  unit - data type associated with each node
1430 .  rootdata - buffer to broadcast
1431 -  op - operation to use for reduction
1432 
1433    Output Arguments:
1434 .  leafdata - buffer to be reduced with values from each leaf's respective root
1435 
1436    Level: intermediate
1437 
1438    Notes:
1439     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1440     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1441     use PetscSFBcastAndOpWithMemTypeBegin() instead.
1442 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd(), PetscSFBcastAndOpWithMemTypeBegin()
1443 @*/
1444 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1445 {
1446   PetscErrorCode ierr;
1447   PetscMemType   rootmtype,leafmtype;
1448 
1449   PetscFunctionBegin;
1450   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1451   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1452   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1453   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1454   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1455   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1456   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1457   PetscFunctionReturn(0);
1458 }
1459 
/*@C
   PetscSFBcastAndOpWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call to PetscSFBcastAndOpEnd()

   Collective on PetscSF

   Input Arguments:
+  sf - star forest on which to communicate
.  unit - data type associated with each node
.  rootmtype - memory type of rootdata
.  rootdata - buffer to broadcast
.  leafmtype - memory type of leafdata
-  op - operation to use for reduction

   Output Arguments:
.  leafdata - buffer to be reduced with values from each leaf's respective root

   Level: intermediate

   Notes:
   Unlike PetscSFBcastAndOpBegin(), this routine takes the memory types of rootdata and leafdata from the caller
   instead of detecting them itself, avoiding that detection cost.

.seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd(),PetscSFBcastAndOpBegin()
@*/
PetscErrorCode PetscSFBcastAndOpWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Skip SF-level event logging when this SF is embedded in a VecScatter (vscat.logging set) */
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
  ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1492 
/*@C
   PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  rootdata - buffer to broadcast
-  op - operation to use for reduction

   Output Arguments:
.  leafdata - buffer to be reduced with values from each leaf's respective root

   Level: intermediate

.seealso: PetscSFSetGraph(), PetscSFReduceEnd()
@*/
PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Skip SF-level event logging when this SF is embedded in a VecScatter (vscat.logging set) */
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);}
  ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootdata,leafdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1522 
1523 /*@C
1524    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1525 
1526    Collective
1527 
1528    Input Arguments:
1529 +  sf - star forest
1530 .  unit - data type
1531 .  leafdata - values to reduce
1532 -  op - reduction operation
1533 
1534    Output Arguments:
1535 .  rootdata - result of reduction of values from all leaves of each root
1536 
1537    Level: intermediate
1538 
1539    Notes:
1540     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1541     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1542     use PetscSFReduceWithMemTypeBegin() instead.
1543 
1544 .seealso: PetscSFBcastBegin(), PetscSFReduceWithMemTypeBegin()
1545 @*/
1546 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1547 {
1548   PetscErrorCode ierr;
1549   PetscMemType   rootmtype,leafmtype;
1550 
1551   PetscFunctionBegin;
1552   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1553   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1554   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1555   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1556   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1557   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1558   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1559   PetscFunctionReturn(0);
1560 }
1561 
1562 /*@C
1563    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to PetscSFReduceEnd()
1564 
1565    Collective
1566 
1567    Input Arguments:
1568 +  sf - star forest
1569 .  unit - data type
1570 .  leafmtype - memory type of leafdata
1571 .  leafdata - values to reduce
1572 .  rootmtype - memory type of rootdata
1573 -  op - reduction operation
1574 
1575    Output Arguments:
1576 .  rootdata - result of reduction of values from all leaves of each root
1577 
1578    Level: intermediate
1579 
1580 .seealso: PetscSFBcastBegin(), PetscSFReduceBegin()
1581 @*/
1582 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
1583 {
1584   PetscErrorCode ierr;
1585 
1586   PetscFunctionBegin;
1587   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1588   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1589   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1590   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1591   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1592   PetscFunctionReturn(0);
1593 }
1594 
/*@C
   PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - values to reduce
-  op - reduction operation

   Output Arguments:
.  rootdata - result of reduction of values from all leaves of each root

   Level: intermediate

.seealso: PetscSFSetGraph(), PetscSFBcastEnd()
@*/
PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Skip SF-level event logging when this SF is embedded in a VecScatter (vscat.logging set) */
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
  ierr = (*sf->ops->ReduceEnd)(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1624 
/*@C
   PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Arguments:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

   Note:
   The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
   might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
   not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
   integers.

.seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscMemType   rootmtype,leafmtype,leafupdatemtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  /* Detect whether each user buffer lives in host or device memory */
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
  /* The implementations handle leafdata and leafupdate with a single memory type, hence this restriction */
  if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
  ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1667 
/*@C
   PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Arguments:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

.seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootdata,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1698 
/*@C
   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: advanced

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

   The degree array is not valid until PetscSFComputeDegreeEnd() returns; this routine sets *degree to NULL.

.seealso: PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal;
    /* sf->degree non-NULL means a previous Begin has not been matched by an End yet */
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    maxlocal = sf->maxleaf-sf->minleaf+1;
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    /* Sum a 1 from every leaf into its root; degreetmp is indexed from sf->minleaf, hence the pointer shift */
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  *degree = NULL;
  PetscFunctionReturn(0);
}
1738 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: developer

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFComputeDegreeBegin()
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    /* Finish the reduction started in PetscSFComputeDegreeBegin(); degreetmp is indexed from sf->minleaf */
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    sf->degreeknown = PETSC_TRUE;
  }
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1774 
1775 
1776 /*@C
1777    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1778    Each multi-root is assigned index of the corresponding original root.
1779 
1780    Collective
1781 
1782    Input Arguments:
1783 +  sf - star forest
1784 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1785 
1786    Output Arguments:
1787 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1788 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1789 
1790    Level: developer
1791 
1792    Notes:
1793    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1794 
1795 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1796 @*/
1797 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1798 {
1799   PetscSF             msf;
1800   PetscInt            i, j, k, nroots, nmroots;
1801   PetscErrorCode      ierr;
1802 
1803   PetscFunctionBegin;
1804   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1805   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1806   if (nroots) PetscValidIntPointer(degree,2);
1807   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1808   PetscValidPointer(multiRootsOrigNumbering,4);
1809   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1810   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1811   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1812   for (i=0,j=0,k=0; i<nroots; i++) {
1813     if (!degree[i]) continue;
1814     for (j=0; j<degree[i]; j++,k++) {
1815       (*multiRootsOrigNumbering)[k] = i;
1816     }
1817   }
1818   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1819   if (nMultiRoots) *nMultiRoots = nmroots;
1820   PetscFunctionReturn(0);
1821 }
1822 
/*@C
   PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  leafdata - leaf data to gather to roots

   Output Argument:
.  multirootdata - root buffer to gather into, amount of space per root is equal to its degree

   Level: intermediate

.seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
@*/
PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Gather is a reduce on the multi-SF, which gives each leaf its own slot so MPIU_REPLACE never overwrites */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1852 
/*@C
   PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  leafdata - leaf data to gather to roots

   Output Argument:
.  multirootdata - root buffer to gather into, amount of space per root is equal to its degree

   Level: intermediate

.seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
@*/
PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Complete the reduce on the multi-SF started in PetscSFGatherBegin() */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1881 
/*@C
   PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeBegin(), PetscSFScatterEnd()
@*/
PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Scatter is a broadcast on the multi-SF, where each leaf has its own dedicated root slot */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1911 
/*@C
   PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeEnd(), PetscSFScatterBegin()
@*/
PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Complete the broadcast on the multi-SF started in PetscSFScatterBegin() */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1940 
1941 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1942 {
1943   PetscInt        i, n, nleaves;
1944   const PetscInt *ilocal = NULL;
1945   PetscHSetI      seen;
1946   PetscErrorCode  ierr;
1947 
1948   PetscFunctionBegin;
1949   if (PetscDefined(USE_DEBUG)) {
1950     ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1951     ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1952     for (i = 0; i < nleaves; i++) {
1953       const PetscInt leaf = ilocal ? ilocal[i] : i;
1954       ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1955     }
1956     ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1957     if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1958     ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1959   }
1960   PetscFunctionReturn(0);
1961 }
1962 
1963 /*@
1964   PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view
1965 
1966   Input Parameters:
1967 + sfA - The first PetscSF
1968 - sfB - The second PetscSF
1969 
1970   Output Parameters:
1971 . sfBA - The composite SF
1972 
1973   Level: developer
1974 
1975   Notes:
1976   Currently, the two SFs must be defined on congruent communicators and they must be true star
1977   forests, i.e. the same leaf is not connected with different roots.
1978 
1979   sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
1980   a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
1981   nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
1982   Bcast on sfA, then a Bcast on sfB, on connected nodes.
1983 
1984 .seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
1985 @*/
1986 PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
1987 {
1988   PetscErrorCode    ierr;
1989   const PetscSFNode *remotePointsA,*remotePointsB;
1990   PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
1991   const PetscInt    *localPointsA,*localPointsB;
1992   PetscInt          *localPointsBA;
1993   PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
1994   PetscBool         denseB;
1995 
1996   PetscFunctionBegin;
1997   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
1998   PetscSFCheckGraphSet(sfA,1);
1999   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
2000   PetscSFCheckGraphSet(sfB,2);
2001   PetscCheckSameComm(sfA,1,sfB,2);
2002   PetscValidPointer(sfBA,3);
2003   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
2004   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
2005 
2006   ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
2007   ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
2008   if (localPointsA) {
2009     ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
2010     for (i=0; i<numRootsB; i++) {
2011       reorderedRemotePointsA[i].rank = -1;
2012       reorderedRemotePointsA[i].index = -1;
2013     }
2014     for (i=0; i<numLeavesA; i++) {
2015       if (localPointsA[i] >= numRootsB) continue;
2016       reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
2017     }
2018     remotePointsA = reorderedRemotePointsA;
2019   }
2020   ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
2021   ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
2022   ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
2023   ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
2024   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
2025 
2026   denseB = (PetscBool)!localPointsB;
2027   for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
2028     if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
2029     else numLeavesBA++;
2030   }
2031   if (denseB) {
2032     localPointsBA  = NULL;
2033     remotePointsBA = leafdataB;
2034   } else {
2035     ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
2036     ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
2037     for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
2038       const PetscInt l = localPointsB ? localPointsB[i] : i;
2039 
2040       if (leafdataB[l-minleaf].rank == -1) continue;
2041       remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
2042       localPointsBA[numLeavesBA] = l;
2043       numLeavesBA++;
2044     }
2045     ierr = PetscFree(leafdataB);CHKERRQ(ierr);
2046   }
2047   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
2048   ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
2049   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
2050   PetscFunctionReturn(0);
2051 }
2052 
2053 /*@
2054   PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one
2055 
2056   Input Parameters:
2057 + sfA - The first PetscSF
2058 - sfB - The second PetscSF
2059 
2060   Output Parameters:
2061 . sfBA - The composite SF.
2062 
2063   Level: developer
2064 
2065   Notes:
2066   Currently, the two SFs must be defined on congruent communicators and they must be true star
2067   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
2068   second SF must have a degree of 1, i.e., no roots have more than one leaf connected.
2069 
2070   sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
2071   a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
2072   roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
2073   a Reduce on sfB, on connected roots.
2074 
2075 .seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
2076 @*/
2077 PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
2078 {
2079   PetscErrorCode    ierr;
2080   const PetscSFNode *remotePointsA,*remotePointsB;
2081   PetscSFNode       *remotePointsBA;
2082   const PetscInt    *localPointsA,*localPointsB;
2083   PetscSFNode       *reorderedRemotePointsA = NULL;
2084   PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
2085   MPI_Op            op;
2086 #if defined(PETSC_USE_64BIT_INDICES)
2087   PetscBool         iswin;
2088 #endif
2089 
2090   PetscFunctionBegin;
2091   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
2092   PetscSFCheckGraphSet(sfA,1);
2093   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
2094   PetscSFCheckGraphSet(sfB,2);
2095   PetscCheckSameComm(sfA,1,sfB,2);
2096   PetscValidPointer(sfBA,3);
2097   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
2098   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
2099 
2100   ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
2101   ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);
2102 
2103   /* TODO: Check roots of sfB have degree of 1 */
2104   /* Once we implement it, we can replace the MPI_MAXLOC
2105      with MPIU_REPLACE. In that case, MPI_MAXLOC and MPIU_REPLACE have the same effect.
2106      We use MPI_MAXLOC only to have a deterministic output from this routine if
2107      the root condition is not meet.
2108    */
2109   op = MPI_MAXLOC;
2110 #if defined(PETSC_USE_64BIT_INDICES)
2111   /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
2112   ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
2113   if (iswin) op = MPIU_REPLACE;
2114 #endif
2115 
2116   ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
2117   ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
2118   for (i=0; i<maxleaf - minleaf + 1; i++) {
2119     reorderedRemotePointsA[i].rank = -1;
2120     reorderedRemotePointsA[i].index = -1;
2121   }
2122   if (localPointsA) {
2123     for (i=0; i<numLeavesA; i++) {
2124       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
2125       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
2126     }
2127   } else {
2128     for (i=0; i<numLeavesA; i++) {
2129       if (i > maxleaf || i < minleaf) continue;
2130       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
2131     }
2132   }
2133 
2134   ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
2135   ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
2136   for (i=0; i<numRootsB; i++) {
2137     remotePointsBA[i].rank = -1;
2138     remotePointsBA[i].index = -1;
2139   }
2140 
2141   ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
2142   ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
2143   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
2144   for (i=0,numLeavesBA=0; i<numRootsB; i++) {
2145     if (remotePointsBA[i].rank == -1) continue;
2146     remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
2147     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
2148     localPointsBA[numLeavesBA] = i;
2149     numLeavesBA++;
2150   }
2151   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
2152   ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
2153   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
2154   PetscFunctionReturn(0);
2155 }
2156 
2157 /*
2158   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
2159 
2160   Input Parameters:
2161 . sf - The global PetscSF
2162 
2163   Output Parameters:
2164 . out - The local PetscSF
2165  */
2166 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
2167 {
2168   MPI_Comm           comm;
2169   PetscMPIInt        myrank;
2170   const PetscInt     *ilocal;
2171   const PetscSFNode  *iremote;
2172   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
2173   PetscSFNode        *liremote;
2174   PetscSF            lsf;
2175   PetscErrorCode     ierr;
2176 
2177   PetscFunctionBegin;
2178   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2179   if (sf->ops->CreateLocalSF) {
2180     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
2181   } else {
2182     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2183     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
2184     ierr = MPI_Comm_rank(comm,&myrank);CHKERRMPI(ierr);
2185 
2186     /* Find out local edges and build a local SF */
2187     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
2188     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
2189     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
2190     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
2191 
2192     for (i=j=0; i<nleaves; i++) {
2193       if (iremote[i].rank == (PetscInt)myrank) {
2194         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2195         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
2196         liremote[j].index = iremote[i].index;
2197         j++;
2198       }
2199     }
2200     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
2201     ierr = PetscSFSetFromOptions(lsf);CHKERRQ(ierr);
2202     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
2203     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
2204     *out = lsf;
2205   }
2206   PetscFunctionReturn(0);
2207 }
2208 
/* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
{
  PetscErrorCode     ierr;
  PetscMemType       rootmtype,leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Logged under the general BcastAndOpBegin event; there is no dedicated event for BcastToZero */
  ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  /* This is an optional implementation hook: error out rather than fall back when it is absent */
  if (sf->ops->BcastToZero) {
    ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
  ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2227 
2228