xref: /petsc/src/vec/is/sf/interface/sf.c (revision 0542ac825773a8ab0dd6938c7fdf4bf42897a690)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6   #include <cuda_runtime.h>
7 #endif
8 
9 #if defined(PETSC_HAVE_HIP)
10   #include <hip/hip_runtime.h>
11 #endif
12 
#if defined(PETSC_USE_DEBUG)
/* Debug-only guard: raise PETSC_ERR_ARG_WRONGSTATE if neither PetscSFSetGraph() nor
   PetscSFSetGraphWithPattern() has been called on the SF that is argument number 'arg'
   of the current function (the argument name is captured via #sf). */
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
/* In optimized builds the check compiles away entirely */
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif
21 
22 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",NULL};
23 
/* Classify a pointer as host or device memory.

   Sets *type to PETSC_MEMTYPE_DEVICE only when an initialized CUDA or HIP runtime
   positively identifies 'data' as device memory; otherwise *type is PETSC_MEMTYPE_HOST.
   A NULL 'data' is always reported as host memory. */
PETSC_STATIC_INLINE PetscErrorCode PetscGetMemType(const void *data,PetscMemType *type)
{
  PetscFunctionBegin;
  PetscValidPointer(type,2);
  *type = PETSC_MEMTYPE_HOST; /* default unless a device runtime claims the pointer below */
#if defined(PETSC_HAVE_CUDA)
  if (PetscCUDAInitialized && data) {
    cudaError_t                  cerr;
    struct cudaPointerAttributes attr;
    enum cudaMemoryType          mtype;
    cerr = cudaPointerGetAttributes(&attr,data); /* Do not check error since before CUDA 11.0, passing a host pointer returns cudaErrorInvalidValue */
    cudaGetLastError(); /* Reset the last error */
    #if (CUDART_VERSION < 10000)
      mtype = attr.memoryType; /* field was renamed to 'type' in CUDA 10 */
    #else
      mtype = attr.type;
    #endif
    if (cerr == cudaSuccess && mtype == cudaMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif

#if defined(PETSC_HAVE_HIP)
  if (PetscHIPInitialized && data) {
    hipError_t                   cerr;
    struct hipPointerAttribute_t attr;
    enum hipMemoryType           mtype;
    cerr = hipPointerGetAttributes(&attr,data); /* error intentionally unchecked, mirroring the CUDA path */
    hipGetLastError(); /* Reset the last error */
    mtype = attr.memoryType;
    if (cerr == hipSuccess && mtype == hipMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif
  PetscFunctionReturn(0);
}
58 
59 /*@
60    PetscSFCreate - create a star forest communication context
61 
62    Collective
63 
64    Input Parameter:
65 .  comm - communicator on which the star forest will operate
66 
67    Output Parameter:
68 .  sf - new star forest context
69 
70    Options Database Keys:
71 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
72 .  -sf_type window    -Use MPI-3 one-sided window for communication
73 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
74 
75    Level: intermediate
76 
77    Notes:
78    When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
79    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
80    SFs are optimized and they have better performance than general SFs.
81 
82 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
83 @*/
PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
{
  PetscErrorCode ierr;
  PetscSF        b;

  PetscFunctionBegin;
  PetscValidPointer(sf,2);
  ierr = PetscSFInitializePackage();CHKERRQ(ierr);

  ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);

  /* -1 sizes and inverted min/max leaf bounds mark the graph as not-yet-set */
  b->nroots    = -1;
  b->nleaves   = -1;
  b->minleaf   = PETSC_MAX_INT;
  b->maxleaf   = PETSC_MIN_INT;
  b->nranks    = -1;
  b->rankorder = PETSC_TRUE;
  b->ingroup   = MPI_GROUP_NULL;
  b->outgroup  = MPI_GROUP_NULL;
  b->graphset  = PETSC_FALSE;
#if defined(PETSC_HAVE_DEVICE)
  b->use_gpu_aware_mpi    = use_gpu_aware_mpi; /* inherit the global PETSc setting */
  b->use_stream_aware_mpi = PETSC_FALSE;
  b->unknown_input_stream= PETSC_FALSE;
  #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
    b->backend = PETSCSF_BACKEND_KOKKOS;
  #elif defined(PETSC_HAVE_CUDA)
    b->backend = PETSCSF_BACKEND_CUDA;
  #elif defined(PETSC_HAVE_HIP)
    b->backend = PETSCSF_BACKEND_HIP;
  #endif

  #if defined(PETSC_HAVE_NVSHMEM)
    b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
    b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
    ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&b->use_nvshmem,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem_get",&b->use_nvshmem_get,NULL);CHKERRQ(ierr);
  #endif
#endif
  /* vecscatter bookkeeping starts unset; -1 means "size not yet known" */
  b->vscat.from_n = -1;
  b->vscat.to_n   = -1;
  b->vscat.unit   = MPIU_SCALAR;
 *sf = b;
  PetscFunctionReturn(0);
}
129 
130 /*@
131    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
132 
133    Collective
134 
135    Input Parameter:
136 .  sf - star forest
137 
138    Level: advanced
139 
140 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
141 @*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the implementation release its own state first */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Return the graph-description fields to their PetscSFCreate() defaults */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  /* mine/remote may alias mine_alloc/remote_alloc (owned) or user memory (not owned);
     only the _alloc copies are freed here */
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRMPI(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRMPI(ierr);}
  /* Break the back-pointer from the multi-SF before destroying it */
  if (sf->multi) sf->multi->multi = NULL;
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);

 #if defined(PETSC_HAVE_DEVICE)
  for (PetscInt i=0; i<2; i++) {ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,sf->rmine_d[i]);CHKERRQ(ierr);}
 #endif

  sf->setupcalled = PETSC_FALSE; /* a subsequent use will run PetscSFSetUp() again */
  PetscFunctionReturn(0);
}
175 
176 /*@C
177    PetscSFSetType - Set the PetscSF communication implementation
178 
179    Collective on PetscSF
180 
181    Input Parameters:
182 +  sf - the PetscSF context
183 -  type - a known method
184 
185    Options Database Key:
186 .  -sf_type <type> - Sets the method; use -help for a list
187    of available methods (for instance, window, basic, neighbor)
188 
189    Notes:
190    See "include/petscsf.h" for available methods (for instance)
191 +    PETSCSFWINDOW - MPI-2/3 one-sided
192 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
193 
194   Level: intermediate
195 
196 .seealso: PetscSFType, PetscSFCreate()
197 @*/
198 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
199 {
200   PetscErrorCode ierr,(*r)(PetscSF);
201   PetscBool      match;
202 
203   PetscFunctionBegin;
204   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
205   PetscValidCharPointer(type,2);
206 
207   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
208   if (match) PetscFunctionReturn(0);
209 
210   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
211   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
212   /* Destroy the previous PetscSF implementation context */
213   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
214   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
215   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
216   ierr = (*r)(sf);CHKERRQ(ierr);
217   PetscFunctionReturn(0);
218 }
219 
220 /*@C
221   PetscSFGetType - Get the PetscSF communication implementation
222 
223   Not Collective
224 
225   Input Parameter:
226 . sf  - the PetscSF context
227 
228   Output Parameter:
229 . type - the PetscSF type name
230 
231   Level: intermediate
232 
233 .seealso: PetscSFSetType(), PetscSFCreate()
234 @*/
235 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
236 {
237   PetscFunctionBegin;
238   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
239   PetscValidPointer(type,2);
240   *type = ((PetscObject)sf)->type_name;
241   PetscFunctionReturn(0);
242 }
243 
244 /*@C
245    PetscSFDestroy - destroy star forest
246 
247    Collective
248 
249    Input Parameter:
250 .  sf - address of star forest
251 
252    Level: intermediate
253 
254 .seealso: PetscSFCreate(), PetscSFReset()
255 @*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0);
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Only the final reference actually tears the object down */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  ierr = PetscSFDestroy(&(*sf)->vscat.lsf);CHKERRQ(ierr);
  /* bs > 1 implies vscat.unit was duplicated (see PetscSFDuplicate) and must be freed */
  if ((*sf)->vscat.bs > 1) {ierr = MPI_Type_free(&(*sf)->vscat.unit);CHKERRMPI(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
271 
272 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
273 {
274   PetscInt           i, nleaves;
275   PetscMPIInt        size;
276   const PetscInt    *ilocal;
277   const PetscSFNode *iremote;
278   PetscErrorCode     ierr;
279 
280   PetscFunctionBegin;
281   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
282   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
283   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
284   for (i = 0; i < nleaves; i++) {
285     const PetscInt rank = iremote[i].rank;
286     const PetscInt remote = iremote[i].index;
287     const PetscInt leaf = ilocal ? ilocal[i] : i;
288     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
289     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
290     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
291   }
292   PetscFunctionReturn(0);
293 }
294 
295 /*@
296    PetscSFSetUp - set up communication structures
297 
298    Collective
299 
300    Input Parameter:
301 .  sf - star forest communication object
302 
303    Level: beginner
304 
305 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
306 @*/
307 PetscErrorCode PetscSFSetUp(PetscSF sf)
308 {
309   PetscErrorCode ierr;
310 
311   PetscFunctionBegin;
312   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
313   PetscSFCheckGraphSet(sf,1);
314   if (sf->setupcalled) PetscFunctionReturn(0);
315   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
316   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
317   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);} /* Zero all sf->ops */
318   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
319 #if defined(PETSC_HAVE_CUDA)
320   if (sf->backend == PETSCSF_BACKEND_CUDA) {
321     sf->ops->Malloc = PetscSFMalloc_CUDA;
322     sf->ops->Free   = PetscSFFree_CUDA;
323   }
324 #endif
325 #if defined(PETSC_HAVE_HIP)
326   if (sf->backend == PETSCSF_BACKEND_HIP) {
327     sf->ops->Malloc = PetscSFMalloc_HIP;
328     sf->ops->Free   = PetscSFFree_HIP;
329   }
330 #endif
331 
332 #
333 #if defined(PETSC_HAVE_KOKKOS)
334   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
335     sf->ops->Malloc = PetscSFMalloc_Kokkos;
336     sf->ops->Free   = PetscSFFree_Kokkos;
337   }
338 #endif
339   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
340   sf->setupcalled = PETSC_TRUE;
341   PetscFunctionReturn(0);
342 }
343 
344 /*@
345    PetscSFSetFromOptions - set PetscSF options using the options database
346 
347    Logically Collective
348 
349    Input Parameter:
350 .  sf - star forest
351 
352    Options Database Keys:
353 +  -sf_type               - implementation type, see PetscSFSetType()
354 .  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
355 .  -sf_use_default_stream - Assume callers of SF computed the input root/leafdata with the default cuda stream. SF will also
356                             use the default stream to process data. Therefore, no stream synchronization is needed between SF and its caller (default: true).
357                             If true, this option only works with -use_gpu_aware_mpi 1.
358 .  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
359                                If true, this option only works with -use_gpu_aware_mpi 1.
360 
361 -  -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently SF has these backends: cuda, hip and Kokkos.
362                               On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
363                               the only available is kokkos.
364 
365    Level: intermediate
366 @*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Default type is the current one if set, otherwise PETSCSFBASIC */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
 #if defined(PETSC_HAVE_DEVICE)
  {
    char        backendstr[32] = {0};
    PetscBool   isCuda = PETSC_FALSE,isHip = PETSC_FALSE,isKokkos = PETSC_FALSE,set;
    /* Change the defaults set in PetscSFCreate() with command line options */
    ierr = PetscOptionsBool("-sf_unknown_input_stream","SF root/leafdata is computed on arbitary streams unknown to SF","PetscSFSetFromOptions",sf->unknown_input_stream,&sf->unknown_input_stream,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-sf_use_stream_aware_mpi","Assume the underlying MPI is cuda-stream aware","PetscSFSetFromOptions",sf->use_stream_aware_mpi,&sf->use_stream_aware_mpi,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsString("-sf_backend","Select the device backend SF uses","PetscSFSetFromOptions",NULL,backendstr,sizeof(backendstr),&set);CHKERRQ(ierr);
    /* Case-insensitive match of the requested backend name */
    ierr = PetscStrcasecmp("cuda",backendstr,&isCuda);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("kokkos",backendstr,&isKokkos);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("hip",backendstr,&isHip);CHKERRQ(ierr);
  #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
    if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
    else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
    else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
    else if (set) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
  #elif defined(PETSC_HAVE_KOKKOS)
    /* Kokkos is the only device backend available in this configuration */
    if (set && !isKokkos) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You can only choose kokkos", backendstr);
   #endif
  }
 #endif
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
406 
407 /*@
408    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
409 
410    Logically Collective
411 
412    Input Parameters:
413 +  sf - star forest
414 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
415 
416    Level: advanced
417 
418 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
419 @*/
420 PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
421 {
422   PetscFunctionBegin;
423   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
424   PetscValidLogicalCollectiveBool(sf,flg,2);
425   if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
426   sf->rankorder = flg;
427   PetscFunctionReturn(0);
428 }
429 
430 /*@C
431    PetscSFSetGraph - Set a parallel star forest
432 
433    Collective
434 
435    Input Parameters:
436 +  sf - star forest
437 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
438 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
439 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
440 during setup in debug mode)
441 .  localmode - copy mode for ilocal
442 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
443 during setup in debug mode)
444 -  remotemode - copy mode for iremote
445 
446    Level: intermediate
447 
448    Notes:
449     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
450 
451    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
452    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
453    needed
454 
455    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
456    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
457 
458 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
459 @*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    /* One pass computes the leaf range and detects the identity permutation */
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* Identity ilocal encodes contiguous storage; drop it (freeing if we now own it) */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* Contiguous leaves: range is [0, nleaves-1] (empty range when nleaves == 0) */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  /* mine/mine_alloc ownership follows localmode: COPY duplicates, OWN adopts, USE borrows */
  if (ilocal) {
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      sf->mine_alloc = NULL; /* nothing to free at reset time */
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same ownership rules for the remote root descriptors */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
543 
544 /*@
545   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
546 
547   Collective
548 
549   Input Parameters:
550 + sf      - The PetscSF
551 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
552 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
553 
554   Notes:
555   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
556   n and N are local and global sizes of x respectively.
557 
558   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
559   sequential vectors y on all ranks.
560 
561   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
562   sequential vector y on rank 0.
563 
564   In above cases, entries of x are roots and entries of y are leaves.
565 
566   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
567   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
568   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
569   not mean one can not send multiple items. One just needs to create a new MPI datatype for the mulptiple data
570   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
571 
572   In this case, roots and leaves are symmetric.
573 
574   Level: intermediate
575  @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* map is ignored for ALLTOALL; build a layout with 'size' roots per rank instead */
    type = PETSCSFALLTOALL;
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    /* MPI_MAX over {n,-n}: res[0] == -res[1] iff every rank has the same local size */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRMPI(ierr);
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves in a GATHER pattern */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
641 
642 /*@
643    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
644 
645    Collective
646 
647    Input Parameter:
648 .  sf - star forest to invert
649 
650    Output Parameter:
651 .  isf - inverse of sf
652 
653    Level: advanced
654 
655    Notes:
656    All roots must have degree 1.
657 
658    The local space may be a permutation, but cannot be sparse.
659 
660 .seealso: PetscSFSetGraph()
661 @*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Each leaf advertises its own (rank, index) so roots learn who points at them */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* rank == -1 marks roots that receive no leaf during the reduction below */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  /* Push each leaf's identity onto its root; MPI_REPLACE assumes degree <= 1 */
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPI_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* dense: contiguous leaves in the inverse */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* newilocal ownership transfers to the inverse SF (PETSC_OWN_POINTER); roots is copied */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
711 
712 /*@
713    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
714 
715    Collective
716 
717    Input Parameters:
718 +  sf - communication object to duplicate
719 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
720 
721    Output Parameter:
722 .  newsf - new communication object
723 
724    Level: beginner
725 
726 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
727 @*/
PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
{
  PetscSFType    type;
  PetscErrorCode ierr;
  MPI_Datatype   dtype=MPIU_SCALAR;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,opt,2);
  PetscValidPointer(newsf,3);
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
  ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
  if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf,1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graph: deep-copy the leaf/root connectivity */
      PetscInt          nroots,nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
      ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
    } else {
      /* Patterned graph: recreate from the stored layout and pattern */
      ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
    }
  }
  /* Since oldtype is committed, so is newtype, according to MPI */
  if (sf->vscat.bs > 1) {ierr = MPI_Type_dup(sf->vscat.unit,&dtype);CHKERRMPI(ierr);}
  (*newsf)->vscat.bs     = sf->vscat.bs;
  (*newsf)->vscat.unit   = dtype;
  (*newsf)->vscat.to_n   = sf->vscat.to_n;
  (*newsf)->vscat.from_n = sf->vscat.from_n;
  /* Do not copy lsf. Build it on demand since it is rarely used */

#if defined(PETSC_HAVE_DEVICE)
  /* Carry over the device configuration chosen for the original SF */
  (*newsf)->backend              = sf->backend;
  (*newsf)->unknown_input_stream= sf->unknown_input_stream;
  (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
  (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
#endif
  if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
  /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
  PetscFunctionReturn(0);
}
771 
772 /*@C
773    PetscSFGetGraph - Get the graph specifying a parallel star forest
774 
775    Not Collective
776 
777    Input Parameter:
778 .  sf - star forest
779 
780    Output Parameters:
781 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
782 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
783 .  ilocal - locations of leaves in leafdata buffers (if returned value is NULL, it means leaves are in contiguous storage)
784 -  iremote - remote locations of root vertices for each leaf on the current process
785 
786    Notes:
787      We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
788 
789    Fortran Notes:
790      The returned iremote array is a copy and must be deallocated after use. Consequently, if you
791      want to update the graph, you must call PetscSFSetGraph() after modifying the iremote array.
792 
793      To check for a NULL ilocal use
794 $      if (loc(ilocal) == loc(PETSC_NULL_INTEGER)) then
795 
796    Level: intermediate
797 
798 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
799 @*/
800 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
801 {
802   PetscErrorCode ierr;
803 
804   PetscFunctionBegin;
805   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
806   if (sf->ops->GetGraph) {
807     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
808   } else {
809     if (nroots) *nroots = sf->nroots;
810     if (nleaves) *nleaves = sf->nleaves;
811     if (ilocal) *ilocal = sf->mine;
812     if (iremote) *iremote = sf->remote;
813   }
814   PetscFunctionReturn(0);
815 }
816 
817 /*@
818    PetscSFGetLeafRange - Get the active leaf ranges
819 
820    Not Collective
821 
822    Input Parameter:
823 .  sf - star forest
824 
825    Output Parameters:
826 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
827 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
828 
829    Level: developer
830 
831 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
832 @*/
833 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
834 {
835   PetscFunctionBegin;
836   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
837   PetscSFCheckGraphSet(sf,1);
838   if (minleaf) *minleaf = sf->minleaf;
839   if (maxleaf) *maxleaf = sf->maxleaf;
840   PetscFunctionReturn(0);
841 }
842 
843 /*@C
844    PetscSFViewFromOptions - View from Options
845 
846    Collective on PetscSF
847 
848    Input Parameters:
849 +  A - the star forest
850 .  obj - Optional object
851 -  name - command line option
852 
853    Level: intermediate
854 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
855 @*/
856 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
857 {
858   PetscErrorCode ierr;
859 
860   PetscFunctionBegin;
861   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
862   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
863   PetscFunctionReturn(0);
864 }
865 
866 /*@C
867    PetscSFView - view a star forest
868 
869    Collective
870 
871    Input Parameters:
872 +  sf - star forest
873 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
874 
875    Level: beginner
876 
877 .seealso: PetscSFCreate(), PetscSFSetGraph()
878 @*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* A NULL viewer means "ASCII stdout on the SF's communicator" */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Set up first (only possible once the graph is set) so the per-rank tables printed below exist */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* Without a graph there is nothing more to print; pop the tab before the early return */
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      /* Synchronized printing collates output from all ranks in rank order at the flush */
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      /* One line per leaf: local index <- (owning rank, root index); NULL sf->mine means contiguous leaves */
      for (i=0; i<sf->nleaves; i++) {
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Sort a copy of the rank list (with a permutation) so output is in ascending rank order
           while still indexing the original roffset/rmine/rremote arrays */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          /* roffset[i]..roffset[i+1] delimits this rank's slice of the concatenated rmine/rremote arrays */
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  /* Give the implementation a chance to print type-specific information */
  if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
936 
937 /*@C
938    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
939 
940    Not Collective
941 
942    Input Parameter:
943 .  sf - star forest
944 
945    Output Parameters:
946 +  nranks - number of ranks referenced by local part
947 .  ranks - array of ranks
948 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
949 .  rmine - concatenated array holding local indices referencing each remote rank
950 -  rremote - concatenated array holding remote indices referenced for each remote rank
951 
952    Level: developer
953 
954 .seealso: PetscSFGetLeafRanks()
955 @*/
956 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
957 {
958   PetscErrorCode ierr;
959 
960   PetscFunctionBegin;
961   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
962   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
963   if (sf->ops->GetRootRanks) {
964     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
965   } else {
966     /* The generic implementation */
967     if (nranks)  *nranks  = sf->nranks;
968     if (ranks)   *ranks   = sf->ranks;
969     if (roffset) *roffset = sf->roffset;
970     if (rmine)   *rmine   = sf->rmine;
971     if (rremote) *rremote = sf->rremote;
972   }
973   PetscFunctionReturn(0);
974 }
975 
976 /*@C
977    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
978 
979    Not Collective
980 
981    Input Parameter:
982 .  sf - star forest
983 
984    Output Parameters:
985 +  niranks - number of leaf ranks referencing roots on this process
986 .  iranks - array of ranks
987 .  ioffset - offset in irootloc for each rank (length niranks+1)
988 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
989 
990    Level: developer
991 
992 .seealso: PetscSFGetRootRanks()
993 @*/
994 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
995 {
996   PetscErrorCode ierr;
997 
998   PetscFunctionBegin;
999   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1000   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
1001   if (sf->ops->GetLeafRanks) {
1002     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
1003   } else {
1004     PetscSFType type;
1005     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
1006     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
1007   }
1008   PetscFunctionReturn(0);
1009 }
1010 
1011 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
1012   PetscInt i;
1013   for (i=0; i<n; i++) {
1014     if (needle == list[i]) return PETSC_TRUE;
1015   }
1016   return PETSC_FALSE;
1017 }
1018 
1019 /*@C
1020    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.
1021 
1022    Collective
1023 
1024    Input Parameters:
1025 +  sf - PetscSF to set up; PetscSFSetGraph() must have been called
1026 -  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)
1027 
1028    Level: developer
1029 
1030 .seealso: PetscSFGetRootRanks()
1031 @*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
  /* Count, per remote rank, how many of our leaves reference it (table: rank -> leaf count) */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  /* rmine/rremote are sized nleaves: they are the concatenation over ranks of per-rank leaf lists */
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  /* Drain the table into parallel arrays ranks[]/rcount[] (iteration order is unspecified) */
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    /* Translate the distinguished group's member list into rank numbers on our communicator */
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRMPI(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRMPI(ierr);}
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  /* Two-pointer swap pass: sf->ndranks advances past distinguished entries from the front,
     i retreats past non-distinguished entries from the back; out-of-place pairs are swapped */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i;) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for (; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition (distinguished, then the rest) by rank, keeping counts aligned */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Prefix-sum the counts into roffset[]; reset rcount[] for reuse as per-rank fill cursors */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Scatter each leaf into its rank's slice of rmine/rremote */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit: reuse irank while consecutive leaves reference the same remote rank (orank) */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        /* Not distinguished; binary-search the non-distinguished partition and shift the result */
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1122 
1123 /*@C
1124    PetscSFGetGroups - gets incoming and outgoing process groups
1125 
1126    Collective
1127 
1128    Input Parameter:
1129 .  sf - star forest
1130 
1131    Output Parameters:
1132 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1133 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1134 
1135    Level: developer
1136 
1137 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
1138 @*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  /* Both groups are computed lazily and cached on the SF; later calls just return the cache */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks */
    /* Build an auxiliary SF with one local root and one leaf per referenced rank;
       the root's indegree then equals the number of ranks whose leaves reference us */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
    /* Enumerate the incoming ranks */
    /* Each leaf sends its own rank number; gathering at the root collects the rank
       of every process that references a root here */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) {
    /* Outgoing group is immediate: sf->ranks already lists the ranks our leaves reference */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1185 
1186 /*@
1187    PetscSFGetMultiSF - gets the inner SF implementing gathers and scatters
1188 
1189    Collective
1190 
1191    Input Parameter:
1192 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1193 
1194    Output Parameter:
1195 .  multi - star forest with split roots, such that each root has degree exactly 1
1196 
1197    Level: developer
1198 
1199    Notes:
1200 
1201    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1202    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1203    edge, it is a candidate for future optimization that might involve its removal.
1204 
1205 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1206 @*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    sf->multi->multi = sf->multi;   /* a multi-SF is its own multi-SF */
    PetscFunctionReturn(0);
  }
  /* Build lazily on first request; cached on the SF afterwards */
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    /* inoffset = prefix sum of root degrees: the start of each root's slot range in the multi-root space */
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    /* Fetch-and-add of 1 per leaf hands each leaf a unique slot index within its root's range */
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
    if (PetscDefined(USE_DEBUG)) { /* Check that the expected number of increments occurred */
      for (i=0; i<sf->nroots; i++) {
        if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
      }
    }
    /* Same leaves as sf, but each leaf now points at its private slot in the multi-root space */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    sf->multi->multi = sf->multi;
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      /* Reassign slots so that within each root's range, slots are ordered by sending rank */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      /* Deposit the sending rank into each slot so roots can see who owns which slot */
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPI_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPI_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Broadcast each leaf its re-sorted slot index, then rebuild the multi-SF graph with it */
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset,MPI_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset,MPI_REPLACE);CHKERRQ(ierr);
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1279 
1280 /*@C
1281    PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots, does not remap indices
1282 
1283    Collective
1284 
1285    Input Parameters:
1286 +  sf - original star forest
1287 .  nselected  - number of selected roots on this process
1288 -  selected   - indices of the selected roots on this process
1289 
1290    Output Parameter:
1291 .  esf - new star forest
1292 
1293    Level: advanced
1294 
1295    Note:
1296    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1297    be done by calling PetscSFGetGraph().
1298 
1299 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1300 @*/
1301 PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *esf)
1302 {
1303   PetscInt          i,j,n,nroots,nleaves,esf_nleaves,*new_ilocal,minleaf,maxleaf,maxlocal;
1304   const PetscInt    *ilocal;
1305   signed char       *rootdata,*leafdata,*leafmem;
1306   const PetscSFNode *iremote;
1307   PetscSFNode       *new_iremote;
1308   MPI_Comm          comm;
1309   PetscErrorCode    ierr;
1310 
1311   PetscFunctionBegin;
1312   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1313   PetscSFCheckGraphSet(sf,1);
1314   if (nselected) PetscValidPointer(selected,3);
1315   PetscValidPointer(esf,4);
1316 
1317   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1318   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1319   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1320   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1321 
1322   if (PetscDefined(USE_DEBUG)) {  /* Error out if selected[] has dups or  out of range indices */
1323     PetscBool dups;
1324     ierr = PetscCheckDupsInt(nselected,selected,&dups);CHKERRQ(ierr);
1325     if (dups) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"selected[] has dups");
1326     for (i=0; i<nselected; i++)
1327       if (selected[i] < 0 || selected[i] >= nroots) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"selected root indice %D is out of [0,%D)",selected[i],nroots);
1328   }
1329 
1330   if (sf->ops->CreateEmbeddedRootSF) {
1331     ierr = (*sf->ops->CreateEmbeddedRootSF)(sf,nselected,selected,esf);CHKERRQ(ierr);
1332   } else {
1333     /* A generic version of creating embedded sf */
1334     ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
1335     maxlocal = maxleaf - minleaf + 1;
1336     ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafmem);CHKERRQ(ierr);
1337     leafdata = leafmem - minleaf;
1338     /* Tag selected roots and bcast to leaves */
1339     for (i=0; i<nselected; i++) rootdata[selected[i]] = 1;
1340     ierr = PetscSFBcastBegin(sf,MPI_SIGNED_CHAR,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1341     ierr = PetscSFBcastEnd(sf,MPI_SIGNED_CHAR,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1342 
1343     /* Build esf with leaves that are still connected */
1344     esf_nleaves = 0;
1345     for (i=0; i<nleaves; i++) {
1346       j = ilocal ? ilocal[i] : i;
1347       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1348          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1349       */
1350       esf_nleaves += (leafdata[j] ? 1 : 0);
1351     }
1352     ierr = PetscMalloc1(esf_nleaves,&new_ilocal);CHKERRQ(ierr);
1353     ierr = PetscMalloc1(esf_nleaves,&new_iremote);CHKERRQ(ierr);
1354     for (i=n=0; i<nleaves; i++) {
1355       j = ilocal ? ilocal[i] : i;
1356       if (leafdata[j]) {
1357         new_ilocal[n]        = j;
1358         new_iremote[n].rank  = iremote[i].rank;
1359         new_iremote[n].index = iremote[i].index;
1360         ++n;
1361       }
1362     }
1363     ierr = PetscSFCreate(comm,esf);CHKERRQ(ierr);
1364     ierr = PetscSFSetFromOptions(*esf);CHKERRQ(ierr);
1365     ierr = PetscSFSetGraph(*esf,nroots,esf_nleaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1366     ierr = PetscFree2(rootdata,leafmem);CHKERRQ(ierr);
1367   }
1368   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1369   PetscFunctionReturn(0);
1370 }
1371 
1372 /*@C
1373   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1374 
1375   Collective
1376 
1377   Input Parameters:
1378 + sf - original star forest
1379 . nselected  - number of selected leaves on this process
1380 - selected   - indices of the selected leaves on this process
1381 
1382   Output Parameter:
1383 .  newsf - new star forest
1384 
1385   Level: advanced
1386 
1387 .seealso: PetscSFCreateEmbeddedRootSF(), PetscSFSetGraph(), PetscSFGetGraph()
1388 @*/
1389 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1390 {
1391   const PetscSFNode *iremote;
1392   PetscSFNode       *new_iremote;
1393   const PetscInt    *ilocal;
1394   PetscInt          i,nroots,*leaves,*new_ilocal;
1395   MPI_Comm          comm;
1396   PetscErrorCode    ierr;
1397 
1398   PetscFunctionBegin;
1399   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1400   PetscSFCheckGraphSet(sf,1);
1401   if (nselected) PetscValidPointer(selected,3);
1402   PetscValidPointer(newsf,4);
1403 
1404   /* Uniq selected[] and put results in leaves[] */
1405   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1406   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1407   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1408   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1409   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1410 
1411   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1412   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1413     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1414   } else {
1415     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1416     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1417     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1418     for (i=0; i<nselected; ++i) {
1419       const PetscInt l     = leaves[i];
1420       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1421       new_iremote[i].rank  = iremote[l].rank;
1422       new_iremote[i].index = iremote[l].index;
1423     }
1424     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1425     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1426   }
1427   ierr = PetscFree(leaves);CHKERRQ(ierr);
1428   PetscFunctionReturn(0);
1429 }
1430 
1431 /*@C
1432    PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastEnd()
1433 
1434    Collective on PetscSF
1435 
1436    Input Parameters:
1437 +  sf - star forest on which to communicate
1438 .  unit - data type associated with each node
1439 .  rootdata - buffer to broadcast
1440 -  op - operation to use for reduction
1441 
1442    Output Parameter:
1443 .  leafdata - buffer to be reduced with values from each leaf's respective root
1444 
1445    Level: intermediate
1446 
1447    Notes:
1448     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1449     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1450     use PetscSFBcastWithMemTypeBegin() instead.
1451 .seealso: PetscSFBcastEnd(), PetscSFBcastWithMemTypeBegin()
1452 @*/
1453 PetscErrorCode PetscSFBcastBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1454 {
1455   PetscErrorCode ierr;
1456   PetscMemType   rootmtype,leafmtype;
1457 
1458   PetscFunctionBegin;
1459   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1460   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1461   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
1462   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1463   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1464   ierr = (*sf->ops->BcastBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1465   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
1466   PetscFunctionReturn(0);
1467 }
1468 
/*@C
   PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call to PetscSFBcastEnd()

   Collective on PetscSF

   Input Parameters:
+  sf - star forest on which to communicate
.  unit - data type associated with each node
.  rootmtype - memory type of rootdata
.  rootdata - buffer to broadcast
.  leafmtype - memory type of leafdata
-  op - operation to use for reduction

   Output Parameter:
.  leafdata - buffer to be reduced with values from each leaf's respective root

   Level: intermediate

   Notes:
   Unlike PetscSFBcastBegin(), this routine does not query the memory type of the data pointers,
   so it avoids that overhead when the caller already knows where the buffers reside.

.seealso: PetscSFBcastEnd(), PetscSFBcastBegin()
@*/
PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* NOTE(review): sf->vscat.logging presumably means this SF backs a VecScatter that does its own event logging -- confirm */
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
  /* Dispatch directly with caller-provided memory types (no PetscGetMemType() queries here) */
  ierr = (*sf->ops->BcastBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1501 
/*@C
   PetscSFBcastEnd - end a broadcast & reduce operation started with PetscSFBcastBegin()

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
.  rootdata - buffer to broadcast
-  op - operation to use for reduction

   Output Parameter:
.  leafdata - buffer to be reduced with values from each leaf's respective root

   Level: intermediate

.seealso: PetscSFSetGraph(), PetscSFReduceEnd()
@*/
PetscErrorCode PetscSFBcastEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastEnd,sf,0,0,0);CHKERRQ(ierr);}
  /* Complete the operation started by PetscSFBcastBegin() or PetscSFBcastWithMemTypeBegin() */
  ierr = (*sf->ops->BcastEnd)(sf,unit,rootdata,leafdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastEnd,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1531 
1532 /*@C
1533    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1534 
1535    Collective
1536 
1537    Input Parameters:
1538 +  sf - star forest
1539 .  unit - data type
1540 .  leafdata - values to reduce
1541 -  op - reduction operation
1542 
1543    Output Parameter:
1544 .  rootdata - result of reduction of values from all leaves of each root
1545 
1546    Level: intermediate
1547 
1548    Notes:
1549     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1550     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1551     use PetscSFReduceWithMemTypeBegin() instead.
1552 
1553 .seealso: PetscSFBcastBegin(), PetscSFReduceWithMemTypeBegin()
1554 @*/
1555 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1556 {
1557   PetscErrorCode ierr;
1558   PetscMemType   rootmtype,leafmtype;
1559 
1560   PetscFunctionBegin;
1561   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1562   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1563   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1564   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1565   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1566   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1567   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1568   PetscFunctionReturn(0);
1569 }
1570 
1571 /*@C
1572    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to PetscSFReduceEnd()
1573 
1574    Collective
1575 
1576    Input Parameters:
1577 +  sf - star forest
1578 .  unit - data type
1579 .  leafmtype - memory type of leafdata
1580 .  leafdata - values to reduce
1581 .  rootmtype - memory type of rootdata
1582 -  op - reduction operation
1583 
1584    Output Parameter:
1585 .  rootdata - result of reduction of values from all leaves of each root
1586 
1587    Level: intermediate
1588 
1589 .seealso: PetscSFBcastBegin(), PetscSFReduceBegin()
1590 @*/
1591 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
1592 {
1593   PetscErrorCode ierr;
1594 
1595   PetscFunctionBegin;
1596   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1597   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1598   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1599   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1600   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1601   PetscFunctionReturn(0);
1602 }
1603 
/*@C
   PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
.  leafdata - values to reduce
-  op - reduction operation

   Output Parameter:
.  rootdata - result of reduction of values from all leaves of each root

   Level: intermediate

.seealso: PetscSFSetGraph(), PetscSFBcastEnd()
@*/
PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
  /* Complete the operation started by PetscSFReduceBegin() or PetscSFReduceWithMemTypeBegin() */
  ierr = (*sf->ops->ReduceEnd)(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1633 
/*@C
   PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Parameters:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

   Note:
   The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
   might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
   not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
   integers.

.seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscMemType   rootmtype,leafmtype,leafupdatemtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
  /* The implementations only support leafdata and leafupdate in the same memory space */
  if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
  ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1676 
/*@C
   PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Parameters:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

.seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  /* Complete the operation started by PetscSFFetchAndOpBegin() */
  ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootdata,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1707 
/*@C
   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()

   Collective

   Input Parameter:
.  sf - star forest

   Output Parameter:
.  degree - degree of each root vertex

   Level: advanced

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

   The result is cached on the SF; subsequent begin/end pairs return the cached array without communication.

.seealso: PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal;
    /* sf->degree non-NULL here means a Begin without a matching End */
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    maxlocal = sf->maxleaf-sf->minleaf+1;
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    /* Each leaf contributes 1; summing over leaves at each root yields the root degree */
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    /* degreetmp is indexed from minleaf, so shift the base pointer to make leaf indices line up */
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  /* Result is not available until PetscSFComputeDegreeEnd() */
  *degree = NULL;
  PetscFunctionReturn(0);
}
1747 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()

   Collective

   Input Parameter:
.  sf - star forest

   Output Parameter:
.  degree - degree of each root vertex

   Level: developer

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFComputeDegreeBegin()
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    /* Finish the MPI_SUM reduction of per-leaf ones started in PetscSFComputeDegreeBegin() */
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    sf->degreeknown = PETSC_TRUE; /* cache the result; later calls skip communication */
  }
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1783 
1784 /*@C
1785    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1786    Each multi-root is assigned index of the corresponding original root.
1787 
1788    Collective
1789 
1790    Input Parameters:
1791 +  sf - star forest
1792 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1793 
1794    Output Parameters:
1795 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1796 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1797 
1798    Level: developer
1799 
1800    Notes:
1801    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1802 
1803 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1804 @*/
1805 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1806 {
1807   PetscSF             msf;
1808   PetscInt            i, j, k, nroots, nmroots;
1809   PetscErrorCode      ierr;
1810 
1811   PetscFunctionBegin;
1812   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1813   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1814   if (nroots) PetscValidIntPointer(degree,2);
1815   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1816   PetscValidPointer(multiRootsOrigNumbering,4);
1817   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1818   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1819   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1820   for (i=0,j=0,k=0; i<nroots; i++) {
1821     if (!degree[i]) continue;
1822     for (j=0; j<degree[i]; j++,k++) {
1823       (*multiRootsOrigNumbering)[k] = i;
1824     }
1825   }
1826   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1827   if (nMultiRoots) *nMultiRoots = nmroots;
1828   PetscFunctionReturn(0);
1829 }
1830 
/*@C
   PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
-  leafdata - leaf data to gather to roots

   Output Parameter:
.  multirootdata - root buffer to gather into, amount of space per root is equal to its degree

   Level: intermediate

.seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
@*/
PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Gather = reduce with MPI_REPLACE over the multi-SF, whose roots provide one slot per incoming leaf */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPI_REPLACE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1860 
/*@C
   PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
-  leafdata - leaf data to gather to roots

   Output Parameter:
.  multirootdata - root buffer to gather into, amount of space per root is equal to its degree

   Level: intermediate

.seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
@*/
PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Complete the multi-SF reduction started in PetscSFGatherBegin() */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPI_REPLACE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1889 
/*@C
   PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Parameter:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeBegin(), PetscSFGatherBegin(), PetscSFScatterEnd()
@*/
PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Scatter = broadcast with MPI_REPLACE over the multi-SF, sending each leaf its personal root slot */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1919 
/*@C
   PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Parameter:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeEnd(), PetscSFGatherEnd(), PetscSFScatterBegin()
@*/
PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Complete the multi-SF broadcast started in PetscSFScatterBegin() */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1948 
1949 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1950 {
1951   PetscInt        i, n, nleaves;
1952   const PetscInt *ilocal = NULL;
1953   PetscHSetI      seen;
1954   PetscErrorCode  ierr;
1955 
1956   PetscFunctionBegin;
1957   if (PetscDefined(USE_DEBUG)) {
1958     ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1959     ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1960     for (i = 0; i < nleaves; i++) {
1961       const PetscInt leaf = ilocal ? ilocal[i] : i;
1962       ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1963     }
1964     ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1965     if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1966     ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1967   }
1968   PetscFunctionReturn(0);
1969 }
1970 
/*@
  PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots.

  sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
  nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
  Bcast on sfA, then a Bcast on sfB, on connected nodes.

.seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  if (localPointsA) {
    /* sfA's leaf space plays the role of sfB's root space. Re-index sfA's remote roots by sfB root
       number; entries with no incoming sfA leaf keep the (-1,-1) "unconnected" sentinel. */
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue; /* sfA leaf outside sfB's root space: no path */
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA;
  }
  /* Broadcast over sfB so every sfB leaf learns the sfA root (rank,index) reachable through its sfB root */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  /* leafdataB is indexed from minleaf, so shift the base pointer */
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* denseB: sfB's leaves are contiguous (ilocal==NULL) and every one got a connected root,
     so leafdataB can be handed to the new SF directly with ilocal==NULL */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    localPointsBA  = NULL;
    remotePointsBA = leafdataB; /* ownership of leafdataB passes to sfBA below (PETSC_OWN_POINTER) */
  } else {
    /* Compact: keep only leaves whose root is connected (rank != -1) */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2060 
/*@
  PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF.

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
  second SF must have a degree of 1, i.e., no roots have more than one leaf connected.

  sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
  roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
  a Reduce on sfB, on connected roots.

.seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
@*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
  MPI_Op            op;
#if defined(PETSC_USE_64BIT_INDICES)
  PetscBool         iswin;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);

  /* TODO: Check roots of sfB have degree of 1 */
  /* Once we implement it, we can replace the MPI_MAXLOC
     with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
     We use MPI_MAXLOC only to have a deterministic output from this routine if
     the root condition is not meet.
   */
  op = MPI_MAXLOC;
#if defined(PETSC_USE_64BIT_INDICES)
  /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
  ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
  if (iswin) op = MPI_REPLACE;
#endif

  /* Lay out sfA's remote roots on sfB's leaf space (the two overlap); uncovered
     slots keep the (-1,-1) "unconnected" sentinel */
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue; /* sfA leaf outside sfB's leaf range */
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* localPointsA == NULL means contiguous leaves 0..numLeavesA-1 */
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Reduce over sfB pulls each sfA root reference from sfB's leaves back to sfB's roots;
     reorderedRemotePointsA is indexed from minleaf, so shift the base pointer */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place: keep only sfB roots that received a connected sfA root */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2164 
/*
  PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF

  Input Parameters:
. sf - The global PetscSF

  Output Parameters:
. out - The local PetscSF

  Notes:
  The local SF lives on PETSC_COMM_SELF and is returned already set up.
 */
PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
{
  MPI_Comm           comm;
  PetscMPIInt        myrank;
  const PetscInt     *ilocal;
  const PetscSFNode  *iremote;
  PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
  PetscSFNode        *liremote;
  PetscSF            lsf;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (sf->ops->CreateLocalSF) {
    /* Implementation provides its own (possibly faster) local-SF extraction */
    ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
  } else {
    /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
    ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(comm,&myrank);CHKERRMPI(ierr);

    /* Find out local edges and build a local SF */
    ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
    /* First pass: count leaves whose root lives on this rank */
    for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
    ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
    ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);

    /* Second pass: copy the intra-process edges, renumbering the root rank to 0 */
    for (i=j=0; i<nleaves; i++) {
      if (iremote[i].rank == (PetscInt)myrank) {
        lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
        liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
        liremote[j].index = iremote[i].index;
        j++;
      }
    }
    ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
    ierr = PetscSFSetFromOptions(lsf);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
    *out = lsf;
  }
  PetscFunctionReturn(0);
}
2216 
/* Similar to PetscSFBcast, but only Bcast to leaves on rank 0.
   Blocking (no separate End call); logged under the PETSCSF_BcastBegin event. */
PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
{
  PetscErrorCode     ierr;
  PetscMemType       rootmtype,leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);
  /* Determine whether the user buffers live on host or device before dispatching */
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  if (sf->ops->BcastToZero) {
    ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
  ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2235 
2236