xref: /petsc/src/vec/is/sf/interface/sf.c (revision 8a53a0a473cbd886d528d66811e6ee7fc8c83ebf)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6   #include <cuda_runtime.h>
7 #endif
8 
9 #if defined(PETSC_HAVE_HIP)
10   #include <hip/hip_runtime.h>
11 #endif
12 
#if defined(PETSC_USE_DEBUG)
/* Error out if the graph of sf (argument number arg) has not been set yet; compiled to a no-op in optimized builds */
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif
21 
22 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",NULL};
23 
/* Determine whether data points to host or device memory.

   Defaults to PETSC_MEMTYPE_HOST; reports PETSC_MEMTYPE_DEVICE only when a
   device runtime (CUDA or HIP) is built in, initialized, and identifies the
   pointer as device memory. A NULL data pointer is reported as host.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscGetMemType(const void *data,PetscMemType *type)
{
  PetscFunctionBegin;
  PetscValidPointer(type,2);
  *type = PETSC_MEMTYPE_HOST; /* default when no device runtime claims the pointer */
#if defined(PETSC_HAVE_CUDA)
  if (PetscCUDAInitialized && data) {
    cudaError_t                  cerr;
    struct cudaPointerAttributes attr;
    enum cudaMemoryType          mtype;
    cerr = cudaPointerGetAttributes(&attr,data); /* Do not check error since before CUDA 11.0, passing a host pointer returns cudaErrorInvalidValue */
    cudaGetLastError(); /* Reset the last error */
    #if (CUDART_VERSION < 10000)
      mtype = attr.memoryType; /* the attribute field was renamed to 'type' in CUDA 10 */
    #else
      mtype = attr.type;
    #endif
    if (cerr == cudaSuccess && mtype == cudaMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif

#if defined(PETSC_HAVE_HIP)
  if (PetscHIPInitialized && data) {
    hipError_t                   cerr;
    struct hipPointerAttribute_t attr;
    enum hipMemoryType           mtype;
    cerr = hipPointerGetAttributes(&attr,data); /* error deliberately not checked, mirroring the CUDA path above */
    hipGetLastError(); /* Reset the last error */
    mtype = attr.memoryType;
    if (cerr == hipSuccess && mtype == hipMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif
  PetscFunctionReturn(0);
}
58 
59 /*@
60    PetscSFCreate - create a star forest communication context
61 
62    Collective
63 
64    Input Arguments:
65 .  comm - communicator on which the star forest will operate
66 
67    Output Arguments:
68 .  sf - new star forest context
69 
70    Options Database Keys:
71 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
72 .  -sf_type window    -Use MPI-3 one-sided window for communication
73 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
74 
75    Level: intermediate
76 
77    Notes:
78    When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
79    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
80    SFs are optimized and they have better performance than general SFs.
81 
82 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
83 @*/
84 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
85 {
86   PetscErrorCode ierr;
87   PetscSF        b;
88 
89   PetscFunctionBegin;
90   PetscValidPointer(sf,2);
91   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
92 
93   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
94 
95   b->nroots    = -1;
96   b->nleaves   = -1;
97   b->minleaf   = PETSC_MAX_INT;
98   b->maxleaf   = PETSC_MIN_INT;
99   b->nranks    = -1;
100   b->rankorder = PETSC_TRUE;
101   b->ingroup   = MPI_GROUP_NULL;
102   b->outgroup  = MPI_GROUP_NULL;
103   b->graphset  = PETSC_FALSE;
104 #if defined(PETSC_HAVE_DEVICE)
105   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
106   b->use_stream_aware_mpi = PETSC_FALSE;
107   b->unknown_input_stream= PETSC_FALSE;
108   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
109     b->backend = PETSCSF_BACKEND_KOKKOS;
110   #elif defined(PETSC_HAVE_CUDA)
111     b->backend = PETSCSF_BACKEND_CUDA;
112   #elif defined(PETSC_HAVE_HIP)
113     b->backend = PETSCSF_BACKEND_HIP;
114   #endif
115 
116   #if defined(PETSC_HAVE_NVSHMEM)
117     b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
118     b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
119     ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&b->use_nvshmem,NULL);CHKERRQ(ierr);
120     ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem_get",&b->use_nvshmem_get,NULL);CHKERRQ(ierr);
121   #endif
122 #endif
123   b->vscat.from_n = -1;
124   b->vscat.to_n   = -1;
125   b->vscat.unit   = MPIU_SCALAR;
126  *sf = b;
127   PetscFunctionReturn(0);
128 }
129 
130 /*@
131    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
132 
133    Collective
134 
135    Input Arguments:
136 .  sf - star forest
137 
138    Level: advanced
139 
140 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
141 @*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the implementation release its private state before the generic fields are wiped */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Restore the graph fields to their PetscSFCreate() defaults */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRMPI(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRMPI(ierr);}
  /* Break the multi-SF's back-reference to this SF before destroying it */
  if (sf->multi) sf->multi->multi = NULL;
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);

 #if defined(PETSC_HAVE_DEVICE)
  /* Release the device-side copies of the leaf index arrays */
  for (PetscInt i=0; i<2; i++) {ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,sf->rmine_d[i]);CHKERRQ(ierr);}
 #endif

  sf->setupcalled = PETSC_FALSE; /* a subsequent use will run PetscSFSetUp() again */
  PetscFunctionReturn(0);
}
175 
176 /*@C
177    PetscSFSetType - Set the PetscSF communication implementation
178 
179    Collective on PetscSF
180 
181    Input Parameters:
182 +  sf - the PetscSF context
183 -  type - a known method
184 
185    Options Database Key:
186 .  -sf_type <type> - Sets the method; use -help for a list
187    of available methods (for instance, window, basic, neighbor)
188 
189    Notes:
190    See "include/petscsf.h" for available methods (for instance)
191 +    PETSCSFWINDOW - MPI-2/3 one-sided
192 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
193 
194   Level: intermediate
195 
196 .seealso: PetscSFType, PetscSFCreate()
197 @*/
198 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
199 {
200   PetscErrorCode ierr,(*r)(PetscSF);
201   PetscBool      match;
202 
203   PetscFunctionBegin;
204   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
205   PetscValidCharPointer(type,2);
206 
207   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
208   if (match) PetscFunctionReturn(0);
209 
210   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
211   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
212   /* Destroy the previous PetscSF implementation context */
213   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
214   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
215   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
216   ierr = (*r)(sf);CHKERRQ(ierr);
217   PetscFunctionReturn(0);
218 }
219 
220 /*@C
221   PetscSFGetType - Get the PetscSF communication implementation
222 
223   Not Collective
224 
225   Input Parameter:
226 . sf  - the PetscSF context
227 
228   Output Parameter:
229 . type - the PetscSF type name
230 
231   Level: intermediate
232 
233 .seealso: PetscSFSetType(), PetscSFCreate()
234 @*/
PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
  PetscValidPointer(type,2);
  *type = ((PetscObject)sf)->type_name; /* borrowed reference; the string is owned by sf, caller must not free it */
  PetscFunctionReturn(0);
}
243 
244 /*@C
245    PetscSFDestroy - destroy star forest
246 
247    Collective
248 
249    Input Arguments:
250 .  sf - address of star forest
251 
252    Level: intermediate
253 
254 .seealso: PetscSFCreate(), PetscSFReset()
255 @*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0); /* destroying a NULL SF is a no-op */
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Only the last reference actually frees the object; earlier calls just drop the count */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  ierr = PetscSFDestroy(&(*sf)->vscat.lsf);CHKERRQ(ierr);
  /* A derived datatype was duplicated only when bs > 1 (see PetscSFDuplicate), so only then must it be freed */
  if ((*sf)->vscat.bs > 1) {ierr = MPI_Type_free(&(*sf)->vscat.unit);CHKERRMPI(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
271 
272 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
273 {
274   PetscInt           i, nleaves;
275   PetscMPIInt        size;
276   const PetscInt    *ilocal;
277   const PetscSFNode *iremote;
278   PetscErrorCode     ierr;
279 
280   PetscFunctionBegin;
281   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
282   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
283   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
284   for (i = 0; i < nleaves; i++) {
285     const PetscInt rank = iremote[i].rank;
286     const PetscInt remote = iremote[i].index;
287     const PetscInt leaf = ilocal ? ilocal[i] : i;
288     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
289     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
290     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
291   }
292   PetscFunctionReturn(0);
293 }
294 
295 /*@
296    PetscSFSetUp - set up communication structures
297 
298    Collective
299 
300    Input Arguments:
301 .  sf - star forest communication object
302 
303    Level: beginner
304 
305 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
306 @*/
307 PetscErrorCode PetscSFSetUp(PetscSF sf)
308 {
309   PetscErrorCode ierr;
310 
311   PetscFunctionBegin;
312   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
313   PetscSFCheckGraphSet(sf,1);
314   if (sf->setupcalled) PetscFunctionReturn(0);
315   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
316   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
317   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);} /* Zero all sf->ops */
318   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
319 #if defined(PETSC_HAVE_CUDA)
320   if (sf->backend == PETSCSF_BACKEND_CUDA) {
321     sf->ops->Malloc = PetscSFMalloc_CUDA;
322     sf->ops->Free   = PetscSFFree_CUDA;
323   }
324 #endif
325 #if defined(PETSC_HAVE_HIP)
326   if (sf->backend == PETSCSF_BACKEND_HIP) {
327     sf->ops->Malloc = PetscSFMalloc_HIP;
328     sf->ops->Free   = PetscSFFree_HIP;
329   }
330 #endif
331 
332 #
333 #if defined(PETSC_HAVE_KOKKOS)
334   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
335     sf->ops->Malloc = PetscSFMalloc_Kokkos;
336     sf->ops->Free   = PetscSFFree_Kokkos;
337   }
338 #endif
339   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
340   sf->setupcalled = PETSC_TRUE;
341   PetscFunctionReturn(0);
342 }
343 
344 /*@
345    PetscSFSetFromOptions - set PetscSF options using the options database
346 
347    Logically Collective
348 
349    Input Arguments:
350 .  sf - star forest
351 
352    Options Database Keys:
353 +  -sf_type               - implementation type, see PetscSFSetType()
354 .  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
.  -sf_unknown_input_stream - Assume SF root/leafdata is computed on arbitrary streams unknown to SF (default: false).
358 .  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
359                                If true, this option only works with -use_gpu_aware_mpi 1.
360 
361 -  -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently SF has these backends: cuda, hip and Kokkos.
362                               On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
                              the only available backend is kokkos.
364 
365    Level: intermediate
366 @*/
367 PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
368 {
369   PetscSFType    deft;
370   char           type[256];
371   PetscErrorCode ierr;
372   PetscBool      flg;
373 
374   PetscFunctionBegin;
375   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
376   ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
377   deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
378   ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
379   ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
380   ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
381  #if defined(PETSC_HAVE_DEVICE)
382   {
383     char        backendstr[32] = {0};
384     PetscBool   isCuda = PETSC_FALSE,isHip = PETSC_FALSE,isKokkos = PETSC_FALSE,set;
385     /* Change the defaults set in PetscSFCreate() with command line options */
386     ierr = PetscOptionsBool("-sf_unknown_input_stream","SF root/leafdata is computed on arbitary streams unknown to SF","PetscSFSetFromOptions",sf->unknown_input_stream,&sf->unknown_input_stream,NULL);CHKERRQ(ierr);
387     ierr = PetscOptionsBool("-sf_use_stream_aware_mpi","Assume the underlying MPI is cuda-stream aware","PetscSFSetFromOptions",sf->use_stream_aware_mpi,&sf->use_stream_aware_mpi,NULL);CHKERRQ(ierr);
388     ierr = PetscOptionsString("-sf_backend","Select the device backend SF uses","PetscSFSetFromOptions",NULL,backendstr,sizeof(backendstr),&set);CHKERRQ(ierr);
389     ierr = PetscStrcasecmp("cuda",backendstr,&isCuda);CHKERRQ(ierr);
390     ierr = PetscStrcasecmp("kokkos",backendstr,&isKokkos);CHKERRQ(ierr);
391     ierr = PetscStrcasecmp("hip",backendstr,&isHip);CHKERRQ(ierr);
392   #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
393     if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
394     else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
395     else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
396     else if (set) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
397   #elif defined(PETSC_HAVE_KOKKOS)
398     if (set && !isKokkos) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You can only choose kokkos", backendstr);
399    #endif
400   }
401  #endif
402   if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
403   ierr = PetscOptionsEnd();CHKERRQ(ierr);
404   PetscFunctionReturn(0);
405 }
406 
407 /*@
408    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
409 
410    Logically Collective
411 
412    Input Arguments:
413 +  sf - star forest
414 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
415 
416    Level: advanced
417 
418 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
419 @*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveBool(sf,flg,2);
  /* Once the multi-SF exists its ordering is fixed, so the flag may no longer be changed */
  if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(0);
}
429 
430 /*@
431    PetscSFSetGraph - Set a parallel star forest
432 
433    Collective
434 
435    Input Arguments:
436 +  sf - star forest
437 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
438 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
439 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
440 during setup in debug mode)
441 .  localmode - copy mode for ilocal
442 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
443 during setup in debug mode)
444 -  remotemode - copy mode for iremote
445 
446    Level: intermediate
447 
448    Notes:
449     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
450 
451    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
452    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
453    needed
454 
455    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
456    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
457 
458 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
459 @*/
460 PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
461 {
462   PetscErrorCode ierr;
463 
464   PetscFunctionBegin;
465   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
466   if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
467   if (nleaves > 0) PetscValidPointer(iremote,6);
468   if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
469   if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);
470 
471   if (sf->nroots >= 0) { /* Reset only if graph already set */
472     ierr = PetscSFReset(sf);CHKERRQ(ierr);
473   }
474 
475   ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
476 
477   sf->nroots  = nroots;
478   sf->nleaves = nleaves;
479 
480   if (nleaves && ilocal) {
481     PetscInt i;
482     PetscInt minleaf = PETSC_MAX_INT;
483     PetscInt maxleaf = PETSC_MIN_INT;
484     int      contiguous = 1;
485     for (i=0; i<nleaves; i++) {
486       minleaf = PetscMin(minleaf,ilocal[i]);
487       maxleaf = PetscMax(maxleaf,ilocal[i]);
488       contiguous &= (ilocal[i] == i);
489     }
490     sf->minleaf = minleaf;
491     sf->maxleaf = maxleaf;
492     if (contiguous) {
493       if (localmode == PETSC_OWN_POINTER) {
494         ierr = PetscFree(ilocal);CHKERRQ(ierr);
495       }
496       ilocal = NULL;
497     }
498   } else {
499     sf->minleaf = 0;
500     sf->maxleaf = nleaves - 1;
501   }
502 
503   if (ilocal) {
504     switch (localmode) {
505     case PETSC_COPY_VALUES:
506       ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
507       ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
508       sf->mine = sf->mine_alloc;
509       break;
510     case PETSC_OWN_POINTER:
511       sf->mine_alloc = (PetscInt*)ilocal;
512       sf->mine       = sf->mine_alloc;
513       break;
514     case PETSC_USE_POINTER:
515       sf->mine_alloc = NULL;
516       sf->mine       = (PetscInt*)ilocal;
517       break;
518     default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
519     }
520   }
521 
522   switch (remotemode) {
523   case PETSC_COPY_VALUES:
524     ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
525     ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
526     sf->remote = sf->remote_alloc;
527     break;
528   case PETSC_OWN_POINTER:
529     sf->remote_alloc = (PetscSFNode*)iremote;
530     sf->remote       = sf->remote_alloc;
531     break;
532   case PETSC_USE_POINTER:
533     sf->remote_alloc = NULL;
534     sf->remote       = (PetscSFNode*)iremote;
535     break;
536   default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
537   }
538 
539   ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
540   sf->graphset = PETSC_TRUE;
541   PetscFunctionReturn(0);
542 }
543 
544 /*@
545   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
546 
547   Collective
548 
549   Input Parameters:
550 + sf      - The PetscSF
551 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
552 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
553 
554   Notes:
555   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
556   n and N are local and global sizes of x respectively.
557 
558   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
559   sequential vectors y on all ranks.
560 
561   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
562   sequential vector y on rank 0.
563 
564   In above cases, entries of x are roots and entries of y are leaves.
565 
566   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
567   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
  of rank j. Here 0 <= i,j < NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one cannot send multiple items. One just needs to create a new MPI datatype for the multiple data
570   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
571 
572   In this case, roots and leaves are symmetric.
573 
574   Level: intermediate
575  @*/
576 PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
577 {
578   MPI_Comm       comm;
579   PetscInt       n,N,res[2];
580   PetscMPIInt    rank,size;
581   PetscSFType    type;
582   PetscErrorCode ierr;
583 
584   PetscFunctionBegin;
585   ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
586   if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
587   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
588   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
589 
590   if (pattern == PETSCSF_PATTERN_ALLTOALL) {
591     type = PETSCSFALLTOALL;
592     ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
593     ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
594     ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
595     ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
596   } else {
597     ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
598     ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
599     res[0] = n;
600     res[1] = -n;
601     /* Check if n are same over all ranks so that we can optimize it */
602     ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
603     if (res[0] == -res[1]) { /* same n */
604       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
605     } else {
606       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
607     }
608     ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
609   }
610   ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);
611 
612   sf->pattern = pattern;
613   sf->mine    = NULL; /* Contiguous */
614 
615   /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
616      Also set other easy stuff.
617    */
618   if (pattern == PETSCSF_PATTERN_ALLGATHER) {
619     sf->nleaves      = N;
620     sf->nroots       = n;
621     sf->nranks       = size;
622     sf->minleaf      = 0;
623     sf->maxleaf      = N - 1;
624   } else if (pattern == PETSCSF_PATTERN_GATHER) {
625     sf->nleaves      = rank ? 0 : N;
626     sf->nroots       = n;
627     sf->nranks       = rank ? 0 : size;
628     sf->minleaf      = 0;
629     sf->maxleaf      = rank ? -1 : N - 1;
630   } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
631     sf->nleaves      = size;
632     sf->nroots       = size;
633     sf->nranks       = size;
634     sf->minleaf      = 0;
635     sf->maxleaf      = size - 1;
636   }
637   sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
638   sf->graphset = PETSC_TRUE;
639   PetscFunctionReturn(0);
640 }
641 
642 /*@
643    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
644 
645    Collective
646 
   Input Arguments:
.  sf - star forest to invert

   Output Arguments:
.  isf - inverse of sf

   Level: advanced
654 
655    Notes:
656    All roots must have degree 1.
657 
658    The local space may be a permutation, but cannot be sparse.
659 
660 .seealso: PetscSFSetGraph()
661 @*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Each leaf slot records its own (rank,index); the reduction then tells every root which leaf references it */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* Mark all roots as unreferenced; entries still (-1,-1) after the reduction received no leaf */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPI_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* every root was referenced: inverse leaves are contiguous */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    /* In-place compaction: count <= i always holds, so no entry is overwritten before it is read */
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* newilocal ownership transfers to *isf (PETSC_OWN_POINTER); roots is copied, then freed below */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
711 
712 /*@
713    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
714 
715    Collective
716 
717    Input Arguments:
718 +  sf - communication object to duplicate
719 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
720 
721    Output Arguments:
722 .  newsf - new communication object
723 
724    Level: beginner
725 
726 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
727 @*/
PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
{
  PetscSFType    type;
  PetscErrorCode ierr;
  MPI_Datatype   dtype=MPIU_SCALAR;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,opt,2);
  PetscValidPointer(newsf,3);
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
  ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
  if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf,1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graphs are deep-copied into the new SF */
      PetscInt          nroots,nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
      ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
    } else {
      /* Patterned graphs are recreated from the stored layout and pattern */
      ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
    }
  }
  /* Since oldtype is committed, so is newtype, according to MPI */
  if (sf->vscat.bs > 1) {ierr = MPI_Type_dup(sf->vscat.unit,&dtype);CHKERRMPI(ierr);}
  (*newsf)->vscat.bs     = sf->vscat.bs;
  (*newsf)->vscat.unit   = dtype;
  (*newsf)->vscat.to_n   = sf->vscat.to_n;
  (*newsf)->vscat.from_n = sf->vscat.from_n;
  /* Do not copy lsf. Build it on demand since it is rarely used */

#if defined(PETSC_HAVE_DEVICE)
  /* Carry the device configuration over to the duplicate */
  (*newsf)->backend              = sf->backend;
  (*newsf)->unknown_input_stream= sf->unknown_input_stream;
  (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
  (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
#endif
  /* Let the implementation duplicate its private state */
  if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
  /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
  PetscFunctionReturn(0);
}
771 
772 /*@C
773    PetscSFGetGraph - Get the graph specifying a parallel star forest
774 
775    Not Collective
776 
777    Input Arguments:
778 .  sf - star forest
779 
780    Output Arguments:
781 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
782 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
783 .  ilocal - locations of leaves in leafdata buffers
784 -  iremote - remote locations of root vertices for each leaf on the current process
785 
786    Notes:
787    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
788 
789    When called from Fortran, the returned iremote array is a copy and must be deallocated after use. Consequently, if you
790    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
791 
792    Level: intermediate
793 
794 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
795 @*/
796 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
797 {
798   PetscErrorCode ierr;
799 
800   PetscFunctionBegin;
801   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
802   if (sf->ops->GetGraph) {
803     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
804   } else {
805     if (nroots) *nroots = sf->nroots;
806     if (nleaves) *nleaves = sf->nleaves;
807     if (ilocal) *ilocal = sf->mine;
808     if (iremote) *iremote = sf->remote;
809   }
810   PetscFunctionReturn(0);
811 }
812 
813 /*@
814    PetscSFGetLeafRange - Get the active leaf ranges
815 
816    Not Collective
817 
818    Input Arguments:
819 .  sf - star forest
820 
821    Output Arguments:
822 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
823 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
824 
825    Level: developer
826 
827 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
828 @*/
829 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
830 {
831   PetscFunctionBegin;
832   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
833   PetscSFCheckGraphSet(sf,1);
834   if (minleaf) *minleaf = sf->minleaf;
835   if (maxleaf) *maxleaf = sf->maxleaf;
836   PetscFunctionReturn(0);
837 }
838 
839 /*@C
840    PetscSFViewFromOptions - View from Options
841 
842    Collective on PetscSF
843 
844    Input Parameters:
845 +  A - the star forest
846 .  obj - Optional object
847 -  name - command line option
848 
849    Level: intermediate
850 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
851 @*/
852 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
853 {
854   PetscErrorCode ierr;
855 
856   PetscFunctionBegin;
857   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
858   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
859   PetscFunctionReturn(0);
860 }
861 
/*@C
   PetscSFView - view a star forest

   Collective

   Input Arguments:
+  sf - star forest
-  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD

   Level: beginner

.seealso: PetscSFCreate(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to stdout on the SF's communicator when no viewer is supplied */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Set up first so the per-rank tables (ranks/roffset/rmine/rremote) printed below exist */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* Nothing to print until the graph has been supplied */
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      /* Synchronized printing emits each process's lines grouped in rank order */
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        /* NULL sf->mine means the leaves are the contiguous range [0,nleaves) */
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* sf->ranks is only sorted within its two segments (distinguished first); sort a copy
           together with a permutation so the detailed listing comes out in ascending rank order */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          /* roffset[] is a CSR-style index into rmine/rremote for this rank's edges */
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  /* Give the SF implementation a chance to print type-specific information */
  if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
932 
933 /*@C
934    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
935 
936    Not Collective
937 
938    Input Arguments:
939 .  sf - star forest
940 
941    Output Arguments:
942 +  nranks - number of ranks referenced by local part
943 .  ranks - array of ranks
944 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
945 .  rmine - concatenated array holding local indices referencing each remote rank
946 -  rremote - concatenated array holding remote indices referenced for each remote rank
947 
948    Level: developer
949 
950 .seealso: PetscSFGetLeafRanks()
951 @*/
952 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
953 {
954   PetscErrorCode ierr;
955 
956   PetscFunctionBegin;
957   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
958   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
959   if (sf->ops->GetRootRanks) {
960     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
961   } else {
962     /* The generic implementation */
963     if (nranks)  *nranks  = sf->nranks;
964     if (ranks)   *ranks   = sf->ranks;
965     if (roffset) *roffset = sf->roffset;
966     if (rmine)   *rmine   = sf->rmine;
967     if (rremote) *rremote = sf->rremote;
968   }
969   PetscFunctionReturn(0);
970 }
971 
972 /*@C
973    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
974 
975    Not Collective
976 
977    Input Arguments:
978 .  sf - star forest
979 
980    Output Arguments:
981 +  niranks - number of leaf ranks referencing roots on this process
982 .  iranks - array of ranks
983 .  ioffset - offset in irootloc for each rank (length niranks+1)
984 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
985 
986    Level: developer
987 
988 .seealso: PetscSFGetRootRanks()
989 @*/
990 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
991 {
992   PetscErrorCode ierr;
993 
994   PetscFunctionBegin;
995   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
996   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
997   if (sf->ops->GetLeafRanks) {
998     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
999   } else {
1000     PetscSFType type;
1001     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
1002     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
1003   }
1004   PetscFunctionReturn(0);
1005 }
1006 
1007 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
1008   PetscInt i;
1009   for (i=0; i<n; i++) {
1010     if (needle == list[i]) return PETSC_TRUE;
1011   }
1012   return PETSC_FALSE;
1013 }
1014 
/*@C
   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.

   Collective

   Input Arguments:
+  sf - PetscSF to set up; PetscSFSetGraph() must have been called
-  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)

   Level: developer

.seealso: PetscSFGetRootRanks()
@*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
  /* Count how many leaves reference each remote rank, using a hash table keyed by rank */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  /* rmine/rremote are sized by nleaves since every leaf lands in exactly one rank's segment */
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    /* Translate the distinguished group's members to rank numbers in the SF's communicator */
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRMPI(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRMPI(ierr);}
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i;) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for (; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each segment (distinguished and non-distinguished) by rank, carrying the counts along */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Build CSR-style offsets into rmine/rremote; rcount[] is zeroed to be reused as a fill cursor below */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Fill rmine (local leaf indices) and rremote (remote root indices), grouped by owning rank.
     orank caches the last rank looked up so consecutive leaves to the same rank skip the search. */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks; try the distinguished segment first */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1118 
/*@C
   PetscSFGetGroups - gets incoming and outgoing process groups

   Collective

   Input Argument:
.  sf - star forest

   Output Arguments:
+  incoming - group of origin processes for incoming edges (leaves that reference my roots)
-  outgoing - group of destination processes for outgoing edges (roots that I reference)

   Level: developer

.seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
@*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  /* Groups are computed lazily and cached on the SF (sf->ingroup / sf->outgroup) */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks */
    /* Build an auxiliary SF with one root and one leaf per referenced rank; its root
       degree equals the number of processes whose leaves reference this process's roots */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
    /* Enumerate the incoming ranks */
    /* Each process gathers the rank numbers of all processes referencing it */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) {
    /* Outgoing ranks are exactly the root ranks this process references; no communication needed */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1181 
1182 /*@
1183    PetscSFGetMultiSF - gets the inner SF implementing gathers and scatters
1184 
1185    Collective
1186 
1187    Input Argument:
1188 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1189 
1190    Output Arguments:
1191 .  multi - star forest with split roots, such that each root has degree exactly 1
1192 
1193    Level: developer
1194 
1195    Notes:
1196 
1197    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1198    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1199    edge, it is a candidate for future optimization that might involve its removal.
1200 
1201 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1202 @*/
1203 PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
1204 {
1205   PetscErrorCode ierr;
1206 
1207   PetscFunctionBegin;
1208   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1209   PetscValidPointer(multi,2);
1210   if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
1211     ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1212     *multi = sf->multi;
1213     sf->multi->multi = sf->multi;
1214     PetscFunctionReturn(0);
1215   }
1216   if (!sf->multi) {
1217     const PetscInt *indegree;
1218     PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
1219     PetscSFNode    *remote;
1220     maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
1221     ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
1222     ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
1223     ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
1224     inoffset[0] = 0;
1225     for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
1226     for (i=0; i<maxlocal; i++) outones[i] = 1;
1227     ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1228     ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1229     for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
1230     if (PetscDefined(USE_DEBUG)) { /* Check that the expected number of increments occurred */
1231       for (i=0; i<sf->nroots; i++) {
1232         if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
1233       }
1234     }
1235     ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
1236     for (i=0; i<sf->nleaves; i++) {
1237       remote[i].rank  = sf->remote[i].rank;
1238       remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
1239     }
1240     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1241     sf->multi->multi = sf->multi;
1242     ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1243     if (sf->rankorder) {        /* Sort the ranks */
1244       PetscMPIInt rank;
1245       PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
1246       PetscSFNode *newremote;
1247       ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
1248       for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
1249       ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
1250       for (i=0; i<maxlocal; i++) outranks[i] = rank;
1251       ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPI_REPLACE);CHKERRQ(ierr);
1252       ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPI_REPLACE);CHKERRQ(ierr);
1253       /* Sort the incoming ranks at each vertex, build the inverse map */
1254       for (i=0; i<sf->nroots; i++) {
1255         PetscInt j;
1256         for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
1257         ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
1258         for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
1259       }
1260       ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset,MPI_REPLACE);CHKERRQ(ierr);
1261       ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset,MPI_REPLACE);CHKERRQ(ierr);
1262       ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
1263       for (i=0; i<sf->nleaves; i++) {
1264         newremote[i].rank  = sf->remote[i].rank;
1265         newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
1266       }
1267       ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1268       ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
1269     }
1270     ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
1271   }
1272   *multi = sf->multi;
1273   PetscFunctionReturn(0);
1274 }
1275 
1276 /*@C
1277    PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots, does not remap indices
1278 
1279    Collective
1280 
1281    Input Arguments:
1282 +  sf - original star forest
1283 .  nselected  - number of selected roots on this process
1284 -  selected   - indices of the selected roots on this process
1285 
1286    Output Arguments:
1287 .  esf - new star forest
1288 
1289    Level: advanced
1290 
1291    Note:
1292    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1293    be done by calling PetscSFGetGraph().
1294 
1295 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1296 @*/
1297 PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *esf)
1298 {
1299   PetscInt          i,j,n,nroots,nleaves,esf_nleaves,*new_ilocal,minleaf,maxleaf,maxlocal;
1300   const PetscInt    *ilocal;
1301   signed char       *rootdata,*leafdata,*leafmem;
1302   const PetscSFNode *iremote;
1303   PetscSFNode       *new_iremote;
1304   MPI_Comm          comm;
1305   PetscErrorCode    ierr;
1306 
1307   PetscFunctionBegin;
1308   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1309   PetscSFCheckGraphSet(sf,1);
1310   if (nselected) PetscValidPointer(selected,3);
1311   PetscValidPointer(esf,4);
1312 
1313   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1314   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1315   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1316   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1317 
1318   if (PetscDefined(USE_DEBUG)) {  /* Error out if selected[] has dups or  out of range indices */
1319     PetscBool dups;
1320     ierr = PetscCheckDupsInt(nselected,selected,&dups);CHKERRQ(ierr);
1321     if (dups) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"selected[] has dups");
1322     for (i=0; i<nselected; i++)
1323       if (selected[i] < 0 || selected[i] >= nroots) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"selected root indice %D is out of [0,%D)",selected[i],nroots);
1324   }
1325 
1326   if (sf->ops->CreateEmbeddedRootSF) {
1327     ierr = (*sf->ops->CreateEmbeddedRootSF)(sf,nselected,selected,esf);CHKERRQ(ierr);
1328   } else {
1329     /* A generic version of creating embedded sf */
1330     ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
1331     maxlocal = maxleaf - minleaf + 1;
1332     ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafmem);CHKERRQ(ierr);
1333     leafdata = leafmem - minleaf;
1334     /* Tag selected roots and bcast to leaves */
1335     for (i=0; i<nselected; i++) rootdata[selected[i]] = 1;
1336     ierr = PetscSFBcastBegin(sf,MPI_SIGNED_CHAR,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1337     ierr = PetscSFBcastEnd(sf,MPI_SIGNED_CHAR,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1338 
1339     /* Build esf with leaves that are still connected */
1340     esf_nleaves = 0;
1341     for (i=0; i<nleaves; i++) {
1342       j = ilocal ? ilocal[i] : i;
1343       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1344          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1345       */
1346       esf_nleaves += (leafdata[j] ? 1 : 0);
1347     }
1348     ierr = PetscMalloc1(esf_nleaves,&new_ilocal);CHKERRQ(ierr);
1349     ierr = PetscMalloc1(esf_nleaves,&new_iremote);CHKERRQ(ierr);
1350     for (i=n=0; i<nleaves; i++) {
1351       j = ilocal ? ilocal[i] : i;
1352       if (leafdata[j]) {
1353         new_ilocal[n]        = j;
1354         new_iremote[n].rank  = iremote[i].rank;
1355         new_iremote[n].index = iremote[i].index;
1356         ++n;
1357       }
1358     }
1359     ierr = PetscSFCreate(comm,esf);CHKERRQ(ierr);
1360     ierr = PetscSFSetFromOptions(*esf);CHKERRQ(ierr);
1361     ierr = PetscSFSetGraph(*esf,nroots,esf_nleaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1362     ierr = PetscFree2(rootdata,leafmem);CHKERRQ(ierr);
1363   }
1364   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1365   PetscFunctionReturn(0);
1366 }
1367 
1368 /*@C
1369   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1370 
1371   Collective
1372 
1373   Input Arguments:
1374 + sf - original star forest
1375 . nselected  - number of selected leaves on this process
1376 - selected   - indices of the selected leaves on this process
1377 
1378   Output Arguments:
1379 .  newsf - new star forest
1380 
1381   Level: advanced
1382 
1383 .seealso: PetscSFCreateEmbeddedRootSF(), PetscSFSetGraph(), PetscSFGetGraph()
1384 @*/
1385 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1386 {
1387   const PetscSFNode *iremote;
1388   PetscSFNode       *new_iremote;
1389   const PetscInt    *ilocal;
1390   PetscInt          i,nroots,*leaves,*new_ilocal;
1391   MPI_Comm          comm;
1392   PetscErrorCode    ierr;
1393 
1394   PetscFunctionBegin;
1395   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1396   PetscSFCheckGraphSet(sf,1);
1397   if (nselected) PetscValidPointer(selected,3);
1398   PetscValidPointer(newsf,4);
1399 
1400   /* Uniq selected[] and put results in leaves[] */
1401   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1402   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1403   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1404   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1405   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1406 
1407   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1408   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1409     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1410   } else {
1411     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1412     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1413     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1414     for (i=0; i<nselected; ++i) {
1415       const PetscInt l     = leaves[i];
1416       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1417       new_iremote[i].rank  = iremote[l].rank;
1418       new_iremote[i].index = iremote[l].index;
1419     }
1420     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1421     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1422   }
1423   ierr = PetscFree(leaves);CHKERRQ(ierr);
1424   PetscFunctionReturn(0);
1425 }
1426 
1427 /*@C
1428    PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastEnd()
1429 
1430    Collective on PetscSF
1431 
1432    Input Arguments:
1433 +  sf - star forest on which to communicate
1434 .  unit - data type associated with each node
1435 .  rootdata - buffer to broadcast
1436 -  op - operation to use for reduction
1437 
1438    Output Arguments:
1439 .  leafdata - buffer to be reduced with values from each leaf's respective root
1440 
1441    Level: intermediate
1442 
1443    Notes:
1444     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1445     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1446     use PetscSFBcastWithMemTypeBegin() instead.
1447 .seealso: PetscSFBcastEnd(), PetscSFBcastWithMemTypeBegin()
1448 @*/
1449 PetscErrorCode PetscSFBcastBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1450 {
1451   PetscErrorCode ierr;
1452   PetscMemType   rootmtype,leafmtype;
1453 
1454   PetscFunctionBegin;
1455   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1456   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1457   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
1458   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1459   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1460   ierr = (*sf->ops->BcastBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1461   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
1462   PetscFunctionReturn(0);
1463 }
1464 
/*@C
   PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call to PetscSFBcastEnd()

   Collective on PetscSF

   Input Arguments:
+  sf - star forest on which to communicate
.  unit - data type associated with each node
.  rootmtype - memory type of rootdata
.  rootdata - buffer to broadcast
.  leafmtype - memory type of leafdata
-  op - operation to use for reduction

   Output Arguments:
.  leafdata - buffer to be reduced with values from each leaf's respective root

   Level: intermediate

   Notes:
    Unlike PetscSFBcastBegin(), this routine does not autodetect the memory types of the buffers; use it when the
    caller already knows them and wants to avoid the detection cost.

.seealso: PetscSFBcastEnd(), PetscSFBcastBegin()
@*/
PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
  /* Memory types are taken from the caller as-is; no PetscGetMemType() detection here */
  ierr = (*sf->ops->BcastBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1497 
/*@C
   PetscSFBcastEnd - end a broadcast & reduce operation started with PetscSFBcastBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  rootdata - buffer to broadcast
-  op - operation to use for reduction

   Output Arguments:
.  leafdata - buffer to be reduced with values from each leaf's respective root

   Level: intermediate

.seealso: PetscSFSetGraph(), PetscSFReduceEnd()
@*/
PetscErrorCode PetscSFBcastEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* No PetscSFSetUp() here: the matching PetscSFBcastBegin() already set the SF up */
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastEnd,sf,0,0,0);CHKERRQ(ierr);}
  ierr = (*sf->ops->BcastEnd)(sf,unit,rootdata,leafdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastEnd,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1527 
1528 /*@C
1529    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1530 
1531    Collective
1532 
1533    Input Arguments:
1534 +  sf - star forest
1535 .  unit - data type
1536 .  leafdata - values to reduce
1537 -  op - reduction operation
1538 
1539    Output Arguments:
1540 .  rootdata - result of reduction of values from all leaves of each root
1541 
1542    Level: intermediate
1543 
1544    Notes:
1545     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1546     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1547     use PetscSFReduceWithMemTypeBegin() instead.
1548 
1549 .seealso: PetscSFBcastBegin(), PetscSFReduceWithMemTypeBegin()
1550 @*/
1551 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1552 {
1553   PetscErrorCode ierr;
1554   PetscMemType   rootmtype,leafmtype;
1555 
1556   PetscFunctionBegin;
1557   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1558   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1559   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1560   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1561   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1562   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1563   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1564   PetscFunctionReturn(0);
1565 }
1566 
1567 /*@C
1568    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to PetscSFReduceEnd()
1569 
1570    Collective
1571 
1572    Input Arguments:
1573 +  sf - star forest
1574 .  unit - data type
1575 .  leafmtype - memory type of leafdata
1576 .  leafdata - values to reduce
1577 .  rootmtype - memory type of rootdata
1578 -  op - reduction operation
1579 
1580    Output Arguments:
1581 .  rootdata - result of reduction of values from all leaves of each root
1582 
1583    Level: intermediate
1584 
1585 .seealso: PetscSFBcastBegin(), PetscSFReduceBegin()
1586 @*/
1587 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
1588 {
1589   PetscErrorCode ierr;
1590 
1591   PetscFunctionBegin;
1592   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1593   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1594   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1595   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1596   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1597   PetscFunctionReturn(0);
1598 }
1599 
/*@C
   PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - values to reduce
-  op - reduction operation

   Output Arguments:
.  rootdata - result of reduction of values from all leaves of each root

   Level: intermediate

.seealso: PetscSFSetGraph(), PetscSFBcastEnd()
@*/
PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* No PetscSFSetUp() here: the matching PetscSFReduceBegin() already set the SF up */
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
  ierr = (*sf->ops->ReduceEnd)(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1629 
/*@C
   PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Arguments:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

   Note:
   The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
   might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
   not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
   integers.

.seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscMemType   rootmtype,leafmtype,leafupdatemtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
  /* The backend dispatch takes a single leaf memory type, so leafdata and leafupdate must live in the same memory space */
  if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
  ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1672 
/*@C
   PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Arguments:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

.seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* No PetscSFSetUp() here: the matching PetscSFFetchAndOpBegin() already set the SF up */
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootdata,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1703 
/*@C
   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: advanced

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

   *degree is set to NULL here; the actual array is made available by PetscSFComputeDegreeEnd().

.seealso: PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal;
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    maxlocal = sf->maxleaf-sf->minleaf+1;
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    /* Reducing the value 1 from every leaf into its root with MPI_SUM yields each root's degree.
       The -minleaf shift makes degreetmp addressable by absolute leaf index, as PetscSFReduceBegin() expects */
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  *degree = NULL;
  PetscFunctionReturn(0);
}
1743 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: developer

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFComputeDegreeBegin()
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    /* Finish the MPI_SUM reduction started in PetscSFComputeDegreeBegin(); the -minleaf shift matches the one used there */
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    sf->degreeknown = PETSC_TRUE;
  }
  /* Cached result; owned by the SF and freed in PetscSFDestroy() */
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1779 
1780 
1781 /*@C
1782    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1783    Each multi-root is assigned index of the corresponding original root.
1784 
1785    Collective
1786 
1787    Input Arguments:
1788 +  sf - star forest
1789 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1790 
1791    Output Arguments:
1792 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1793 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1794 
1795    Level: developer
1796 
1797    Notes:
1798    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1799 
1800 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1801 @*/
1802 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1803 {
1804   PetscSF             msf;
1805   PetscInt            i, j, k, nroots, nmroots;
1806   PetscErrorCode      ierr;
1807 
1808   PetscFunctionBegin;
1809   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1810   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1811   if (nroots) PetscValidIntPointer(degree,2);
1812   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1813   PetscValidPointer(multiRootsOrigNumbering,4);
1814   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1815   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1816   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1817   for (i=0,j=0,k=0; i<nroots; i++) {
1818     if (!degree[i]) continue;
1819     for (j=0; j<degree[i]; j++,k++) {
1820       (*multiRootsOrigNumbering)[k] = i;
1821     }
1822   }
1823   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1824   if (nMultiRoots) *nMultiRoots = nmroots;
1825   PetscFunctionReturn(0);
1826 }
1827 
/*@C
   PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  leafdata - leaf data to gather to roots

   Output Argument:
.  multirootdata - root buffer to gather into, amount of space per root is equal to its degree

   Level: intermediate

.seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
@*/
PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Gather = Reduce with MPI_REPLACE over the multi-SF, which has one distinct root slot per leaf (no collisions) */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPI_REPLACE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1857 
/*@C
   PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  leafdata - leaf data to gather to roots

   Output Argument:
.  multirootdata - root buffer to gather into, amount of space per root is equal to its degree

   Level: intermediate

.seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
@*/
PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Complete the MPI_REPLACE reduction over the multi-SF started in PetscSFGatherBegin() */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPI_REPLACE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1886 
/*@C
   PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeBegin(), PetscSFGatherBegin(), PetscSFScatterEnd()
@*/
PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Scatter = Bcast with MPI_REPLACE over the multi-SF, which has one distinct root slot per leaf */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1916 
/*@C
   PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeEnd(), PetscSFGatherEnd(), PetscSFScatterBegin()
@*/
PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Complete the MPI_REPLACE broadcast over the multi-SF started in PetscSFScatterBegin() */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1945 
/* Debug-build-only check that no local leaf index appears more than once in sf's graph (no-op in optimized builds) */
static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
{
  PetscInt        i, n, nleaves;
  const PetscInt *ilocal = NULL;
  PetscHSetI      seen;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
    ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
    for (i = 0; i < nleaves; i++) {
      const PetscInt leaf = ilocal ? ilocal[i] : i; /* ilocal == NULL means contiguous leaves 0..nleaves-1 */
      ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
    }
    /* Duplicates collapse in the hash set, so a size smaller than nleaves means a repeated leaf */
    ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
    if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
    ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1967 
/*@
  PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots.

  sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
  nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
  Bcast on sfA, then a Bcast on sfB, on connected nodes.

.seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  if (localPointsA) {
    /* Reorder sfA's remote roots so that entry j corresponds to sfA leaf j (= sfB root j);
       slots with no incoming edge are marked unconnected with the (-1,-1) sentinel */
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue; /* sfA leaves beyond sfB's root space cannot be part of the composition */
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA;
  }
  /* Broadcast sfA's root info along sfB so every sfB leaf learns which sfA root it is transitively connected to.
     The -minleaf shift makes leafdataB addressable by absolute leaf index */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* denseB: sfB's leaves are contiguous from 0 (no ilocal) and all of them received a root, so
     leafdataB can be used directly as the composed remote array without compaction */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    localPointsBA  = NULL;
    remotePointsBA = leafdataB;
  } else {
    /* Compact only the connected leaves into freshly allocated arrays */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  /* Ownership of localPointsBA/remotePointsBA passes to the new SF via PETSC_OWN_POINTER */
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2057 
/*@
  PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF.

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
  second SF must have a degree of 1, i.e., no roots have more than one leaf connected.

  sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
  roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
  a Reduce on sfB, on connected roots.

.seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
@*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
  MPI_Op            op;
#if defined(PETSC_USE_64BIT_INDICES)
  PetscBool         iswin;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);

  /* TODO: Check roots of sfB have degree of 1 */
  /* Once we implement it, we can replace the MPI_MAXLOC
     with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
     We use MPI_MAXLOC only to have a deterministic output from this routine if
     the root condition is not meet.
   */
  op = MPI_MAXLOC;
#if defined(PETSC_USE_64BIT_INDICES)
  /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
  ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
  if (iswin) op = MPI_REPLACE;
#endif

  /* Lay out sfA's remote roots by sfB leaf number (shifted by -minleaf); unmatched slots keep the (-1,-1) sentinel */
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue; /* outside sfB's leaf range: no path */
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue; /* contiguous sfA leaves; same range filter as above */
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Reduce sfA's root info up sfB, so each sfB root learns the sfA root its (single) leaf connects to */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place: keep only sfB roots that received a connection (rank != -1) */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  /* Ownership of localPointsBA/remotePointsBA passes to the new SF via PETSC_OWN_POINTER */
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2161 
/*
  PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF

  Input Parameters:
. sf - The global PetscSF

  Output Parameters:
. out - The local PetscSF
 */
PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
{
  MPI_Comm           comm;
  PetscMPIInt        myrank;
  const PetscInt     *ilocal;
  const PetscSFNode  *iremote;
  PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
  PetscSFNode        *liremote;
  PetscSF            lsf;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (sf->ops->CreateLocalSF) {
    /* Let the implementation build the local SF if it knows a faster way */
    ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
  } else {
    /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
    ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(comm,&myrank);CHKERRMPI(ierr);

    /* Find out local edges and build a local SF */
    ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
    /* First pass: count edges whose root lives on this rank, so the arrays can be sized exactly */
    for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
    ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
    ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);

    /* Second pass: copy the intra-process edges, remapping the remote rank to 0 for the self-comm */
    for (i=j=0; i<nleaves; i++) {
      if (iremote[i].rank == (PetscInt)myrank) {
        lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
        liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
        liremote[j].index = iremote[i].index;
        j++;
      }
    }
    ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
    ierr = PetscSFSetFromOptions(lsf);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
    *out = lsf;
  }
  PetscFunctionReturn(0);
}
2213 
/* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
{
  PetscErrorCode     ierr;
  PetscMemType       rootmtype,leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* NOTE(review): no separate End routine exists for this operation, so the whole transfer is
     bracketed by the BcastBegin event; presumably it completes before returning -- confirm */
  ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  if (sf->ops->BcastToZero) {
    ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
  ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2232 
2233