xref: /petsc/src/vec/is/sf/interface/sf.c (revision ed587871e994007e9aeee2261b868fe3c596c9b3)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6   #include <cuda_runtime.h>
7 #endif
8 
9 #if defined(PETSC_HAVE_HIP)
10   #include <hip/hip_runtime.h>
11 #endif
12 
#if defined(PETSC_USE_DEBUG)
/* In debug builds, raise an error if sf's graph has not been set; reports the argument
   number, the variable name (via #sf) and the calling function in the message. */
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
/* No-op in optimized builds */
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif

/* String names for PetscSFDuplicateOption; layout follows the PETSc enum-option convention
   (values, enum type name, prefix, NULL terminator) */
const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",NULL};
23 
/* Classify the memory a raw pointer refers to.

   data - pointer to classify (NULL is treated as host memory)
   type - output; set to PETSC_MEMTYPE_DEVICE only when an initialized device runtime
          positively identifies the pointer as device memory, otherwise PETSC_MEMTYPE_HOST
*/
PETSC_STATIC_INLINE PetscErrorCode PetscGetMemType(const void *data,PetscMemType *type)
{
  PetscFunctionBegin;
  PetscValidPointer(type,2);
  *type = PETSC_MEMTYPE_HOST; /* Default; only overridden on a positive device match below */
#if defined(PETSC_HAVE_CUDA)
  if (PetscCUDAInitialized && data) {
    cudaError_t                  cerr;
    struct cudaPointerAttributes attr;
    enum cudaMemoryType          mtype;
    cerr = cudaPointerGetAttributes(&attr,data); /* Do not check error since before CUDA 11.0, passing a host pointer returns cudaErrorInvalidValue */
    cudaGetLastError(); /* Reset the last error */
    #if (CUDART_VERSION < 10000)
      mtype = attr.memoryType; /* Field was renamed to 'type' in CUDA 10 */
    #else
      mtype = attr.type;
    #endif
    if (cerr == cudaSuccess && mtype == cudaMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif

#if defined(PETSC_HAVE_HIP)
  if (PetscHIPInitialized && data) {
    hipError_t                   cerr;
    struct hipPointerAttribute_t attr;
    enum hipMemoryType           mtype;
    cerr = hipPointerGetAttributes(&attr,data); /* As with CUDA, a host pointer may make this fail harmlessly */
    hipGetLastError(); /* Reset the last error */
    mtype = attr.memoryType;
    if (cerr == hipSuccess && mtype == hipMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif
  PetscFunctionReturn(0);
}
58 
/*@
   PetscSFCreate - create a star forest communication context

   Collective

   Input Arguments:
.  comm - communicator on which the star forest will operate

   Output Arguments:
.  sf - new star forest context

   Options Database Keys:
+  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
.  -sf_type window    -Use MPI-3 one-sided window for communication
-  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication

   Level: intermediate

   Notes:
   When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
   MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
   SFs are optimized and they have better performance than general SFs.

.seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
@*/
PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
{
  PetscErrorCode ierr;
  PetscSF        b;

  PetscFunctionBegin;
  PetscValidPointer(sf,2);
  ierr = PetscSFInitializePackage();CHKERRQ(ierr);

  ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);

  /* Negative counts mark the graph as not yet set; leaf range is empty (min > max) until then */
  b->nroots    = -1;
  b->nleaves   = -1;
  b->minleaf   = PETSC_MAX_INT;
  b->maxleaf   = PETSC_MIN_INT;
  b->nranks    = -1;
  b->rankorder = PETSC_TRUE;
  b->ingroup   = MPI_GROUP_NULL;
  b->outgroup  = MPI_GROUP_NULL;
  b->graphset  = PETSC_FALSE;
#if defined(PETSC_HAVE_DEVICE)
  b->use_gpu_aware_mpi    = use_gpu_aware_mpi; /* inherit the global (petsc-wide) setting */
  b->use_stream_aware_mpi = PETSC_FALSE;
  b->unknown_input_stream= PETSC_FALSE;
  #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
    b->backend = PETSCSF_BACKEND_KOKKOS;
  #elif defined(PETSC_HAVE_CUDA)
    b->backend = PETSCSF_BACKEND_CUDA;
  #elif defined(PETSC_HAVE_HIP)
    b->backend = PETSCSF_BACKEND_HIP;
  #endif

  #if defined(PETSC_HAVE_NVSHMEM)
    b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
    b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
    ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&b->use_nvshmem,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem_get",&b->use_nvshmem_get,NULL);CHKERRQ(ierr);
  #endif
#endif
  /* Embedded VecScatter state; -1 sizes mean not configured yet */
  b->vscat.from_n = -1;
  b->vscat.to_n   = -1;
  b->vscat.unit   = MPIU_SCALAR;
 *sf = b;
  PetscFunctionReturn(0);
}
129 
/*@
   PetscSFReset - Reset a star forest so that different sizes or neighbors can be used

   Collective

   Input Arguments:
.  sf - star forest

   Level: advanced

.seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
@*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the type-specific implementation release its own data first */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Return graph-describing fields to their PetscSFCreate() defaults */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);   /* owned copies of ilocal/iremote, if any */
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRMPI(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRMPI(ierr);}
  /* Break the back-pointer cycle before destroying the multi-SF */
  if (sf->multi) sf->multi->multi = NULL;
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);

 #if defined(PETSC_HAVE_DEVICE)
  for (PetscInt i=0; i<2; i++) {ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,sf->rmine_d[i]);CHKERRQ(ierr);}
 #endif

  sf->setupcalled = PETSC_FALSE; /* a subsequent use will trigger PetscSFSetUp() again */
  PetscFunctionReturn(0);
}
175 
/*@C
   PetscSFSetType - Set the PetscSF communication implementation

   Collective on PetscSF

   Input Parameters:
+  sf - the PetscSF context
-  type - a known method

   Options Database Key:
.  -sf_type <type> - Sets the method; use -help for a list
   of available methods (for instance, window, basic, neighbor)

   Notes:
   See "include/petscsf.h" for available methods (for instance)
+    PETSCSFWINDOW - MPI-2/3 one-sided
-    PETSCSFBASIC - basic implementation using MPI-1 two-sided

  Level: intermediate

.seealso: PetscSFType, PetscSFCreate()
@*/
PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
{
  PetscErrorCode ierr,(*r)(PetscSF);
  PetscBool      match;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidCharPointer(type,2);

  /* Setting the same type again is a no-op */
  ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
  if (match) PetscFunctionReturn(0);

  ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
  if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
  /* Destroy the previous PetscSF implementation context */
  if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
  /* Clear the old ops table so the constructor starts from a clean slate */
  ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
  ierr = (*r)(sf);CHKERRQ(ierr); /* run the type's constructor */
  PetscFunctionReturn(0);
}
219 
/*@C
  PetscSFGetType - Get the PetscSF communication implementation

  Not Collective

  Input Parameter:
. sf  - the PetscSF context

  Output Parameter:
. type - the PetscSF type name (NULL if no type has been set yet)

  Level: intermediate

.seealso: PetscSFSetType(), PetscSFCreate()
@*/
PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
  PetscValidPointer(type,2);
  *type = ((PetscObject)sf)->type_name; /* borrowed reference; caller must not free */
  PetscFunctionReturn(0);
}
243 
/*@C
   PetscSFDestroy - destroy star forest

   Collective

   Input Arguments:
.  sf - address of star forest; set to NULL on return

   Level: intermediate

.seealso: PetscSFCreate(), PetscSFReset()
@*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0);
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Only tear down when the last reference goes away */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  ierr = PetscSFDestroy(&(*sf)->vscat.lsf);CHKERRQ(ierr); /* local SF built on demand; may be NULL */
  /* bs > 1 means vscat.unit was created with MPI_Type_dup/contiguous and we own it */
  if ((*sf)->vscat.bs > 1) {ierr = MPI_Type_free(&(*sf)->vscat.unit);CHKERRMPI(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
271 
/* Validate every edge of the graph: remote ranks in [0,size), remote indices >= 0,
   leaf locations >= 0. Only active in debug builds and only after the graph is set. */
static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
{
  PetscInt           i, nleaves;
  PetscMPIInt        size;
  const PetscInt    *ilocal;
  const PetscSFNode *iremote;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
  ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
  for (i = 0; i < nleaves; i++) {
    const PetscInt rank = iremote[i].rank;
    const PetscInt remote = iremote[i].index;
    const PetscInt leaf = ilocal ? ilocal[i] : i; /* NULL ilocal means contiguous leaves */
    if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
    if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
    if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
  }
  PetscFunctionReturn(0);
}
294 
295 /*@
296    PetscSFSetUp - set up communication structures
297 
298    Collective
299 
300    Input Arguments:
301 .  sf - star forest communication object
302 
303    Level: beginner
304 
305 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
306 @*/
307 PetscErrorCode PetscSFSetUp(PetscSF sf)
308 {
309   PetscErrorCode ierr;
310 
311   PetscFunctionBegin;
312   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
313   PetscSFCheckGraphSet(sf,1);
314   if (sf->setupcalled) PetscFunctionReturn(0);
315   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
316   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
317   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);} /* Zero all sf->ops */
318   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
319 #if defined(PETSC_HAVE_CUDA)
320   if (sf->backend == PETSCSF_BACKEND_CUDA) {
321     sf->ops->Malloc = PetscSFMalloc_CUDA;
322     sf->ops->Free   = PetscSFFree_CUDA;
323   }
324 #endif
325 #if defined(PETSC_HAVE_HIP)
326   if (sf->backend == PETSCSF_BACKEND_HIP) {
327     sf->ops->Malloc = PetscSFMalloc_HIP;
328     sf->ops->Free   = PetscSFFree_HIP;
329   }
330 #endif
331 
332 #
333 #if defined(PETSC_HAVE_KOKKOS)
334   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
335     sf->ops->Malloc = PetscSFMalloc_Kokkos;
336     sf->ops->Free   = PetscSFFree_Kokkos;
337   }
338 #endif
339   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
340   sf->setupcalled = PETSC_TRUE;
341   PetscFunctionReturn(0);
342 }
343 
/*@
   PetscSFSetFromOptions - set PetscSF options using the options database

   Logically Collective

   Input Arguments:
.  sf - star forest

   Options Database Keys:
+  -sf_type               - implementation type, see PetscSFSetType()
.  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
.  -sf_use_default_stream - Assume callers of SF computed the input root/leafdata with the default cuda stream. SF will also
                            use the default stream to process data. Therefore, no stream synchronization is needed between SF and its caller (default: true).
                            If true, this option only works with -use_gpu_aware_mpi 1.
.  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
                               If true, this option only works with -use_gpu_aware_mpi 1.

-  -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently SF has these backends: cuda, hip and Kokkos.
                              On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
                              the only available is kokkos.

   Level: intermediate
@*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Use the current type (if any) as the default shown in -help */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
 #if defined(PETSC_HAVE_DEVICE)
  {
    char        backendstr[32] = {0};
    PetscBool   isCuda = PETSC_FALSE,isHip = PETSC_FALSE,isKokkos = PETSC_FALSE,set;
    /* Change the defaults set in PetscSFCreate() with command line options */
    ierr = PetscOptionsBool("-sf_unknown_input_stream","SF root/leafdata is computed on arbitary streams unknown to SF","PetscSFSetFromOptions",sf->unknown_input_stream,&sf->unknown_input_stream,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-sf_use_stream_aware_mpi","Assume the underlying MPI is cuda-stream aware","PetscSFSetFromOptions",sf->use_stream_aware_mpi,&sf->use_stream_aware_mpi,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsString("-sf_backend","Select the device backend SF uses","PetscSFSetFromOptions",NULL,backendstr,sizeof(backendstr),&set);CHKERRQ(ierr);
    /* The backend name is matched case-insensitively */
    ierr = PetscStrcasecmp("cuda",backendstr,&isCuda);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("kokkos",backendstr,&isKokkos);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("hip",backendstr,&isHip);CHKERRQ(ierr);
  #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
    if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
    else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
    else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
    else if (set) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
  #elif defined(PETSC_HAVE_KOKKOS)
    /* Without CUDA/HIP, kokkos is the only device backend */
    if (set && !isKokkos) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You can only choose kokkos", backendstr);
   #endif
  }
 #endif
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
406 
/*@
   PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order

   Logically Collective

   Input Arguments:
+  sf - star forest
-  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)

   Level: advanced

.seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
@*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveBool(sf,flg,2);
  /* The multi-SF is built lazily using this flag; once it exists the ordering is baked in */
  if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(0);
}
429 
/*@
   PetscSFSetGraph - Set a parallel star forest

   Collective

   Input Arguments:
+  sf - star forest
.  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
.  nleaves - number of leaf vertices on the current process, each of these references a root on any process
.  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
during setup in debug mode)
.  localmode - copy mode for ilocal
.  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
during setup in debug mode)
-  remotemode - copy mode for iremote

   Level: intermediate

   Notes:
    In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode

   Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
   encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
   needed

   Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
   indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).

.seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
@*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    /* Compute the leaf range and detect an identity permutation, which encodes contiguous storage */
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* Discard the identity map (NULL ilocal means contiguous); free it if we were handed ownership */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  if (ilocal) {
    /* Honor the requested ownership semantics for the leaf index array:
       mine_alloc is the storage we own (and will free in PetscSFReset()), mine is what we read */
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      sf->mine_alloc = NULL; /* caller retains ownership */
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same ownership handling for the remote (root) index array */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL; /* caller retains ownership */
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
543 
/*@
  PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern

  Collective

  Input Parameters:
+ sf      - The PetscSF
. map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
- pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL

  Notes:
  It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
  n and N are local and global sizes of x respectively.

  With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
  sequential vectors y on all ranks.

  With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
  sequential vector y on rank 0.

  In above cases, entries of x are roots and entries of y are leaves.

  With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
  creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
  of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one can not send multiple items. One just needs to create a new MPI datatype for the mulptiple data
  items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.

  In this case, roots and leaves are symmetric.

  Level: intermediate
 @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* Alltoall ignores the user's map; build a layout with 'size' roots per rank */
    type = PETSCSFALLTOALL;
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRMPI(ierr);
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      /* Varying local sizes need the 'v' variants */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves; other ranks have an empty leaf range (maxleaf = -1) */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
641 
/*@
   PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map

   Collective

   Input Arguments:

.  sf - star forest to invert

   Output Arguments:
.  isf - inverse of sf
   Level: advanced

   Notes:
   All roots must have degree 1.

   The local space may be a permutation, but cannot be sparse.

.seealso: PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Each leaf slot records its own (rank,index); reducing this to the roots tells
     every root which leaf points at it */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* Roots with no incoming leaf keep (-1,-1) so they can be recognized below */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  /* Degree is 1 by assumption, so MPI_REPLACE delivers exactly one (rank,index) per root */
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPI_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* dense: contiguous leaves in the inverse */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* newilocal ownership transfers to the new SF (PETSC_OWN_POINTER) */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
711 
/*@
   PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph

   Collective

   Input Arguments:
+  sf - communication object to duplicate
-  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)

   Output Arguments:
.  newsf - new communication object

   Level: beginner

.seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
{
  PetscSFType    type;
  PetscErrorCode ierr;
  MPI_Datatype   dtype=MPIU_SCALAR;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,opt,2);
  PetscValidPointer(newsf,3);
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
  ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
  if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf,1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graph: copy the explicit edge lists */
      PetscInt          nroots,nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
      ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
    } else {
      /* Patterned graph: re-derive it from the layout and pattern */
      ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
    }
  }
  /* Since oldtype is committed, so is newtype, according to MPI */
  if (sf->vscat.bs > 1) {ierr = MPI_Type_dup(sf->vscat.unit,&dtype);CHKERRMPI(ierr);}
  (*newsf)->vscat.bs     = sf->vscat.bs;
  (*newsf)->vscat.unit   = dtype;
  (*newsf)->vscat.to_n   = sf->vscat.to_n;
  (*newsf)->vscat.from_n = sf->vscat.from_n;
  /* Do not copy lsf. Build it on demand since it is rarely used */

#if defined(PETSC_HAVE_DEVICE)
  (*newsf)->backend              = sf->backend;
  (*newsf)->unknown_input_stream= sf->unknown_input_stream;
  (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
  (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
#endif
  if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
  /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
  PetscFunctionReturn(0);
}
771 
772 /*@C
773    PetscSFGetGraph - Get the graph specifying a parallel star forest
774 
775    Not Collective
776 
777    Input Arguments:
778 .  sf - star forest
779 
780    Output Arguments:
781 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
782 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
783 .  ilocal - locations of leaves in leafdata buffers
784 -  iremote - remote locations of root vertices for each leaf on the current process
785 
786    Notes:
787    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
788 
789    When called from Fortran, the returned iremote array is a copy and must be deallocated after use. Consequently, if you
790    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
791 
792    Level: intermediate
793 
794 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
795 @*/
796 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
797 {
798   PetscErrorCode ierr;
799 
800   PetscFunctionBegin;
801   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
802   if (sf->ops->GetGraph) {
803     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
804   } else {
805     if (nroots) *nroots = sf->nroots;
806     if (nleaves) *nleaves = sf->nleaves;
807     if (ilocal) *ilocal = sf->mine;
808     if (iremote) *iremote = sf->remote;
809   }
810   PetscFunctionReturn(0);
811 }
812 
/*@
   PetscSFGetLeafRange - Get the active leaf ranges

   Not Collective

   Input Arguments:
.  sf - star forest; PetscSFSetGraph() must have been called

   Output Arguments:
+  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
-  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.

   Level: developer

.seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
@*/
PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  /* Both outputs are optional; pass NULL for any value not needed */
  if (minleaf) *minleaf = sf->minleaf;
  if (maxleaf) *maxleaf = sf->maxleaf;
  PetscFunctionReturn(0);
}
838 
/*@C
   PetscSFViewFromOptions - View from Options

   Collective on PetscSF

   Input Parameters:
+  A - the star forest
.  obj - Optional object
-  name - command line option

   Level: intermediate

.seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
@*/
PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
  /* Delegate to the generic object-level viewer, which consults the options database for name */
  ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
861 
/*@C
   PetscSFView - view a star forest

   Collective

   Input Arguments:
+  sf - star forest
-  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD

   Level: beginner

.seealso: PetscSFCreate(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Set up the SF (if the graph is set) so per-rank arrays used below are available */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      /* Per-rank summary, then one line per leaf edge: leaf <- (rank,root) */
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Sort a copy of the ranks (via a permutation) so the listing comes out in rank order */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          /* roffset[] delimits each rank's slice of rmine/rremote */
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  /* Let the implementation append type-specific information */
  if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
932 
933 /*@C
934    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
935 
936    Not Collective
937 
938    Input Arguments:
939 .  sf - star forest
940 
941    Output Arguments:
942 +  nranks - number of ranks referenced by local part
943 .  ranks - array of ranks
944 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
945 .  rmine - concatenated array holding local indices referencing each remote rank
946 -  rremote - concatenated array holding remote indices referenced for each remote rank
947 
948    Level: developer
949 
950 .seealso: PetscSFGetLeafRanks()
951 @*/
952 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
953 {
954   PetscErrorCode ierr;
955 
956   PetscFunctionBegin;
957   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
958   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
959   if (sf->ops->GetRootRanks) {
960     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
961   } else {
962     /* The generic implementation */
963     if (nranks)  *nranks  = sf->nranks;
964     if (ranks)   *ranks   = sf->ranks;
965     if (roffset) *roffset = sf->roffset;
966     if (rmine)   *rmine   = sf->rmine;
967     if (rremote) *rremote = sf->rremote;
968   }
969   PetscFunctionReturn(0);
970 }
971 
972 /*@C
973    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
974 
975    Not Collective
976 
977    Input Arguments:
978 .  sf - star forest
979 
980    Output Arguments:
981 +  niranks - number of leaf ranks referencing roots on this process
982 .  iranks - array of ranks
983 .  ioffset - offset in irootloc for each rank (length niranks+1)
984 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
985 
986    Level: developer
987 
988 .seealso: PetscSFGetRootRanks()
989 @*/
990 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
991 {
992   PetscErrorCode ierr;
993 
994   PetscFunctionBegin;
995   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
996   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
997   if (sf->ops->GetLeafRanks) {
998     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
999   } else {
1000     PetscSFType type;
1001     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
1002     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
1003   }
1004   PetscFunctionReturn(0);
1005 }
1006 
1007 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
1008   PetscInt i;
1009   for (i=0; i<n; i++) {
1010     if (needle == list[i]) return PETSC_TRUE;
1011   }
1012   return PETSC_FALSE;
1013 }
1014 
/*@C
   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.

   Collective

   Input Arguments:
+  sf - PetscSF to set up; PetscSFSetGraph() must have been called
-  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)

   Level: developer

.seealso: PetscSFGetRootRanks()
@*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
  /* Count how many of this process's leaves reference each remote rank */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank (table keys are stored as rank+1; converted back below) */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  /* rmine/rremote are sized by total leaves; roffset will partition them by rank */
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate the members of dgroup into ranks of sf's communicator */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRMPI(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRMPI(ierr);}
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i;) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for (; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition independently so the binary searches below (PetscFindMPIInt) work */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Build CSR-style offsets from the per-rank counts; rcount[] is then reused as a fill cursor */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit: reuse irank while consecutive leaves reference the same remote rank */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks, trying the distinguished block first */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1118 
/*@C
   PetscSFGetGroups - gets incoming and outgoing process groups

   Collective

   Input Argument:
.  sf - star forest

   Output Arguments:
+  incoming - group of origin processes for incoming edges (leaves that reference my roots)
-  outgoing - group of destination processes for outgoing edges (roots that I reference)

   Notes:
   The groups are created on first use and cached on the SF; callers should not free them.

   Level: developer

.seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
@*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks: build an auxiliary SF with one root per
       process and one leaf per connected rank; the root degree is then the count */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
    /* Enumerate the incoming ranks: each connected process gathers its own rank to us */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) {
    /* The outgoing group is simply the already-computed list of referenced ranks */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1181 
/*@
   PetscSFGetMultiSF - gets the inner SF implementing gathers and scatters

   Collective

   Input Argument:
.  sf - star forest that may contain roots with 0 or with more than 1 vertex

   Output Arguments:
.  multi - star forest with split roots, such that each root has degree exactly 1

   Level: developer

   Notes:

   In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
   directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
   edge, it is a candidate for future optimization that might involve its removal.

.seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
@*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    sf->multi->multi = sf->multi; /* the multi-SF is its own multi-SF */
    PetscFunctionReturn(0);
  }
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    /* inoffset[i] is the start of root i's slots in the split-root space (prefix sum of degrees) */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    /* Fetch-and-add of 1 hands each leaf a distinct slot index under its root */
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
    if (PetscDefined(USE_DEBUG)) { /* Check that the expected number of increments occurred */
      for (i=0; i<sf->nroots; i++) {
        if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
      }
    }
    /* Each leaf now targets its private slot in the split-root space */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    sf->multi->multi = sf->multi;
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      /* Reduce each leaf's owning rank into its slot so roots see who feeds each slot */
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPI_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPI_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Broadcast each leaf's new (rank-sorted) slot and rebuild the graph accordingly */
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset,MPI_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset,MPI_REPLACE);CHKERRQ(ierr);
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1275 
1276 /*@C
1277    PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots, does not remap indices
1278 
1279    Collective
1280 
1281    Input Arguments:
1282 +  sf - original star forest
1283 .  nselected  - number of selected roots on this process
1284 -  selected   - indices of the selected roots on this process
1285 
1286    Output Arguments:
1287 .  esf - new star forest
1288 
1289    Level: advanced
1290 
1291    Note:
1292    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1293    be done by calling PetscSFGetGraph().
1294 
1295 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1296 @*/
1297 PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *esf)
1298 {
1299   PetscInt          i,j,n,nroots,nleaves,esf_nleaves,*new_ilocal,minleaf,maxleaf,maxlocal;
1300   const PetscInt    *ilocal;
1301   signed char       *rootdata,*leafdata,*leafmem;
1302   const PetscSFNode *iremote;
1303   PetscSFNode       *new_iremote;
1304   MPI_Comm          comm;
1305   PetscErrorCode    ierr;
1306 
1307   PetscFunctionBegin;
1308   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1309   PetscSFCheckGraphSet(sf,1);
1310   if (nselected) PetscValidPointer(selected,3);
1311   PetscValidPointer(esf,4);
1312 
1313   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1314   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1315   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1316   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1317 
1318   if (PetscDefined(USE_DEBUG)) {  /* Error out if selected[] has dups or  out of range indices */
1319     PetscBool dups;
1320     ierr = PetscCheckDupsInt(nselected,selected,&dups);CHKERRQ(ierr);
1321     if (dups) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"selected[] has dups");
1322     for (i=0; i<nselected; i++)
1323       if (selected[i] < 0 || selected[i] >= nroots) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"selected root indice %D is out of [0,%D)",selected[i],nroots);
1324   }
1325 
1326   if (sf->ops->CreateEmbeddedRootSF) {
1327     ierr = (*sf->ops->CreateEmbeddedRootSF)(sf,nselected,selected,esf);CHKERRQ(ierr);
1328   } else {
1329     /* A generic version of creating embedded sf */
1330     ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
1331     maxlocal = maxleaf - minleaf + 1;
1332     ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafmem);CHKERRQ(ierr);
1333     leafdata = leafmem - minleaf;
1334     /* Tag selected roots and bcast to leaves */
1335     for (i=0; i<nselected; i++) rootdata[selected[i]] = 1;
1336     ierr = PetscSFBcastBegin(sf,MPI_SIGNED_CHAR,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1337     ierr = PetscSFBcastEnd(sf,MPI_SIGNED_CHAR,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1338 
1339     /* Build esf with leaves that are still connected */
1340     esf_nleaves = 0;
1341     for (i=0; i<nleaves; i++) {
1342       j = ilocal ? ilocal[i] : i;
1343       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1344          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1345       */
1346       esf_nleaves += (leafdata[j] ? 1 : 0);
1347     }
1348     ierr = PetscMalloc1(esf_nleaves,&new_ilocal);CHKERRQ(ierr);
1349     ierr = PetscMalloc1(esf_nleaves,&new_iremote);CHKERRQ(ierr);
1350     for (i=n=0; i<nleaves; i++) {
1351       j = ilocal ? ilocal[i] : i;
1352       if (leafdata[j]) {
1353         new_ilocal[n]        = j;
1354         new_iremote[n].rank  = iremote[i].rank;
1355         new_iremote[n].index = iremote[i].index;
1356         ++n;
1357       }
1358     }
1359     ierr = PetscSFCreate(comm,esf);CHKERRQ(ierr);
1360     ierr = PetscSFSetFromOptions(*esf);CHKERRQ(ierr);
1361     ierr = PetscSFSetGraph(*esf,nroots,esf_nleaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1362     ierr = PetscFree2(rootdata,leafmem);CHKERRQ(ierr);
1363   }
1364   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1365   PetscFunctionReturn(0);
1366 }
1367 
1368 /*@C
1369   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1370 
1371   Collective
1372 
1373   Input Arguments:
1374 + sf - original star forest
1375 . nselected  - number of selected leaves on this process
1376 - selected   - indices of the selected leaves on this process
1377 
1378   Output Arguments:
1379 .  newsf - new star forest
1380 
1381   Level: advanced
1382 
1383 .seealso: PetscSFCreateEmbeddedRootSF(), PetscSFSetGraph(), PetscSFGetGraph()
1384 @*/
1385 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1386 {
1387   const PetscSFNode *iremote;
1388   PetscSFNode       *new_iremote;
1389   const PetscInt    *ilocal;
1390   PetscInt          i,nroots,*leaves,*new_ilocal;
1391   MPI_Comm          comm;
1392   PetscErrorCode    ierr;
1393 
1394   PetscFunctionBegin;
1395   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1396   PetscSFCheckGraphSet(sf,1);
1397   if (nselected) PetscValidPointer(selected,3);
1398   PetscValidPointer(newsf,4);
1399 
1400   /* Uniq selected[] and put results in leaves[] */
1401   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1402   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1403   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1404   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1405   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1406 
1407   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1408   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1409     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1410   } else {
1411     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1412     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1413     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1414     for (i=0; i<nselected; ++i) {
1415       const PetscInt l     = leaves[i];
1416       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1417       new_iremote[i].rank  = iremote[l].rank;
1418       new_iremote[i].index = iremote[l].index;
1419     }
1420     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1421     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1422   }
1423   ierr = PetscFree(leaves);CHKERRQ(ierr);
1424   PetscFunctionReturn(0);
1425 }
1426 
/*@C
   PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastEnd()

   Collective on PetscSF

   Input Arguments:
+  sf - star forest on which to communicate
.  unit - data type associated with each node
.  rootdata - buffer to broadcast
-  op - operation to use for reduction

   Output Arguments:
.  leafdata - buffer to be reduced with values from each leaf's respective root

   Level: intermediate

   Notes:
    When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
    are host pointers or device pointers, which may incur a noticeable cost. If you already knew the info, you should
    use PetscSFBcastWithMemTypeBegin() instead.

.seealso: PetscSFBcastEnd(), PetscSFBcastWithMemTypeBegin()
@*/
PetscErrorCode PetscSFBcastBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscMemType   rootmtype,leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* SF-level event logging is suppressed when sf->vscat.logging is set */
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
  /* Detect whether the buffers live in host or device memory, then dispatch */
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  ierr = (*sf->ops->BcastBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1464 
/*@C
   PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call to PetscSFBcastEnd()

   Collective on PetscSF

   Input Arguments:
+  sf - star forest on which to communicate
.  unit - data type associated with each node
.  rootmtype - memory type of rootdata
.  rootdata - buffer to broadcast
.  leafmtype - memory type of leafdata
-  op - operation to use for reduction

   Output Arguments:
.  leafdata - buffer to be reduced with values from each leaf's respective root

   Level: intermediate

.seealso: PetscSFBcastEnd(), PetscSFBcastBegin()
@*/
PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* SF-level event logging is suppressed when sf->vscat.logging is set */
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
  /* Caller-supplied memory types let us skip the pointer-attribute query done by PetscSFBcastBegin() */
  ierr = (*sf->ops->BcastBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1497 
1498 /*@C
1499    PetscSFBcastEnd - end a broadcast & reduce operation started with PetscSFBcastBegin()
1500 
1501    Collective
1502 
1503    Input Arguments:
1504 +  sf - star forest
1505 .  unit - data type
1506 .  rootdata - buffer to broadcast
1507 -  op - operation to use for reduction
1508 
1509    Output Arguments:
1510 .  leafdata - buffer to be reduced with values from each leaf's respective root
1511 
1512    Level: intermediate
1513 
1514 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1515 @*/
1516 PetscErrorCode PetscSFBcastEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1517 {
1518   PetscErrorCode ierr;
1519 
1520   PetscFunctionBegin;
1521   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1522   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastEnd,sf,0,0,0);CHKERRQ(ierr);}
1523   ierr = (*sf->ops->BcastEnd)(sf,unit,rootdata,leafdata,op);CHKERRQ(ierr);
1524   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastEnd,sf,0,0,0);CHKERRQ(ierr);}
1525   PetscFunctionReturn(0);
1526 }
1527 
1528 /*@C
1529    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1530 
1531    Collective
1532 
1533    Input Arguments:
1534 +  sf - star forest
1535 .  unit - data type
1536 .  leafdata - values to reduce
1537 -  op - reduction operation
1538 
1539    Output Arguments:
1540 .  rootdata - result of reduction of values from all leaves of each root
1541 
1542    Level: intermediate
1543 
1544    Notes:
1545     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
    are host pointers or device pointers, which may incur a noticeable cost. If you already knew the info, you should
1547     use PetscSFReduceWithMemTypeBegin() instead.
1548 
1549 .seealso: PetscSFBcastBegin(), PetscSFReduceWithMemTypeBegin()
1550 @*/
1551 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1552 {
1553   PetscErrorCode ierr;
1554   PetscMemType   rootmtype,leafmtype;
1555 
1556   PetscFunctionBegin;
1557   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1558   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1559   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1560   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1561   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1562   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1563   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1564   PetscFunctionReturn(0);
1565 }
1566 
1567 /*@C
1568    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to PetscSFReduceEnd()
1569 
1570    Collective
1571 
1572    Input Arguments:
1573 +  sf - star forest
1574 .  unit - data type
1575 .  leafmtype - memory type of leafdata
1576 .  leafdata - values to reduce
1577 .  rootmtype - memory type of rootdata
1578 -  op - reduction operation
1579 
1580    Output Arguments:
1581 .  rootdata - result of reduction of values from all leaves of each root
1582 
1583    Level: intermediate
1584 
1585 .seealso: PetscSFBcastBegin(), PetscSFReduceBegin()
1586 @*/
1587 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
1588 {
1589   PetscErrorCode ierr;
1590 
1591   PetscFunctionBegin;
1592   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1593   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1594   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1595   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1596   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1597   PetscFunctionReturn(0);
1598 }
1599 
1600 /*@C
1601    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1602 
1603    Collective
1604 
1605    Input Arguments:
1606 +  sf - star forest
1607 .  unit - data type
1608 .  leafdata - values to reduce
1609 -  op - reduction operation
1610 
1611    Output Arguments:
1612 .  rootdata - result of reduction of values from all leaves of each root
1613 
1614    Level: intermediate
1615 
1616 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1617 @*/
1618 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1619 {
1620   PetscErrorCode ierr;
1621 
1622   PetscFunctionBegin;
1623   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1624   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
1625   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
1626   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
1627   PetscFunctionReturn(0);
1628 }
1629 
1630 /*@C
1631    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1632 
1633    Collective
1634 
1635    Input Arguments:
1636 +  sf - star forest
1637 .  unit - data type
1638 .  leafdata - leaf values to use in reduction
1639 -  op - operation to use for reduction
1640 
1641    Output Arguments:
1642 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1643 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1644 
1645    Level: advanced
1646 
1647    Note:
1648    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1649    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1650    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1651    integers.
1652 
1653 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1654 @*/
1655 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1656 {
1657   PetscErrorCode ierr;
1658   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1659 
1660   PetscFunctionBegin;
1661   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1662   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1663   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1664   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1665   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1666   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1667   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1668   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1669   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1670   PetscFunctionReturn(0);
1671 }
1672 
1673 /*@C
1674    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1675 
1676    Collective
1677 
1678    Input Arguments:
1679 +  sf - star forest
1680 .  unit - data type
1681 .  leafdata - leaf values to use in reduction
1682 -  op - operation to use for reduction
1683 
1684    Output Arguments:
1685 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1686 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1687 
1688    Level: advanced
1689 
1690 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1691 @*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  /* Pure dispatch; memory types were determined and validated in PetscSFFetchAndOpBegin() */
  ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootdata,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1703 
1704 /*@C
1705    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()
1706 
1707    Collective
1708 
1709    Input Arguments:
1710 .  sf - star forest
1711 
1712    Output Arguments:
1713 .  degree - degree of each root vertex
1714 
1715    Level: advanced
1716 
1717    Notes:
1718    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1719 
1720 .seealso: PetscSFGatherBegin()
1721 @*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal;
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    maxlocal = sf->maxleaf-sf->minleaf+1; /* size of the leaf index range actually in use */
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    /* Summing a 1 from every leaf into its root yields the root degrees; completed in PetscSFComputeDegreeEnd().
       The -sf->minleaf offset makes the buffer addressable by absolute leaf index. */
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  /* Result is only available after PetscSFComputeDegreeEnd() */
  *degree = NULL;
  PetscFunctionReturn(0);
}
1743 
1744 /*@C
1745    PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()
1746 
1747    Collective
1748 
1749    Input Arguments:
1750 .  sf - star forest
1751 
1752    Output Arguments:
1753 .  degree - degree of each root vertex
1754 
1755    Level: developer
1756 
1757    Notes:
1758    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1759 
.seealso: PetscSFComputeDegreeBegin(), PetscSFGatherEnd()
1761 @*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    /* Finish the MPI_SUM reduction of per-leaf ones started in PetscSFComputeDegreeBegin() */
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    sf->degreeknown = PETSC_TRUE; /* cache so subsequent Begin/End pairs return immediately */
  }
  /* sf->degree stays owned by the SF and is freed in PetscSFDestroy() */
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1779 
1780 /*@C
1781    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1782    Each multi-root is assigned index of the corresponding original root.
1783 
1784    Collective
1785 
1786    Input Arguments:
1787 +  sf - star forest
1788 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1789 
1790    Output Arguments:
1791 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1792 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1793 
1794    Level: developer
1795 
1796    Notes:
1797    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1798 
1799 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1800 @*/
1801 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1802 {
1803   PetscSF             msf;
1804   PetscInt            i, j, k, nroots, nmroots;
1805   PetscErrorCode      ierr;
1806 
1807   PetscFunctionBegin;
1808   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1809   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1810   if (nroots) PetscValidIntPointer(degree,2);
1811   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1812   PetscValidPointer(multiRootsOrigNumbering,4);
1813   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1814   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1815   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1816   for (i=0,j=0,k=0; i<nroots; i++) {
1817     if (!degree[i]) continue;
1818     for (j=0; j<degree[i]; j++,k++) {
1819       (*multiRootsOrigNumbering)[k] = i;
1820     }
1821   }
1822   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1823   if (nMultiRoots) *nMultiRoots = nmroots;
1824   PetscFunctionReturn(0);
1825 }
1826 
1827 /*@C
1828    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1829 
1830    Collective
1831 
1832    Input Arguments:
1833 +  sf - star forest
1834 .  unit - data type
1835 -  leafdata - leaf data to gather to roots
1836 
1837    Output Argument:
1838 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1839 
1840    Level: intermediate
1841 
1842 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1843 @*/
1844 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1845 {
1846   PetscErrorCode ierr;
1847   PetscSF        multi = NULL;
1848 
1849   PetscFunctionBegin;
1850   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1851   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1852   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1853   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPI_REPLACE);CHKERRQ(ierr);
1854   PetscFunctionReturn(0);
1855 }
1856 
1857 /*@C
1858    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1859 
1860    Collective
1861 
1862    Input Arguments:
1863 +  sf - star forest
1864 .  unit - data type
1865 -  leafdata - leaf data to gather to roots
1866 
1867    Output Argument:
1868 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1869 
1870    Level: intermediate
1871 
1872 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1873 @*/
1874 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1875 {
1876   PetscErrorCode ierr;
1877   PetscSF        multi = NULL;
1878 
1879   PetscFunctionBegin;
1880   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1881   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1882   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPI_REPLACE);CHKERRQ(ierr);
1883   PetscFunctionReturn(0);
1884 }
1885 
1886 /*@C
1887    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1888 
1889    Collective
1890 
1891    Input Arguments:
1892 +  sf - star forest
1893 .  unit - data type
1894 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1895 
1896    Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root
1898 
1899    Level: intermediate
1900 
.seealso: PetscSFComputeDegreeBegin(), PetscSFScatterEnd(), PetscSFGatherBegin()
1902 @*/
1903 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1904 {
1905   PetscErrorCode ierr;
1906   PetscSF        multi = NULL;
1907 
1908   PetscFunctionBegin;
1909   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1910   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1911   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1912   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1913   PetscFunctionReturn(0);
1914 }
1915 
1916 /*@C
1917    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1918 
1919    Collective
1920 
1921    Input Arguments:
1922 +  sf - star forest
1923 .  unit - data type
1924 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1925 
1926    Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root
1928 
1929    Level: intermediate
1930 
.seealso: PetscSFComputeDegreeEnd(), PetscSFScatterBegin(), PetscSFGatherEnd()
1932 @*/
1933 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1934 {
1935   PetscErrorCode ierr;
1936   PetscSF        multi = NULL;
1937 
1938   PetscFunctionBegin;
1939   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1940   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1941   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1942   PetscFunctionReturn(0);
1943 }
1944 
1945 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1946 {
1947   PetscInt        i, n, nleaves;
1948   const PetscInt *ilocal = NULL;
1949   PetscHSetI      seen;
1950   PetscErrorCode  ierr;
1951 
1952   PetscFunctionBegin;
1953   if (PetscDefined(USE_DEBUG)) {
1954     ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1955     ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1956     for (i = 0; i < nleaves; i++) {
1957       const PetscInt leaf = ilocal ? ilocal[i] : i;
1958       ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1959     }
1960     ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1961     if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1962     ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1963   }
1964   PetscFunctionReturn(0);
1965 }
1966 
1967 /*@
1968   PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view
1969 
1970   Input Parameters:
1971 + sfA - The first PetscSF
1972 - sfB - The second PetscSF
1973 
1974   Output Parameters:
1975 . sfBA - The composite SF
1976 
1977   Level: developer
1978 
1979   Notes:
1980   Currently, the two SFs must be defined on congruent communicators and they must be true star
1981   forests, i.e. the same leaf is not connected with different roots.
1982 
1983   sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
1984   a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
1985   nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
1986   Bcast on sfA, then a Bcast on sfB, on connected nodes.
1987 
1988 .seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
1989 @*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  /* Composition assumes true star forests: repeated leaves would break the re-indexing below */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  /* Re-index sfA's remote root references by sfB's root numbering so a Bcast over sfB can carry
     them; entries left at rank == -1 mark sfB roots not reached by any leaf of sfA */
  if (localPointsA) {
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue; /* sfA leaf outside sfB's root space: no path to extend */
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA;
  }
  /* Push the (rank,index) root references of sfA down sfB's edges; afterwards leafdataB holds, for
     each leaf of sfB, the sfA root it is transitively connected to, or (-1,-1) if none.
     The -minleaf offset makes the buffer addressable by absolute leaf index. */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* denseB: sfB stores leaves contiguously (no ilocal) and every leaf found a root, in which case
     leafdataB can be handed to the new SF directly without compaction */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    localPointsBA  = NULL;
    remotePointsBA = leafdataB; /* ownership passes to the new SF below */
  } else {
    /* Compact: keep only leaves with a path to an sfA root, recording their local indices */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  /* PETSC_OWN_POINTER: the new SF takes ownership of localPointsBA and remotePointsBA */
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2056 
2057 /*@
2058   PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one
2059 
2060   Input Parameters:
2061 + sfA - The first PetscSF
2062 - sfB - The second PetscSF
2063 
2064   Output Parameters:
2065 . sfBA - The composite SF.
2066 
2067   Level: developer
2068 
2069   Notes:
2070   Currently, the two SFs must be defined on congruent communicators and they must be true star
2071   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
2072   second SF must have a degree of 1, i.e., no roots have more than one leaf connected.
2073 
2074   sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
2075   a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
2076   roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
2077   a Reduce on sfB, on connected roots.
2078 
2079 .seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
2080 @*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
  MPI_Op            op;
#if defined(PETSC_USE_64BIT_INDICES)
  PetscBool         iswin;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  /* Both SFs must be true star forests (unique leaves) for the index juggling below to be valid */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);

  /* TODO: Check roots of sfB have degree of 1 */
  /* Once we implement it, we can replace the MPI_MAXLOC
     with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
     We use MPI_MAXLOC only to have a deterministic output from this routine if
     the root condition is not met.
   */
  op = MPI_MAXLOC;
#if defined(PETSC_USE_64BIT_INDICES)
  /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
  ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
  if (iswin) op = MPI_REPLACE;
#endif

  /* Lay out sfA's remote root references over sfB's leaf index range; (-1,-1) marks slots in that
     range with no corresponding sfA leaf (no path to an sfA root) */
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue; /* sfA leaf outside sfB's leaf range */
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* ilocal==NULL: sfA's leaves are contiguous, the leaf index is simply i */
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Pull each sfA root reference up sfB's edges to sfB's roots; afterwards remotePointsBA[r] is the
     sfA root connected to sfB root r, or (-1,-1) if none */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place, dropping unconnected roots of sfB and recording their local indices */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  /* PETSC_OWN_POINTER: the new SF takes ownership of localPointsBA and remotePointsBA */
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2160 
2161 /*
2162   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
2163 
2164   Input Parameters:
2165 . sf - The global PetscSF
2166 
2167   Output Parameters:
2168 . out - The local PetscSF
2169  */
PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
{
  MPI_Comm           comm;
  PetscMPIInt        myrank;
  const PetscInt     *ilocal;
  const PetscSFNode  *iremote;
  PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
  PetscSFNode        *liremote;
  PetscSF            lsf;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (sf->ops->CreateLocalSF) {
    /* The implementation provides its own (presumably faster) way to extract the local SF */
    ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
  } else {
    /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
    ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(comm,&myrank);CHKERRMPI(ierr);

    /* Find out local edges and build a local SF */
    ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
    /* First pass: count the leaves whose root lives on this rank */
    for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
    ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
    ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);

    /* Second pass: copy the intra-process edges, remapping the remote rank to 0 (self) */
    for (i=j=0; i<nleaves; i++) {
      if (iremote[i].rank == (PetscInt)myrank) {
        lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
        liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
        liremote[j].index = iremote[i].index;
        j++;
      }
    }
    ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
    ierr = PetscSFSetFromOptions(lsf);CHKERRQ(ierr);
    /* PETSC_OWN_POINTER: lsf takes ownership of lilocal and liremote */
    ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
    *out = lsf;
  }
  PetscFunctionReturn(0);
}
2212 
2213 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2214 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
2215 {
2216   PetscErrorCode     ierr;
2217   PetscMemType       rootmtype,leafmtype;
2218 
2219   PetscFunctionBegin;
2220   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2221   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
2222   ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);
2223   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
2224   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
2225   if (sf->ops->BcastToZero) {
2226     ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
2227   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
2228   ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);
2229   PetscFunctionReturn(0);
2230 }
2231 
2232