xref: /petsc/src/vec/is/sf/interface/sf.c (revision efa12513287cff49a2b9648ae83199dcbfaad71a) !
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6   #include <cuda_runtime.h>
7 #endif
8 
9 #if defined(PETSC_HAVE_HIP)
10   #include <hip/hip_runtime.h>
11 #endif
12 
13 #if defined(PETSC_USE_DEBUG)
14 #  define PetscSFCheckGraphSet(sf,arg) do {                          \
15     if (PetscUnlikely(!(sf)->graphset))                              \
16       SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
17   } while (0)
18 #else
19 #  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
20 #endif
21 
22 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",NULL};
23 
/* PetscGetMemType - Classify a pointer as host or device memory.

   data - address to classify; may be NULL (reported as host)
   type - set to PETSC_MEMTYPE_HOST unless an initialized device runtime
          (CUDA or HIP) reports the pointer as device-resident

   Only queries a runtime that PETSc has actually initialized, so this is
   safe to call in host-only configurations. */
PETSC_STATIC_INLINE PetscErrorCode PetscGetMemType(const void *data,PetscMemType *type)
{
  PetscFunctionBegin;
  PetscValidPointer(type,2);
  *type = PETSC_MEMTYPE_HOST; /* default; upgraded below when a device runtime claims the pointer */
#if defined(PETSC_HAVE_CUDA)
  if (PetscCUDAInitialized && data) {
    cudaError_t                  cerr;
    struct cudaPointerAttributes attr;
    enum cudaMemoryType          mtype;
    cerr = cudaPointerGetAttributes(&attr,data); /* Do not check error since before CUDA 11.0, passing a host pointer returns cudaErrorInvalidValue */
    cudaGetLastError(); /* Reset the last error */
    #if (CUDART_VERSION < 10000)
      mtype = attr.memoryType; /* field renamed to 'type' in CUDA 10 */
    #else
      mtype = attr.type;
    #endif
    if (cerr == cudaSuccess && mtype == cudaMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif

#if defined(PETSC_HAVE_HIP)
  if (PetscHIPInitialized && data) {
    hipError_t                   cerr;
    struct hipPointerAttribute_t attr;
    enum hipMemoryType           mtype;
    cerr = hipPointerGetAttributes(&attr,data); /* as with CUDA, a host pointer may make this fail; do not error out */
    hipGetLastError(); /* Reset the last error */
    mtype = attr.memoryType;
    if (cerr == hipSuccess && mtype == hipMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif
  PetscFunctionReturn(0);
}
58 
59 /*@
60    PetscSFCreate - create a star forest communication context
61 
62    Collective
63 
64    Input Arguments:
65 .  comm - communicator on which the star forest will operate
66 
67    Output Arguments:
68 .  sf - new star forest context
69 
70    Options Database Keys:
71 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
72 .  -sf_type window    -Use MPI-3 one-sided window for communication
73 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
74 
75    Level: intermediate
76 
77    Notes:
78    When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
79    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
80    SFs are optimized and they have better performance than general SFs.
81 
82 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
83 @*/
84 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
85 {
86   PetscErrorCode ierr;
87   PetscSF        b;
88 
89   PetscFunctionBegin;
90   PetscValidPointer(sf,2);
91   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
92 
93   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
94 
95   b->nroots    = -1;
96   b->nleaves   = -1;
97   b->minleaf   = PETSC_MAX_INT;
98   b->maxleaf   = PETSC_MIN_INT;
99   b->nranks    = -1;
100   b->rankorder = PETSC_TRUE;
101   b->ingroup   = MPI_GROUP_NULL;
102   b->outgroup  = MPI_GROUP_NULL;
103   b->graphset  = PETSC_FALSE;
104 #if defined(PETSC_HAVE_DEVICE)
105   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
106   b->use_stream_aware_mpi = PETSC_FALSE;
107   b->use_default_stream   = PETSC_TRUE; /* The assumption is true for PETSc internal use of SF */
108   /* Set the default */
109   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
110     b->backend = PETSCSF_BACKEND_KOKKOS;
111   #elif defined(PETSC_HAVE_CUDA)
112     b->backend = PETSCSF_BACKEND_CUDA;
113   #elif defined(PETSC_HAVE_HIP)
114     b->backend = PETSCSF_BACKEND_HIP;
115   #endif
116 #endif
117   b->vscat.from_n = -1;
118   b->vscat.to_n   = -1;
119   b->vscat.unit   = MPIU_SCALAR;
120  *sf = b;
121   PetscFunctionReturn(0);
122 }
123 
/*@
   PetscSFReset - Reset a star forest so that different sizes or neighbors can be used

   Collective

   Input Arguments:
.  sf - star forest

   Level: advanced

.seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
@*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the implementation release its type-specific resources first */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Return the graph metadata to the "unset" state established in PetscSFCreate() */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  /* mine_alloc/remote_alloc are only non-NULL when this SF owns the arrays */
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
  /* Free device copies of rmine[], if any were allocated */
  for (PetscInt i=0; i<2; i++) {ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,sf->rmine_d[i]);CHKERRQ(ierr);}
#endif
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRMPI(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRMPI(ierr);}
  /* Break the back-pointer from the multi-SF before destroying it, avoiding recursive teardown */
  if (sf->multi) sf->multi->multi = NULL;
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
167 
168 /*@C
169    PetscSFSetType - Set the PetscSF communication implementation
170 
171    Collective on PetscSF
172 
173    Input Parameters:
174 +  sf - the PetscSF context
175 -  type - a known method
176 
177    Options Database Key:
178 .  -sf_type <type> - Sets the method; use -help for a list
179    of available methods (for instance, window, basic, neighbor)
180 
181    Notes:
182    See "include/petscsf.h" for available methods (for instance)
183 +    PETSCSFWINDOW - MPI-2/3 one-sided
184 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
185 
186   Level: intermediate
187 
188 .seealso: PetscSFType, PetscSFCreate()
189 @*/
190 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
191 {
192   PetscErrorCode ierr,(*r)(PetscSF);
193   PetscBool      match;
194 
195   PetscFunctionBegin;
196   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
197   PetscValidCharPointer(type,2);
198 
199   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
200   if (match) PetscFunctionReturn(0);
201 
202   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
203   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
204   /* Destroy the previous PetscSF implementation context */
205   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
206   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
207   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
208   ierr = (*r)(sf);CHKERRQ(ierr);
209   PetscFunctionReturn(0);
210 }
211 
212 /*@C
213   PetscSFGetType - Get the PetscSF communication implementation
214 
215   Not Collective
216 
217   Input Parameter:
218 . sf  - the PetscSF context
219 
220   Output Parameter:
221 . type - the PetscSF type name
222 
223   Level: intermediate
224 
225 .seealso: PetscSFSetType(), PetscSFCreate()
226 @*/
227 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
228 {
229   PetscFunctionBegin;
230   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
231   PetscValidPointer(type,2);
232   *type = ((PetscObject)sf)->type_name;
233   PetscFunctionReturn(0);
234 }
235 
236 /*@
237    PetscSFDestroy - destroy star forest
238 
239    Collective
240 
241    Input Arguments:
242 .  sf - address of star forest
243 
244    Level: intermediate
245 
246 .seealso: PetscSFCreate(), PetscSFReset()
247 @*/
248 PetscErrorCode PetscSFDestroy(PetscSF *sf)
249 {
250   PetscErrorCode ierr;
251 
252   PetscFunctionBegin;
253   if (!*sf) PetscFunctionReturn(0);
254   PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
255   if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
256   ierr = PetscSFReset(*sf);CHKERRQ(ierr);
257   if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
258   ierr = PetscSFDestroy(&(*sf)->vscat.lsf);CHKERRQ(ierr);
259   if ((*sf)->vscat.bs > 1) {ierr = MPI_Type_free(&(*sf)->vscat.unit);CHKERRQ(ierr);}
260   ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
261   PetscFunctionReturn(0);
262 }
263 
264 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
265 {
266   PetscInt           i, nleaves;
267   PetscMPIInt        size;
268   const PetscInt    *ilocal;
269   const PetscSFNode *iremote;
270   PetscErrorCode     ierr;
271 
272   PetscFunctionBegin;
273   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
274   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
275   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
276   for (i = 0; i < nleaves; i++) {
277     const PetscInt rank = iremote[i].rank;
278     const PetscInt remote = iremote[i].index;
279     const PetscInt leaf = ilocal ? ilocal[i] : i;
280     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
281     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
282     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
283   }
284   PetscFunctionReturn(0);
285 }
286 
287 /*@
288    PetscSFSetUp - set up communication structures
289 
290    Collective
291 
292    Input Arguments:
293 .  sf - star forest communication object
294 
295    Level: beginner
296 
297 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
298 @*/
299 PetscErrorCode PetscSFSetUp(PetscSF sf)
300 {
301   PetscErrorCode ierr;
302 
303   PetscFunctionBegin;
304   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
305   PetscSFCheckGraphSet(sf,1);
306   if (sf->setupcalled) PetscFunctionReturn(0);
307   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
308   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
309   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);} /* Zero all sf->ops */
310   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
311 #if defined(PETSC_HAVE_CUDA)
312   if (sf->backend == PETSCSF_BACKEND_CUDA) {
313     sf->ops->Malloc = PetscSFMalloc_Cuda;
314     sf->ops->Free   = PetscSFFree_Cuda;
315   }
316 #endif
317 #if defined(PETSC_HAVE_HIP)
318   if (sf->backend == PETSCSF_BACKEND_HIP) {
319     sf->ops->Malloc = PetscSFMalloc_HIP;
320     sf->ops->Free   = PetscSFFree_HIP;
321   }
322 #endif
323 
324 #
325 #if defined(PETSC_HAVE_KOKKOS)
326   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
327     sf->ops->Malloc = PetscSFMalloc_Kokkos;
328     sf->ops->Free   = PetscSFFree_Kokkos;
329   }
330 #endif
331   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
332   sf->setupcalled = PETSC_TRUE;
333   PetscFunctionReturn(0);
334 }
335 
/*@
   PetscSFSetFromOptions - set PetscSF options using the options database

   Logically Collective

   Input Arguments:
.  sf - star forest

   Options Database Keys:
+  -sf_type               - implementation type, see PetscSFSetType()
.  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
.  -sf_use_default_stream - Assume callers of SF computed the input root/leafdata with the default cuda stream. SF will also
                            use the default stream to process data. Therefore, no stream synchronization is needed between SF and its caller (default: true).
                            If true, this option only works with -use_gpu_aware_mpi 1.
.  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
                               If true, this option only works with -use_gpu_aware_mpi 1.

-  -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently SF has these backends: cuda, hip and Kokkos.
                              On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
                              the only available is kokkos.

   Level: intermediate
@*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Default to the current type when set, otherwise to the basic implementation */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
  {
    char        backendstr[32] = {0};
    PetscBool   isCuda = PETSC_FALSE,isHip = PETSC_FALSE,isKokkos = PETSC_FALSE,set;
    /* Change the defaults set in PetscSFCreate() with command line options */
    ierr = PetscOptionsBool("-sf_use_default_stream","Assume SF's input and output root/leafdata is computed on the default stream","PetscSFSetFromOptions",sf->use_default_stream,&sf->use_default_stream,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-sf_use_stream_aware_mpi","Assume the underlying MPI is cuda-stream aware","PetscSFSetFromOptions",sf->use_stream_aware_mpi,&sf->use_stream_aware_mpi,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsString("-sf_backend","Select the device backend SF uses","PetscSFSetFromOptions",NULL,backendstr,sizeof(backendstr),&set);CHKERRQ(ierr);
    /* Case-insensitive match against the three recognized backend names */
    ierr = PetscStrcasecmp("cuda",backendstr,&isCuda);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("kokkos",backendstr,&isKokkos);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("hip",backendstr,&isHip);CHKERRQ(ierr);
  #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
    if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
    else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
    else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
    else if (set) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
  #elif defined(PETSC_HAVE_KOKKOS)
    /* Kokkos-only build: kokkos is the only acceptable backend name */
    if (set && !isKokkos) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You can only choose kokkos", backendstr);
  #endif
  }
#endif
  /* NOTE(review): PetscOptionsObject is presumably introduced by the PetscObjectOptionsBegin() macro expansion — confirm */
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
398 
399 /*@
400    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
401 
402    Logically Collective
403 
404    Input Arguments:
405 +  sf - star forest
406 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
407 
408    Level: advanced
409 
410 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
411 @*/
412 PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
413 {
414   PetscFunctionBegin;
415   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
416   PetscValidLogicalCollectiveBool(sf,flg,2);
417   if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
418   sf->rankorder = flg;
419   PetscFunctionReturn(0);
420 }
421 
/*@
   PetscSFSetGraph - Set a parallel star forest

   Collective

   Input Arguments:
+  sf - star forest
.  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
.  nleaves - number of leaf vertices on the current process, each of these references a root on any process
.  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
during setup in debug mode)
.  localmode - copy mode for ilocal
.  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
during setup in debug mode)
-  remotemode - copy mode for iremote

   Level: intermediate

   Notes:
    In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode

   Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
   encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
   needed

   Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
   indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).

.seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
@*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    /* Scan leaf locations for their range, and detect the identity permutation
       (which is equivalent to contiguous storage and can drop ilocal entirely) */
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      if (localmode == PETSC_OWN_POINTER) {
        /* We own the array but will not keep it; release it now */
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* Contiguous storage: leaves occupy [0, nleaves); empty graphs get range [0,-1] */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  if (ilocal) {
    /* mine_alloc is the owned allocation (NULL for PETSC_USE_POINTER); mine is what we read */
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same ownership convention for the remote (root) side */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
535 
/*@
  PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern

  Collective

  Input Parameters:
+ sf      - The PetscSF
. map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
- pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL

  Notes:
  It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
  n and N are local and global sizes of x respectively.

  With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
  sequential vectors y on all ranks.

  With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
  sequential vector y on rank 0.

  In above cases, entries of x are roots and entries of y are leaves.

  With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
  creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
  of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one can not send multiple items. One just needs to create a new MPI datatype for the mulptiple data
  items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.

  In this case, roots and leaves are symmetric.

  Level: intermediate
 @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* ALLTOALL ignores the caller's map: build a size-by-size^2 layout internally */
    type = PETSCSFALLTOALL;
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it
       (a single MPI_MAX over {n,-n} detects uniformity: max(-n) == -min(n)) */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves; other ranks have an empty leaf range [0,-1] */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
633 
/*@
   PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map

   Collective

   Input Arguments:
.  sf - star forest to invert

   Output Arguments:
.  isf - inverse of sf

   Level: advanced

   Notes:
   All roots must have degree 1.

   The local space may be a permutation, but cannot be sparse.

.seealso: PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Each leaf advertises its own (rank, local index) ... */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* ... while roots start as (-1,-1) so untouched roots are detectable afterwards */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  /* Send each leaf's identity to its root; afterwards roots[] holds the inverse edges */
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL;
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* newilocal ownership transfers to the new SF (PETSC_OWN_POINTER); roots is copied and freed below */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
703 
704 /*@
705    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
706 
707    Collective
708 
709    Input Arguments:
710 +  sf - communication object to duplicate
711 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
712 
713    Output Arguments:
714 .  newsf - new communication object
715 
716    Level: beginner
717 
718 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
719 @*/
720 PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
721 {
722   PetscSFType    type;
723   PetscErrorCode ierr;
724   MPI_Datatype   dtype=MPIU_SCALAR;
725 
726   PetscFunctionBegin;
727   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
728   PetscValidLogicalCollectiveEnum(sf,opt,2);
729   PetscValidPointer(newsf,3);
730   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
731   ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
732   if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
733   if (opt == PETSCSF_DUPLICATE_GRAPH) {
734     PetscSFCheckGraphSet(sf,1);
735     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
736       PetscInt          nroots,nleaves;
737       const PetscInt    *ilocal;
738       const PetscSFNode *iremote;
739       ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
740       ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
741     } else {
742       ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
743     }
744   }
745   /* Since oldtype is committed, so is newtype, according to MPI */
746   if (sf->vscat.bs > 1) {ierr = MPI_Type_dup(sf->vscat.unit,&dtype);CHKERRQ(ierr);}
747   (*newsf)->vscat.bs     = sf->vscat.bs;
748   (*newsf)->vscat.unit   = dtype;
749   (*newsf)->vscat.to_n   = sf->vscat.to_n;
750   (*newsf)->vscat.from_n = sf->vscat.from_n;
751   /* Do not copy lsf. Build it on demand since it is rarely used */
752 
753 #if defined(PETSC_HAVE_DEVICE)
754   (*newsf)->backend              = sf->backend;
755   (*newsf)->use_default_stream   = sf->use_default_stream;
756   (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
757   (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
758 #endif
759   if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
760   /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
761   PetscFunctionReturn(0);
762 }
763 
764 /*@C
765    PetscSFGetGraph - Get the graph specifying a parallel star forest
766 
767    Not Collective
768 
769    Input Arguments:
770 .  sf - star forest
771 
772    Output Arguments:
773 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
774 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
775 .  ilocal - locations of leaves in leafdata buffers
776 -  iremote - remote locations of root vertices for each leaf on the current process
777 
778    Notes:
779    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
780 
781    When called from Fortran, the returned iremote array is a copy and must be deallocated after use. Consequently, if you
782    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
783 
784    Level: intermediate
785 
786 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
787 @*/
788 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
789 {
790   PetscErrorCode ierr;
791 
792   PetscFunctionBegin;
793   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
794   if (sf->ops->GetGraph) {
795     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
796   } else {
797     if (nroots) *nroots = sf->nroots;
798     if (nleaves) *nleaves = sf->nleaves;
799     if (ilocal) *ilocal = sf->mine;
800     if (iremote) *iremote = sf->remote;
801   }
802   PetscFunctionReturn(0);
803 }
804 
805 /*@
806    PetscSFGetLeafRange - Get the active leaf ranges
807 
808    Not Collective
809 
810    Input Arguments:
811 .  sf - star forest
812 
813    Output Arguments:
814 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
815 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
816 
817    Level: developer
818 
819 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
820 @*/
821 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
822 {
823   PetscFunctionBegin;
824   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
825   PetscSFCheckGraphSet(sf,1);
826   if (minleaf) *minleaf = sf->minleaf;
827   if (maxleaf) *maxleaf = sf->maxleaf;
828   PetscFunctionReturn(0);
829 }
830 
831 /*@C
832    PetscSFViewFromOptions - View from Options
833 
834    Collective on PetscSF
835 
836    Input Parameters:
837 +  A - the star forest
838 .  obj - Optional object
839 -  name - command line option
840 
841    Level: intermediate
842 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
843 @*/
844 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
845 {
846   PetscErrorCode ierr;
847 
848   PetscFunctionBegin;
849   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
850   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
851   PetscFunctionReturn(0);
852 }
853 
854 /*@C
855    PetscSFView - view a star forest
856 
857    Collective
858 
859    Input Arguments:
860 +  sf - star forest
861 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
862 
863    Level: beginner
864 
865 .seealso: PetscSFCreate(), PetscSFSetGraph()
866 @*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to stdout on the SF's communicator when no viewer is supplied */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Set up first so the per-rank tables (ranks/roffset/rmine/rremote) printed in detail mode exist */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    /* Give the implementation a chance to print type-specific information first */
    if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      /* Synchronized printing keeps the per-rank output ordered */
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        /* NULL sf->mine means contiguous leaves: leaf i lives at location i */
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Print edges grouped by remote rank in sorted order; sort a copy via a
           permutation so sf->ranks itself is left untouched */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
924 
925 /*@C
926    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
927 
928    Not Collective
929 
930    Input Arguments:
931 .  sf - star forest
932 
933    Output Arguments:
934 +  nranks - number of ranks referenced by local part
935 .  ranks - array of ranks
936 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
937 .  rmine - concatenated array holding local indices referencing each remote rank
938 -  rremote - concatenated array holding remote indices referenced for each remote rank
939 
940    Level: developer
941 
942 .seealso: PetscSFGetLeafRanks()
943 @*/
944 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
945 {
946   PetscErrorCode ierr;
947 
948   PetscFunctionBegin;
949   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
950   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
951   if (sf->ops->GetRootRanks) {
952     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
953   } else {
954     /* The generic implementation */
955     if (nranks)  *nranks  = sf->nranks;
956     if (ranks)   *ranks   = sf->ranks;
957     if (roffset) *roffset = sf->roffset;
958     if (rmine)   *rmine   = sf->rmine;
959     if (rremote) *rremote = sf->rremote;
960   }
961   PetscFunctionReturn(0);
962 }
963 
964 /*@C
965    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
966 
967    Not Collective
968 
969    Input Arguments:
970 .  sf - star forest
971 
972    Output Arguments:
973 +  niranks - number of leaf ranks referencing roots on this process
974 .  iranks - array of ranks
975 .  ioffset - offset in irootloc for each rank (length niranks+1)
976 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
977 
978    Level: developer
979 
980 .seealso: PetscSFGetRootRanks()
981 @*/
982 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
983 {
984   PetscErrorCode ierr;
985 
986   PetscFunctionBegin;
987   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
988   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
989   if (sf->ops->GetLeafRanks) {
990     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
991   } else {
992     PetscSFType type;
993     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
994     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
995   }
996   PetscFunctionReturn(0);
997 }
998 
999 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
1000   PetscInt i;
1001   for (i=0; i<n; i++) {
1002     if (needle == list[i]) return PETSC_TRUE;
1003   }
1004   return PETSC_FALSE;
1005 }
1006 
1007 /*@C
1008    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.
1009 
1010    Collective
1011 
1012    Input Arguments:
1013 +  sf - PetscSF to set up; PetscSFSetGraph() must have been called
1014 -  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)
1015 
1016    Level: developer
1017 
1018 .seealso: PetscSFGetRootRanks()
1019 @*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
  /* Count, per remote rank, how many of our leaves reference it */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  /* Drain the table into parallel arrays: ranks[] and the leaf count per rank rcount[] */
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate dgroup's members into rank numbers of this SF's communicator */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRMPI(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRMPI(ierr);}
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  /* Two-pointer swap scheme, like the partition step of quicksort: sf->ndranks advances
     past distinguished entries from the front while i retreats past non-distinguished
     entries from the back; mismatched pairs are swapped (rcount[] moves with ranks[]). */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i;) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for (; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition by rank; PetscFindMPIInt() below searches them separately */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Build CSR-style offsets into rmine/rremote; rcount[] is then reset for reuse as a fill cursor */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Scatter each leaf into the slot block of its remote rank; orank caches the last
     looked-up rank so consecutive leaves to the same rank skip the search */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1110 
1111 /*@C
1112    PetscSFGetGroups - gets incoming and outgoing process groups
1113 
1114    Collective
1115 
1116    Input Argument:
1117 .  sf - star forest
1118 
1119    Output Arguments:
1120 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1121 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1122 
1123    Level: developer
1124 
1125 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
1126 @*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  /* Both groups are built lazily and cached on the SF */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks */
    /* bgcount: an auxiliary SF with one root here and one leaf per rank we reference,
       so indegree[0] counts processes whose leaves reference our roots */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
    /* Enumerate the incoming ranks */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
    /* Every process sends its own rank; the gather collects the ranks that reach us */
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) {
    /* Outgoing ranks are exactly sf->ranks, already computed by PetscSFSetUpRanks() */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1173 
1174 /*@
   PetscSFGetMultiSF - gets the inner SF implementing gathers and scatters
1176 
1177    Collective
1178 
1179    Input Argument:
1180 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1181 
1182    Output Arguments:
1183 .  multi - star forest with split roots, such that each root has degree exactly 1
1184 
1185    Level: developer
1186 
1187    Notes:
1188 
1189    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1190    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1191    edge, it is a candidate for future optimization that might involve its removal.
1192 
1193 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1194 @*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    sf->multi->multi = sf->multi; /* a multi-SF is its own multi-SF */
    PetscFunctionReturn(0);
  }
  if (!sf->multi) { /* Build and cache the multi-SF on first request */
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    /* indegree[r] = number of leaves referencing root r */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    /* Prefix-sum the degrees: inoffset[r] is the first split root belonging to original root r */
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    /* Fetch-and-add of 1 hands each leaf a distinct split-root index under its root */
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
    if (PetscDefined(USE_DEBUG)) { /* Check that the expected number of increments occurred */
      for (i=0; i<sf->nroots; i++) {
        if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
      }
    }
    /* Each leaf keeps its remote rank but now points at its private split root */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    sf->multi->multi = sf->multi;
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      /* Collect, at each split root, the rank of the leaf that owns it */
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Tell each leaf its rank-sorted split-root index, then rebuild the graph with it */
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1267 
1268 /*@C
1269    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1270 
1271    Collective
1272 
1273    Input Arguments:
1274 +  sf - original star forest
1275 .  nselected  - number of selected roots on this process
1276 -  selected   - indices of the selected roots on this process
1277 
1278    Output Arguments:
1279 .  esf - new star forest
1280 
1281    Level: advanced
1282 
1283    Note:
1284    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1285    be done by calling PetscSFGetGraph().
1286 
1287 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1288 @*/
1289 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *esf)
1290 {
1291   PetscInt          i,j,n,nroots,nleaves,esf_nleaves,*new_ilocal,minleaf,maxleaf,maxlocal;
1292   const PetscInt    *ilocal;
1293   signed char       *rootdata,*leafdata,*leafmem;
1294   const PetscSFNode *iremote;
1295   PetscSFNode       *new_iremote;
1296   MPI_Comm          comm;
1297   PetscErrorCode    ierr;
1298 
1299   PetscFunctionBegin;
1300   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1301   PetscSFCheckGraphSet(sf,1);
1302   if (nselected) PetscValidPointer(selected,3);
1303   PetscValidPointer(esf,4);
1304 
1305   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1306   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1307   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1308   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1309 
1310   if (PetscDefined(USE_DEBUG)) {  /* Error out if selected[] has dups or  out of range indices */
1311     PetscBool dups;
1312     ierr = PetscCheckDupsInt(nselected,selected,&dups);CHKERRQ(ierr);
1313     if (dups) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"selected[] has dups");
1314     for (i=0; i<nselected; i++)
1315       if (selected[i] < 0 || selected[i] >= nroots) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"selected root indice %D is out of [0,%D)",selected[i],nroots);
1316   }
1317 
1318   if (sf->ops->CreateEmbeddedSF) {
1319     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,selected,esf);CHKERRQ(ierr);
1320   } else {
1321     /* A generic version of creating embedded sf */
1322     ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
1323     maxlocal = maxleaf - minleaf + 1;
1324     ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafmem);CHKERRQ(ierr);
1325     leafdata = leafmem - minleaf;
1326     /* Tag selected roots and bcast to leaves */
1327     for (i=0; i<nselected; i++) rootdata[selected[i]] = 1;
1328     ierr = PetscSFBcastBegin(sf,MPI_SIGNED_CHAR,rootdata,leafdata);CHKERRQ(ierr);
1329     ierr = PetscSFBcastEnd(sf,MPI_SIGNED_CHAR,rootdata,leafdata);CHKERRQ(ierr);
1330 
1331     /* Build esf with leaves that are still connected */
1332     esf_nleaves = 0;
1333     for (i=0; i<nleaves; i++) {
1334       j = ilocal ? ilocal[i] : i;
1335       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1336          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1337       */
1338       esf_nleaves += (leafdata[j] ? 1 : 0);
1339     }
1340     ierr = PetscMalloc1(esf_nleaves,&new_ilocal);CHKERRQ(ierr);
1341     ierr = PetscMalloc1(esf_nleaves,&new_iremote);CHKERRQ(ierr);
1342     for (i=n=0; i<nleaves; i++) {
1343       j = ilocal ? ilocal[i] : i;
1344       if (leafdata[j]) {
1345         new_ilocal[n]        = j;
1346         new_iremote[n].rank  = iremote[i].rank;
1347         new_iremote[n].index = iremote[i].index;
1348         ++n;
1349       }
1350     }
1351     ierr = PetscSFCreate(comm,esf);CHKERRQ(ierr);
1352     ierr = PetscSFSetFromOptions(*esf);CHKERRQ(ierr);
1353     ierr = PetscSFSetGraph(*esf,nroots,esf_nleaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1354     ierr = PetscFree2(rootdata,leafmem);CHKERRQ(ierr);
1355   }
1356   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1357   PetscFunctionReturn(0);
1358 }
1359 
1360 /*@C
1361   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1362 
1363   Collective
1364 
1365   Input Arguments:
1366 + sf - original star forest
1367 . nselected  - number of selected leaves on this process
1368 - selected   - indices of the selected leaves on this process
1369 
1370   Output Arguments:
1371 .  newsf - new star forest
1372 
1373   Level: advanced
1374 
1375 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1376 @*/
1377 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1378 {
1379   const PetscSFNode *iremote;
1380   PetscSFNode       *new_iremote;
1381   const PetscInt    *ilocal;
1382   PetscInt          i,nroots,*leaves,*new_ilocal;
1383   MPI_Comm          comm;
1384   PetscErrorCode    ierr;
1385 
1386   PetscFunctionBegin;
1387   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1388   PetscSFCheckGraphSet(sf,1);
1389   if (nselected) PetscValidPointer(selected,3);
1390   PetscValidPointer(newsf,4);
1391 
1392   /* Uniq selected[] and put results in leaves[] */
1393   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1394   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1395   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1396   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1397   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1398 
1399   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1400   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1401     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1402   } else {
1403     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1404     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1405     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1406     for (i=0; i<nselected; ++i) {
1407       const PetscInt l     = leaves[i];
1408       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1409       new_iremote[i].rank  = iremote[l].rank;
1410       new_iremote[i].index = iremote[l].index;
1411     }
1412     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1413     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1414   }
1415   ierr = PetscFree(leaves);CHKERRQ(ierr);
1416   PetscFunctionReturn(0);
1417 }
1418 
1419 /*@C
1420    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1421 
1422    Collective on PetscSF
1423 
1424    Input Arguments:
1425 +  sf - star forest on which to communicate
1426 .  unit - data type associated with each node
1427 .  rootdata - buffer to broadcast
1428 -  op - operation to use for reduction
1429 
1430    Output Arguments:
1431 .  leafdata - buffer to be reduced with values from each leaf's respective root
1432 
1433    Level: intermediate
1434 
1435    Notes:
1436     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1437     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1438     use PetscSFBcastAndOpWithMemTypeBegin() instead.
1439 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd(), PetscSFBcastAndOpWithMemTypeBegin()
1440 @*/
1441 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1442 {
1443   PetscErrorCode ierr;
1444   PetscMemType   rootmtype,leafmtype;
1445 
1446   PetscFunctionBegin;
1447   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1448   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1449   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1450   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1451   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1452   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1453   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1454   PetscFunctionReturn(0);
1455 }
1456 
1457 /*@C
1458    PetscSFBcastAndOpWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call to PetscSFBcastAndOpEnd()
1459 
1460    Collective on PetscSF
1461 
1462    Input Arguments:
1463 +  sf - star forest on which to communicate
1464 .  unit - data type associated with each node
1465 .  rootmtype - memory type of rootdata
1466 .  rootdata - buffer to broadcast
1467 .  leafmtype - memory type of leafdata
1468 -  op - operation to use for reduction
1469 
1470    Output Arguments:
1471 .  leafdata - buffer to be reduced with values from each leaf's respective root
1472 
1473    Level: intermediate
1474 
1475 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd(),PetscSFBcastAndOpBegin()
1476 @*/
1477 PetscErrorCode PetscSFBcastAndOpWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
1478 {
1479   PetscErrorCode ierr;
1480 
1481   PetscFunctionBegin;
1482   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1483   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1484   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1485   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1486   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1487   PetscFunctionReturn(0);
1488 }
1489 
1490 /*@C
1491    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1492 
1493    Collective
1494 
1495    Input Arguments:
1496 +  sf - star forest
1497 .  unit - data type
1498 .  rootdata - buffer to broadcast
1499 -  op - operation to use for reduction
1500 
1501    Output Arguments:
1502 .  leafdata - buffer to be reduced with values from each leaf's respective root
1503 
1504    Level: intermediate
1505 
1506 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1507 @*/
1508 PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1509 {
1510   PetscErrorCode ierr;
1511 
1512   PetscFunctionBegin;
1513   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1514   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);}
1515   ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootdata,leafdata,op);CHKERRQ(ierr);
1516   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);}
1517   PetscFunctionReturn(0);
1518 }
1519 
1520 /*@C
1521    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1522 
1523    Collective
1524 
1525    Input Arguments:
1526 +  sf - star forest
1527 .  unit - data type
1528 .  leafdata - values to reduce
1529 -  op - reduction operation
1530 
1531    Output Arguments:
1532 .  rootdata - result of reduction of values from all leaves of each root
1533 
1534    Level: intermediate
1535 
1536    Notes:
1537     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1538     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1539     use PetscSFReduceWithMemTypeBegin() instead.
1540 
1541 .seealso: PetscSFBcastBegin(), PetscSFReduceWithMemTypeBegin()
1542 @*/
1543 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1544 {
1545   PetscErrorCode ierr;
1546   PetscMemType   rootmtype,leafmtype;
1547 
1548   PetscFunctionBegin;
1549   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1550   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1551   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1552   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1553   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1554   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1555   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1556   PetscFunctionReturn(0);
1557 }
1558 
1559 /*@C
1560    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to PetscSFReduceEnd()
1561 
1562    Collective
1563 
1564    Input Arguments:
1565 +  sf - star forest
1566 .  unit - data type
1567 .  leafmtype - memory type of leafdata
1568 .  leafdata - values to reduce
1569 .  rootmtype - memory type of rootdata
1570 -  op - reduction operation
1571 
1572    Output Arguments:
1573 .  rootdata - result of reduction of values from all leaves of each root
1574 
1575    Level: intermediate
1576 
1577 .seealso: PetscSFBcastBegin(), PetscSFReduceBegin()
1578 @*/
1579 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
1580 {
1581   PetscErrorCode ierr;
1582 
1583   PetscFunctionBegin;
1584   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1585   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1586   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1587   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1588   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1589   PetscFunctionReturn(0);
1590 }
1591 
1592 /*@C
1593    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1594 
1595    Collective
1596 
1597    Input Arguments:
1598 +  sf - star forest
1599 .  unit - data type
1600 .  leafdata - values to reduce
1601 -  op - reduction operation
1602 
1603    Output Arguments:
1604 .  rootdata - result of reduction of values from all leaves of each root
1605 
1606    Level: intermediate
1607 
1608 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1609 @*/
1610 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1611 {
1612   PetscErrorCode ierr;
1613 
1614   PetscFunctionBegin;
1615   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1616   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
1617   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
1618   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
1619   PetscFunctionReturn(0);
1620 }
1621 
1622 /*@C
1623    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1624 
1625    Collective
1626 
1627    Input Arguments:
1628 +  sf - star forest
1629 .  unit - data type
1630 .  leafdata - leaf values to use in reduction
1631 -  op - operation to use for reduction
1632 
1633    Output Arguments:
1634 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1635 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1636 
1637    Level: advanced
1638 
1639    Note:
1640    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1641    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1642    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1643    integers.
1644 
1645 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1646 @*/
1647 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1648 {
1649   PetscErrorCode ierr;
1650   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1651 
1652   PetscFunctionBegin;
1653   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1654   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1655   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1656   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1657   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1658   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1659   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1660   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1661   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1662   PetscFunctionReturn(0);
1663 }
1664 
/*@C
   PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Arguments:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

.seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  /* Complete the fetch-and-op communication started by PetscSFFetchAndOpBegin() in the type-specific implementation */
  ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootdata,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1695 
1696 /*@C
1697    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()
1698 
1699    Collective
1700 
1701    Input Arguments:
1702 .  sf - star forest
1703 
1704    Output Arguments:
1705 .  degree - degree of each root vertex
1706 
1707    Level: advanced
1708 
1709    Notes:
1710    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1711 
1712 .seealso: PetscSFGatherBegin()
1713 @*/
1714 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
1715 {
1716   PetscErrorCode ierr;
1717 
1718   PetscFunctionBegin;
1719   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1720   PetscSFCheckGraphSet(sf,1);
1721   PetscValidPointer(degree,2);
1722   if (!sf->degreeknown) {
1723     PetscInt i, nroots = sf->nroots, maxlocal;
1724     if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1725     maxlocal = sf->maxleaf-sf->minleaf+1;
1726     ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
1727     ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1728     for (i=0; i<nroots; i++) sf->degree[i] = 0;
1729     for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
1730     ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
1731   }
1732   *degree = NULL;
1733   PetscFunctionReturn(0);
1734 }
1735 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: developer

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFComputeDegreeBegin()
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    /* degreetmp was allocated (with at least one entry) in PetscSFComputeDegreeBegin(); its absence means Begin was never called */
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    /* Finish the MPI_SUM reduction of ones started in Begin; sf->degree now holds each root's leaf count */
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    sf->degreeknown = PETSC_TRUE; /* cache the result; later calls return it without communication */
  }
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1771 
1772 
1773 /*@C
1774    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1775    Each multi-root is assigned index of the corresponding original root.
1776 
1777    Collective
1778 
1779    Input Arguments:
1780 +  sf - star forest
1781 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1782 
1783    Output Arguments:
1784 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1785 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1786 
1787    Level: developer
1788 
1789    Notes:
1790    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1791 
1792 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1793 @*/
1794 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1795 {
1796   PetscSF             msf;
1797   PetscInt            i, j, k, nroots, nmroots;
1798   PetscErrorCode      ierr;
1799 
1800   PetscFunctionBegin;
1801   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1802   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1803   if (nroots) PetscValidIntPointer(degree,2);
1804   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1805   PetscValidPointer(multiRootsOrigNumbering,4);
1806   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1807   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1808   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1809   for (i=0,j=0,k=0; i<nroots; i++) {
1810     if (!degree[i]) continue;
1811     for (j=0; j<degree[i]; j++,k++) {
1812       (*multiRootsOrigNumbering)[k] = i;
1813     }
1814   }
1815   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1816   if (nMultiRoots) *nMultiRoots = nmroots;
1817   PetscFunctionReturn(0);
1818 }
1819 
1820 /*@C
1821    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1822 
1823    Collective
1824 
1825    Input Arguments:
1826 +  sf - star forest
1827 .  unit - data type
1828 -  leafdata - leaf data to gather to roots
1829 
1830    Output Argument:
1831 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1832 
1833    Level: intermediate
1834 
1835 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1836 @*/
1837 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1838 {
1839   PetscErrorCode ierr;
1840   PetscSF        multi = NULL;
1841 
1842   PetscFunctionBegin;
1843   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1844   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1845   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1846   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1847   PetscFunctionReturn(0);
1848 }
1849 
1850 /*@C
1851    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1852 
1853    Collective
1854 
1855    Input Arguments:
1856 +  sf - star forest
1857 .  unit - data type
1858 -  leafdata - leaf data to gather to roots
1859 
1860    Output Argument:
1861 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1862 
1863    Level: intermediate
1864 
1865 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1866 @*/
1867 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1868 {
1869   PetscErrorCode ierr;
1870   PetscSF        multi = NULL;
1871 
1872   PetscFunctionBegin;
1873   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1874   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1875   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1876   PetscFunctionReturn(0);
1877 }
1878 
/*@C
   PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeBegin(), PetscSFScatterEnd(), PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* A scatter is a broadcast over the multi-SF, delivering each leaf its personal slot of the multi-root buffer */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1908 
/*@C
   PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeEnd(), PetscSFScatterBegin(), PetscSFGatherEnd()
@*/
PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Complete the broadcast over the multi-SF started in PetscSFScatterBegin() */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1937 
1938 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1939 {
1940   PetscInt        i, n, nleaves;
1941   const PetscInt *ilocal = NULL;
1942   PetscHSetI      seen;
1943   PetscErrorCode  ierr;
1944 
1945   PetscFunctionBegin;
1946   if (PetscDefined(USE_DEBUG)) {
1947     ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1948     ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1949     for (i = 0; i < nleaves; i++) {
1950       const PetscInt leaf = ilocal ? ilocal[i] : i;
1951       ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1952     }
1953     ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1954     if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1955     ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1956   }
1957   PetscFunctionReturn(0);
1958 }
1959 
1960 /*@
1961   PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view
1962 
1963   Input Parameters:
1964 + sfA - The first PetscSF
1965 - sfB - The second PetscSF
1966 
1967   Output Parameters:
1968 . sfBA - The composite SF
1969 
1970   Level: developer
1971 
1972   Notes:
1973   Currently, the two SFs must be defined on congruent communicators and they must be true star
1974   forests, i.e. the same leaf is not connected with different roots.
1975 
1976   sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
1977   a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
1978   nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
1979   Bcast on sfA, then a Bcast on sfB, on connected nodes.
1980 
1981 .seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
1982 @*/
1983 PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
1984 {
1985   PetscErrorCode    ierr;
1986   const PetscSFNode *remotePointsA,*remotePointsB;
1987   PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
1988   const PetscInt    *localPointsA,*localPointsB;
1989   PetscInt          *localPointsBA;
1990   PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
1991   PetscBool         denseB;
1992 
1993   PetscFunctionBegin;
1994   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
1995   PetscSFCheckGraphSet(sfA,1);
1996   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
1997   PetscSFCheckGraphSet(sfB,2);
1998   PetscCheckSameComm(sfA,1,sfB,2);
1999   PetscValidPointer(sfBA,3);
2000   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
2001   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
2002 
2003   ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
2004   ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
2005   if (localPointsA) {
2006     ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
2007     for (i=0; i<numRootsB; i++) {
2008       reorderedRemotePointsA[i].rank = -1;
2009       reorderedRemotePointsA[i].index = -1;
2010     }
2011     for (i=0; i<numLeavesA; i++) {
2012       if (localPointsA[i] >= numRootsB) continue;
2013       reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
2014     }
2015     remotePointsA = reorderedRemotePointsA;
2016   }
2017   ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
2018   ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
2019   ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
2020   ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
2021   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
2022 
2023   denseB = (PetscBool)!localPointsB;
2024   for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
2025     if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
2026     else numLeavesBA++;
2027   }
2028   if (denseB) {
2029     localPointsBA  = NULL;
2030     remotePointsBA = leafdataB;
2031   } else {
2032     ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
2033     ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
2034     for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
2035       const PetscInt l = localPointsB ? localPointsB[i] : i;
2036 
2037       if (leafdataB[l-minleaf].rank == -1) continue;
2038       remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
2039       localPointsBA[numLeavesBA] = l;
2040       numLeavesBA++;
2041     }
2042     ierr = PetscFree(leafdataB);CHKERRQ(ierr);
2043   }
2044   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
2045   ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
2046   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
2047   PetscFunctionReturn(0);
2048 }
2049 
2050 /*@
2051   PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one
2052 
2053   Input Parameters:
2054 + sfA - The first PetscSF
2055 - sfB - The second PetscSF
2056 
2057   Output Parameters:
2058 . sfBA - The composite SF.
2059 
2060   Level: developer
2061 
2062   Notes:
2063   Currently, the two SFs must be defined on congruent communicators and they must be true star
2064   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
2065   second SF must have a degree of 1, i.e., no roots have more than one leaf connected.
2066 
2067   sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
2068   a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
2069   roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
2070   a Reduce on sfB, on connected roots.
2071 
2072 .seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
2073 @*/
2074 PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
2075 {
2076   PetscErrorCode    ierr;
2077   const PetscSFNode *remotePointsA,*remotePointsB;
2078   PetscSFNode       *remotePointsBA;
2079   const PetscInt    *localPointsA,*localPointsB;
2080   PetscSFNode       *reorderedRemotePointsA = NULL;
2081   PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
2082   MPI_Op            op;
2083 #if defined(PETSC_USE_64BIT_INDICES)
2084   PetscBool         iswin;
2085 #endif
2086 
2087   PetscFunctionBegin;
2088   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
2089   PetscSFCheckGraphSet(sfA,1);
2090   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
2091   PetscSFCheckGraphSet(sfB,2);
2092   PetscCheckSameComm(sfA,1,sfB,2);
2093   PetscValidPointer(sfBA,3);
2094   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
2095   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
2096 
2097   ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
2098   ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);
2099 
2100   /* TODO: Check roots of sfB have degree of 1 */
2101   /* Once we implement it, we can replace the MPI_MAXLOC
2102      with MPIU_REPLACE. In that case, MPI_MAXLOC and MPIU_REPLACE have the same effect.
2103      We use MPI_MAXLOC only to have a deterministic output from this routine if
2104      the root condition is not meet.
2105    */
2106   op = MPI_MAXLOC;
2107 #if defined(PETSC_USE_64BIT_INDICES)
2108   /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
2109   ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
2110   if (iswin) op = MPIU_REPLACE;
2111 #endif
2112 
2113   ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
2114   ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
2115   for (i=0; i<maxleaf - minleaf + 1; i++) {
2116     reorderedRemotePointsA[i].rank = -1;
2117     reorderedRemotePointsA[i].index = -1;
2118   }
2119   if (localPointsA) {
2120     for (i=0; i<numLeavesA; i++) {
2121       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
2122       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
2123     }
2124   } else {
2125     for (i=0; i<numLeavesA; i++) {
2126       if (i > maxleaf || i < minleaf) continue;
2127       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
2128     }
2129   }
2130 
2131   ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
2132   ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
2133   for (i=0; i<numRootsB; i++) {
2134     remotePointsBA[i].rank = -1;
2135     remotePointsBA[i].index = -1;
2136   }
2137 
2138   ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
2139   ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
2140   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
2141   for (i=0,numLeavesBA=0; i<numRootsB; i++) {
2142     if (remotePointsBA[i].rank == -1) continue;
2143     remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
2144     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
2145     localPointsBA[numLeavesBA] = i;
2146     numLeavesBA++;
2147   }
2148   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
2149   ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
2150   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
2151   PetscFunctionReturn(0);
2152 }
2153 
2154 /*
2155   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
2156 
2157   Input Parameters:
2158 . sf - The global PetscSF
2159 
2160   Output Parameters:
2161 . out - The local PetscSF
2162  */
2163 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
2164 {
2165   MPI_Comm           comm;
2166   PetscMPIInt        myrank;
2167   const PetscInt     *ilocal;
2168   const PetscSFNode  *iremote;
2169   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
2170   PetscSFNode        *liremote;
2171   PetscSF            lsf;
2172   PetscErrorCode     ierr;
2173 
2174   PetscFunctionBegin;
2175   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2176   if (sf->ops->CreateLocalSF) {
2177     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
2178   } else {
2179     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2180     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
2181     ierr = MPI_Comm_rank(comm,&myrank);CHKERRMPI(ierr);
2182 
2183     /* Find out local edges and build a local SF */
2184     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
2185     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
2186     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
2187     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
2188 
2189     for (i=j=0; i<nleaves; i++) {
2190       if (iremote[i].rank == (PetscInt)myrank) {
2191         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2192         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
2193         liremote[j].index = iremote[i].index;
2194         j++;
2195       }
2196     }
2197     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
2198     ierr = PetscSFSetFromOptions(lsf);CHKERRQ(ierr);
2199     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
2200     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
2201     *out = lsf;
2202   }
2203   PetscFunctionReturn(0);
2204 }
2205 
2206 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2207 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
2208 {
2209   PetscErrorCode     ierr;
2210   PetscMemType       rootmtype,leafmtype;
2211 
2212   PetscFunctionBegin;
2213   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2214   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
2215   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2216   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
2217   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
2218   if (sf->ops->BcastToZero) {
2219     ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
2220   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
2221   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2222   PetscFunctionReturn(0);
2223 }
2224 
2225