xref: /petsc/src/vec/is/sf/interface/sf.c (revision 84ff8c8b54fd7c9cb88641c01dfe6357ec5f72d0) !
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petsc/private/viewerimpl.h>
4 #include <petscctable.h>
5 
6 #if defined(PETSC_HAVE_CUDA)
7   #include <cuda_runtime.h>
8 #endif
9 
10 #if defined(PETSC_HAVE_HIP)
11   #include <hip/hip_runtime.h>
12 #endif
13 
/* Debug-only guard: error out if PetscSFSetGraph()/PetscSFSetGraphWithPattern() has not been
   called on argument number 'arg' (named 'sf') before the calling function runs; no-op in
   optimized builds */
#if defined(PETSC_USE_DEBUG)
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif
22 
23 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",NULL};
24 
/* Classify a pointer as host or device memory.

   Output defaults to PETSC_MEMTYPE_HOST; it is upgraded to PETSC_MEMTYPE_DEVICE only when a
   pointer-attribute query against an initialized CUDA or HIP runtime positively identifies
   'data' as device memory. A NULL 'data' is always reported as host. */
PETSC_STATIC_INLINE PetscErrorCode PetscGetMemType(const void *data,PetscMemType *type)
{
  PetscFunctionBegin;
  PetscValidPointer(type,2);
  *type = PETSC_MEMTYPE_HOST;
#if defined(PETSC_HAVE_CUDA)
  if (PetscDeviceInitialized(PETSC_DEVICE_CUDA) && data) {
    cudaError_t                  cerr;
    struct cudaPointerAttributes attr;
    enum cudaMemoryType          mtype;
    cerr = cudaPointerGetAttributes(&attr,data); /* Do not check error since before CUDA 11.0, passing a host pointer returns cudaErrorInvalidValue */
    cudaGetLastError(); /* Reset the last error */
    #if (CUDART_VERSION < 10000)
      mtype = attr.memoryType; /* pre-CUDA-10 name of the attribute field */
    #else
      mtype = attr.type;
    #endif
    if (cerr == cudaSuccess && mtype == cudaMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif

#if defined(PETSC_HAVE_HIP)
  if (PetscDeviceInitialized(PETSC_DEVICE_HIP) && data) {
    hipError_t                   cerr;
    struct hipPointerAttribute_t attr;
    enum hipMemoryType           mtype;
    cerr = hipPointerGetAttributes(&attr,data); /* Error deliberately not checked; a failed query leaves *type as host */
    hipGetLastError(); /* Reset the last error */
    mtype = attr.memoryType;
    if (cerr == hipSuccess && mtype == hipMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif
  PetscFunctionReturn(0);
}
59 
60 /*@
61    PetscSFCreate - create a star forest communication context
62 
63    Collective
64 
65    Input Parameter:
66 .  comm - communicator on which the star forest will operate
67 
68    Output Parameter:
69 .  sf - new star forest context
70 
71    Options Database Keys:
72 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
73 .  -sf_type window    -Use MPI-3 one-sided window for communication
74 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
75 
76    Level: intermediate
77 
78    Notes:
79    When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
80    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
81    SFs are optimized and they have better performance than general SFs.
82 
83 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
84 @*/
85 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
86 {
87   PetscErrorCode ierr;
88   PetscSF        b;
89 
90   PetscFunctionBegin;
91   PetscValidPointer(sf,2);
92   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
93 
94   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
95 
96   b->nroots    = -1;
97   b->nleaves   = -1;
98   b->minleaf   = PETSC_MAX_INT;
99   b->maxleaf   = PETSC_MIN_INT;
100   b->nranks    = -1;
101   b->rankorder = PETSC_TRUE;
102   b->ingroup   = MPI_GROUP_NULL;
103   b->outgroup  = MPI_GROUP_NULL;
104   b->graphset  = PETSC_FALSE;
105 #if defined(PETSC_HAVE_DEVICE)
106   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
107   b->use_stream_aware_mpi = PETSC_FALSE;
108   b->unknown_input_stream= PETSC_FALSE;
109   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
110     b->backend = PETSCSF_BACKEND_KOKKOS;
111   #elif defined(PETSC_HAVE_CUDA)
112     b->backend = PETSCSF_BACKEND_CUDA;
113   #elif defined(PETSC_HAVE_HIP)
114     b->backend = PETSCSF_BACKEND_HIP;
115   #endif
116 
117   #if defined(PETSC_HAVE_NVSHMEM)
118     b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
119     b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
120     ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&b->use_nvshmem,NULL);CHKERRQ(ierr);
121     ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem_get",&b->use_nvshmem_get,NULL);CHKERRQ(ierr);
122   #endif
123 #endif
124   b->vscat.from_n = -1;
125   b->vscat.to_n   = -1;
126   b->vscat.unit   = MPIU_SCALAR;
127  *sf = b;
128   PetscFunctionReturn(0);
129 }
130 
131 /*@
132    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
133 
134    Collective
135 
136    Input Parameter:
137 .  sf - star forest
138 
139    Level: advanced
140 
141 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
142 @*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the implementation release its own resources first */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Invalidate the graph: negative sizes and an empty leaf range (min > max) mark "not set" */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  /* Discard the per-rank communication tables */
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRMPI(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRMPI(ierr);}
  /* Break the multi-SF's back-reference to this SF before destroying it */
  if (sf->multi) sf->multi->multi = NULL;
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);

 #if defined(PETSC_HAVE_DEVICE)
  /* Release the device-side leaf index buffers */
  for (PetscInt i=0; i<2; i++) {ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,sf->rmine_d[i]);CHKERRQ(ierr);}
 #endif

  /* The SF must be set up again before its next use */
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
176 
177 /*@C
178    PetscSFSetType - Set the PetscSF communication implementation
179 
180    Collective on PetscSF
181 
182    Input Parameters:
183 +  sf - the PetscSF context
184 -  type - a known method
185 
186    Options Database Key:
187 .  -sf_type <type> - Sets the method; use -help for a list
188    of available methods (for instance, window, basic, neighbor)
189 
190    Notes:
191    See "include/petscsf.h" for available methods (for instance)
192 +    PETSCSFWINDOW - MPI-2/3 one-sided
193 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
194 
195   Level: intermediate
196 
197 .seealso: PetscSFType, PetscSFCreate()
198 @*/
199 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
200 {
201   PetscErrorCode ierr,(*r)(PetscSF);
202   PetscBool      match;
203 
204   PetscFunctionBegin;
205   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
206   PetscValidCharPointer(type,2);
207 
208   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
209   if (match) PetscFunctionReturn(0);
210 
211   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
212   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
213   /* Destroy the previous PetscSF implementation context */
214   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
215   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
216   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
217   ierr = (*r)(sf);CHKERRQ(ierr);
218   PetscFunctionReturn(0);
219 }
220 
221 /*@C
222   PetscSFGetType - Get the PetscSF communication implementation
223 
224   Not Collective
225 
226   Input Parameter:
227 . sf  - the PetscSF context
228 
229   Output Parameter:
230 . type - the PetscSF type name
231 
232   Level: intermediate
233 
234 .seealso: PetscSFSetType(), PetscSFCreate()
235 @*/
PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
  PetscValidPointer(type,2);
  /* The type name is stored on the base PetscObject header; it may still be NULL if no type was set */
  *type = ((PetscObject)sf)->type_name;
  PetscFunctionReturn(0);
}
244 
245 /*@C
246    PetscSFDestroy - destroy star forest
247 
248    Collective
249 
250    Input Parameter:
251 .  sf - address of star forest
252 
253    Level: intermediate
254 
255 .seealso: PetscSFCreate(), PetscSFReset()
256 @*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0);
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Drop a reference; only the final reference performs the actual teardown */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  ierr = PetscSFDestroy(&(*sf)->vscat.lsf);CHKERRQ(ierr);
  /* bs > 1 implies vscat.unit is a derived datatype owned by this SF (see PetscSFDuplicate()); built-in units must not be freed */
  if ((*sf)->vscat.bs > 1) {ierr = MPI_Type_free(&(*sf)->vscat.unit);CHKERRMPI(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
272 
273 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
274 {
275   PetscInt           i, nleaves;
276   PetscMPIInt        size;
277   const PetscInt    *ilocal;
278   const PetscSFNode *iremote;
279   PetscErrorCode     ierr;
280 
281   PetscFunctionBegin;
282   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
283   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
284   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
285   for (i = 0; i < nleaves; i++) {
286     const PetscInt rank = iremote[i].rank;
287     const PetscInt remote = iremote[i].index;
288     const PetscInt leaf = ilocal ? ilocal[i] : i;
289     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
290     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
291     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
292   }
293   PetscFunctionReturn(0);
294 }
295 
296 /*@
297    PetscSFSetUp - set up communication structures
298 
299    Collective
300 
301    Input Parameter:
302 .  sf - star forest communication object
303 
304    Level: beginner
305 
306 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
307 @*/
308 PetscErrorCode PetscSFSetUp(PetscSF sf)
309 {
310   PetscErrorCode ierr;
311 
312   PetscFunctionBegin;
313   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
314   PetscSFCheckGraphSet(sf,1);
315   if (sf->setupcalled) PetscFunctionReturn(0);
316   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
317   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
318   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);} /* Zero all sf->ops */
319   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
320 #if defined(PETSC_HAVE_CUDA)
321   if (sf->backend == PETSCSF_BACKEND_CUDA) {
322     sf->ops->Malloc = PetscSFMalloc_CUDA;
323     sf->ops->Free   = PetscSFFree_CUDA;
324   }
325 #endif
326 #if defined(PETSC_HAVE_HIP)
327   if (sf->backend == PETSCSF_BACKEND_HIP) {
328     sf->ops->Malloc = PetscSFMalloc_HIP;
329     sf->ops->Free   = PetscSFFree_HIP;
330   }
331 #endif
332 
333 #
334 #if defined(PETSC_HAVE_KOKKOS)
335   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
336     sf->ops->Malloc = PetscSFMalloc_Kokkos;
337     sf->ops->Free   = PetscSFFree_Kokkos;
338   }
339 #endif
340   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
341   sf->setupcalled = PETSC_TRUE;
342   PetscFunctionReturn(0);
343 }
344 
345 /*@
346    PetscSFSetFromOptions - set PetscSF options using the options database
347 
348    Logically Collective
349 
350    Input Parameter:
351 .  sf - star forest
352 
353    Options Database Keys:
354 +  -sf_type               - implementation type, see PetscSFSetType()
355 .  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
.  -sf_unknown_input_stream - Assume SF root/leafdata is computed on arbitrary streams unknown to SF, so SF must take care of
                              stream synchronization itself (default: false). If false, this option only makes sense together
                              with -use_gpu_aware_mpi 1.
359 .  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
360                                If true, this option only works with -use_gpu_aware_mpi 1.
361 
362 -  -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently SF has these backends: cuda, hip and Kokkos.
363                               On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
364                               the only available is kokkos.
365 
366    Level: intermediate
367 @*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Default to the SF's current type if one is set, otherwise to PETSCSFBASIC */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
 #if defined(PETSC_HAVE_DEVICE)
  {
    char        backendstr[32] = {0};
    PetscBool   isCuda = PETSC_FALSE,isHip = PETSC_FALSE,isKokkos = PETSC_FALSE,set;
    /* Change the defaults set in PetscSFCreate() with command line options */
    ierr = PetscOptionsBool("-sf_unknown_input_stream","SF root/leafdata is computed on arbitary streams unknown to SF","PetscSFSetFromOptions",sf->unknown_input_stream,&sf->unknown_input_stream,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-sf_use_stream_aware_mpi","Assume the underlying MPI is cuda-stream aware","PetscSFSetFromOptions",sf->use_stream_aware_mpi,&sf->use_stream_aware_mpi,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsString("-sf_backend","Select the device backend SF uses","PetscSFSetFromOptions",NULL,backendstr,sizeof(backendstr),&set);CHKERRQ(ierr);
    /* Match the requested backend name case-insensitively */
    ierr = PetscStrcasecmp("cuda",backendstr,&isCuda);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("kokkos",backendstr,&isKokkos);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("hip",backendstr,&isHip);CHKERRQ(ierr);
  #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
    if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
    else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
    else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
    else if (set) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
  #elif defined(PETSC_HAVE_KOKKOS)
    /* Kokkos is the only device backend available in this configuration */
    if (set && !isKokkos) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You can only choose kokkos", backendstr);
   #endif
  }
 #endif
  /* PetscOptionsObject is declared by the PetscObjectOptionsBegin() macro above */
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
407 
408 /*@
409    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
410 
411    Logically Collective
412 
413    Input Parameters:
414 +  sf - star forest
415 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
416 
417    Level: advanced
418 
419 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
420 @*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveBool(sf,flg,2);
  /* The multi-SF is built lazily using this ordering choice; once it exists the choice is baked in */
  if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(0);
}
430 
431 /*@
432    PetscSFSetGraph - Set a parallel star forest
433 
434    Collective
435 
436    Input Parameters:
437 +  sf - star forest
438 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
439 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
440 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
441 during setup in debug mode)
442 .  localmode - copy mode for ilocal
443 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
444 during setup in debug mode)
445 -  remotemode - copy mode for iremote
446 
447    Level: intermediate
448 
449    Notes:
450     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
451 
452    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
453    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
454    needed
455 
456    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
457    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
458 
459 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
460 @*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    /* Compute the leaf range and detect the identity permutation, which encodes contiguous storage */
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* Discard an identity ilocal (NULL means contiguous leaves); free it if ownership was transferred */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* NULL ilocal: leaves occupy [0,nleaves) contiguously */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  /* Acquire ilocal according to localmode: deep-copy, take ownership, or merely reference the caller's array */
  if (ilocal) {
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      /* mine_alloc stays NULL so PetscSFReset() will not free the caller's array */
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Acquire iremote the same way, according to remotemode */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
544 
545 /*@
546   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
547 
548   Collective
549 
550   Input Parameters:
551 + sf      - The PetscSF
552 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
553 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
554 
555   Notes:
556   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
557   n and N are local and global sizes of x respectively.
558 
559   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
560   sequential vectors y on all ranks.
561 
562   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
563   sequential vector y on rank 0.
564 
565   In above cases, entries of x are roots and entries of y are leaves.
566 
567   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
568   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
569   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one cannot send multiple items. One just needs to create a new MPI datatype for the multiple data
571   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
572 
573   In this case, roots and leaves are symmetric.
574 
575   Level: intermediate
576  @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    type = PETSCSFALLTOALL;
    /* The caller's map is ignored; build a layout with 'size' roots per rank (global size size*size) */
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRMPI(ierr);
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      /* Uneven local sizes require the "v" (varying-count) variants */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    /* Every rank holds a leaf copy of all N roots */
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 holds leaves; other ranks have an empty leaf range */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* Each rank has one leaf and one root per rank */
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
642 
643 /*@
644    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
645 
646    Collective
647 
648    Input Parameter:
649 .  sf - star forest to invert
650 
651    Output Parameter:
652 .  isf - inverse of sf
653 
654    Level: advanced
655 
656    Notes:
657    All roots must have degree 1.
658 
659    The local space may be a permutation, but cannot be sparse.
660 
661 .seealso: PetscSFSetGraph()
662 @*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Each leaf slot advertises its own (rank, local index) */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* rank == -1 marks roots that receive nothing, i.e. roots with degree 0 */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  /* Push each leaf's identity to its root; with all roots of degree 1 (precondition), MPI_REPLACE
     leaves each connected root holding exactly its unique leaf's (rank,index) */
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPI_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* dense: contiguous leaf storage in the inverse */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* In the inverse SF the old leaf space (size maxlocal) becomes the root space;
     newilocal ownership transfers to the new SF (PETSC_OWN_POINTER) */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
712 
713 /*@
714    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
715 
716    Collective
717 
718    Input Parameters:
719 +  sf - communication object to duplicate
720 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
721 
722    Output Parameter:
723 .  newsf - new communication object
724 
725    Level: beginner
726 
727 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
728 @*/
PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
{
  PetscSFType    type;
  PetscErrorCode ierr;
  MPI_Datatype   dtype=MPIU_SCALAR;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,opt,2);
  PetscValidPointer(newsf,3);
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
  ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
  if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf,1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graphs are deep-copied */
      PetscInt          nroots,nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
      ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
    } else {
      /* Patterned graphs are rebuilt from the pattern and (referenced) layout */
      ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
    }
  }
  /* Since oldtype is committed, so is newtype, according to MPI */
  if (sf->vscat.bs > 1) {ierr = MPI_Type_dup(sf->vscat.unit,&dtype);CHKERRMPI(ierr);}
  (*newsf)->vscat.bs     = sf->vscat.bs;
  (*newsf)->vscat.unit   = dtype;
  (*newsf)->vscat.to_n   = sf->vscat.to_n;
  (*newsf)->vscat.from_n = sf->vscat.from_n;
  /* Do not copy lsf. Build it on demand since it is rarely used */

#if defined(PETSC_HAVE_DEVICE)
  /* Carry over the device configuration chosen on the original SF */
  (*newsf)->backend              = sf->backend;
  (*newsf)->unknown_input_stream= sf->unknown_input_stream;
  (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
  (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
#endif
  if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
  /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
  PetscFunctionReturn(0);
}
772 
773 /*@C
774    PetscSFGetGraph - Get the graph specifying a parallel star forest
775 
776    Not Collective
777 
778    Input Parameter:
779 .  sf - star forest
780 
781    Output Parameters:
782 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
783 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
784 .  ilocal - locations of leaves in leafdata buffers (if returned value is NULL, it means leaves are in contiguous storage)
785 -  iremote - remote locations of root vertices for each leaf on the current process
786 
787    Notes:
788    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
789 
790    When called from Fortran, the returned iremote array is a copy and must be deallocated after use. Consequently, if you
791    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
792 
793    Level: intermediate
794 
795 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
796 @*/
797 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
798 {
799   PetscErrorCode ierr;
800 
801   PetscFunctionBegin;
802   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
803   if (sf->ops->GetGraph) {
804     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
805   } else {
806     if (nroots) *nroots = sf->nroots;
807     if (nleaves) *nleaves = sf->nleaves;
808     if (ilocal) *ilocal = sf->mine;
809     if (iremote) *iremote = sf->remote;
810   }
811   PetscFunctionReturn(0);
812 }
813 
814 /*@
815    PetscSFGetLeafRange - Get the active leaf ranges
816 
817    Not Collective
818 
819    Input Parameter:
820 .  sf - star forest
821 
822    Output Parameters:
823 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
824 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
825 
826    Level: developer
827 
828 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
829 @*/
830 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
831 {
832   PetscFunctionBegin;
833   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
834   PetscSFCheckGraphSet(sf,1);
835   if (minleaf) *minleaf = sf->minleaf;
836   if (maxleaf) *maxleaf = sf->maxleaf;
837   PetscFunctionReturn(0);
838 }
839 
840 /*@C
841    PetscSFViewFromOptions - View from Options
842 
843    Collective on PetscSF
844 
845    Input Parameters:
846 +  A - the star forest
847 .  obj - Optional object
848 -  name - command line option
849 
850    Level: intermediate
851 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
852 @*/
853 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
854 {
855   PetscErrorCode ierr;
856 
857   PetscFunctionBegin;
858   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
859   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
860   PetscFunctionReturn(0);
861 }
862 
/*@C
   PetscSFView - view a star forest

   Collective

   Input Parameters:
+  sf - star forest
-  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD

   Level: beginner

.seealso: PetscSFCreate(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to the communicator's stdout viewer when none was supplied */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Set up first so rank/offset data (sf->ranks, sf->roffset, ...) is available below */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  /* Generic ASCII output; MATLAB format is left entirely to the type-specific viewer below.
     NOTE(review): reads viewer->format directly here but uses PetscViewerGetFormat() later */
  if (iascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* Without a graph there is nothing more to print */
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      /* Print every rank's leaves in rank order using synchronized output */
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        /* sf->mine == NULL means leaves are contiguous, so the leaf index is just i */
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        /* Print per-remote-rank edge lists, with ranks shown in sorted order via a permutation */
        PetscMPIInt *tmpranks,*perm;
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          /* roffset[i]..roffset[i+1] delimits this rank's slice of rmine/rremote */
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  /* Give the SF implementation a chance to print type-specific information */
  if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
933 
934 /*@C
935    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
936 
937    Not Collective
938 
939    Input Parameter:
940 .  sf - star forest
941 
942    Output Parameters:
943 +  nranks - number of ranks referenced by local part
944 .  ranks - array of ranks
945 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
946 .  rmine - concatenated array holding local indices referencing each remote rank
947 -  rremote - concatenated array holding remote indices referenced for each remote rank
948 
949    Level: developer
950 
951 .seealso: PetscSFGetLeafRanks()
952 @*/
953 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
954 {
955   PetscErrorCode ierr;
956 
957   PetscFunctionBegin;
958   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
959   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
960   if (sf->ops->GetRootRanks) {
961     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
962   } else {
963     /* The generic implementation */
964     if (nranks)  *nranks  = sf->nranks;
965     if (ranks)   *ranks   = sf->ranks;
966     if (roffset) *roffset = sf->roffset;
967     if (rmine)   *rmine   = sf->rmine;
968     if (rremote) *rremote = sf->rremote;
969   }
970   PetscFunctionReturn(0);
971 }
972 
973 /*@C
974    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
975 
976    Not Collective
977 
978    Input Parameter:
979 .  sf - star forest
980 
981    Output Parameters:
982 +  niranks - number of leaf ranks referencing roots on this process
983 .  iranks - array of ranks
984 .  ioffset - offset in irootloc for each rank (length niranks+1)
985 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
986 
987    Level: developer
988 
989 .seealso: PetscSFGetRootRanks()
990 @*/
991 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
992 {
993   PetscErrorCode ierr;
994 
995   PetscFunctionBegin;
996   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
997   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
998   if (sf->ops->GetLeafRanks) {
999     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
1000   } else {
1001     PetscSFType type;
1002     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
1003     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
1004   }
1005   PetscFunctionReturn(0);
1006 }
1007 
1008 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
1009   PetscInt i;
1010   for (i=0; i<n; i++) {
1011     if (needle == list[i]) return PETSC_TRUE;
1012   }
1013   return PETSC_FALSE;
1014 }
1015 
/*@C
   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.

   Collective

   Input Parameters:
+  sf - PetscSF to set up; PetscSFSetGraph() must have been called
-  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)

   Level: developer

.seealso: PetscSFGetRootRanks()
@*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
  /* Count the leaves referencing each remote rank; keys go in shifted by one and are
     converted back to 0-based when drained below */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  /* rmine/rremote are sized by nleaves: every leaf lands in exactly one rank's slice */
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    /* Translate the members of dgroup into rank numbers of sf's communicator (groupranks) */
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRMPI(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRMPI(ierr);}
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  /* Two-pointer partition: i scans backward for a distinguished rank, sf->ndranks scans
     forward for a non-distinguished one, then the pair is swapped (rcount moves with ranks) */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i;) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for (; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition separately so PetscFindMPIInt() below can binary-search within it */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Prefix-sum the per-rank counts into roffset; rcount is recycled as a per-rank fill cursor */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Scatter each leaf into its owning rank's slice of rmine/rremote */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1119 
1120 /*@C
1121    PetscSFGetGroups - gets incoming and outgoing process groups
1122 
1123    Collective
1124 
1125    Input Parameter:
1126 .  sf - star forest
1127 
1128    Output Parameters:
1129 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1130 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1131 
1132    Level: developer
1133 
1134 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
1135 @*/
1136 PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
1137 {
1138   PetscErrorCode ierr;
1139   MPI_Group      group = MPI_GROUP_NULL;
1140 
1141   PetscFunctionBegin;
1142   if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
1143   if (sf->ingroup == MPI_GROUP_NULL) {
1144     PetscInt       i;
1145     const PetscInt *indegree;
1146     PetscMPIInt    rank,*outranks,*inranks;
1147     PetscSFNode    *remote;
1148     PetscSF        bgcount;
1149 
1150     /* Compute the number of incoming ranks */
1151     ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
1152     for (i=0; i<sf->nranks; i++) {
1153       remote[i].rank  = sf->ranks[i];
1154       remote[i].index = 0;
1155     }
1156     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
1157     ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1158     ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
1159     ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
1160     /* Enumerate the incoming ranks */
1161     ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
1162     ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
1163     for (i=0; i<sf->nranks; i++) outranks[i] = rank;
1164     ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
1165     ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
1166     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
1167     ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRMPI(ierr);
1168     ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
1169     ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
1170     ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
1171   }
1172   *incoming = sf->ingroup;
1173 
1174   if (sf->outgroup == MPI_GROUP_NULL) {
1175     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
1176     ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRMPI(ierr);
1177     ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
1178   }
1179   *outgoing = sf->outgroup;
1180   PetscFunctionReturn(0);
1181 }
1182 
1183 /*@
1184    PetscSFGetMultiSF - gets the inner SF implementing gathers and scatters
1185 
1186    Collective
1187 
1188    Input Parameter:
1189 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1190 
1191    Output Parameter:
1192 .  multi - star forest with split roots, such that each root has degree exactly 1
1193 
1194    Level: developer
1195 
1196    Notes:
1197 
1198    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1199    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1200    edge, it is a candidate for future optimization that might involve its removal.
1201 
1202 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1203 @*/
1204 PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
1205 {
1206   PetscErrorCode ierr;
1207 
1208   PetscFunctionBegin;
1209   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1210   PetscValidPointer(multi,2);
1211   if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
1212     ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1213     *multi = sf->multi;
1214     sf->multi->multi = sf->multi;
1215     PetscFunctionReturn(0);
1216   }
1217   if (!sf->multi) {
1218     const PetscInt *indegree;
1219     PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
1220     PetscSFNode    *remote;
1221     maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
1222     ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
1223     ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
1224     ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
1225     inoffset[0] = 0;
1226     for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
1227     for (i=0; i<maxlocal; i++) outones[i] = 1;
1228     ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1229     ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1230     for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
1231     if (PetscDefined(USE_DEBUG)) { /* Check that the expected number of increments occurred */
1232       for (i=0; i<sf->nroots; i++) {
1233         if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
1234       }
1235     }
1236     ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
1237     for (i=0; i<sf->nleaves; i++) {
1238       remote[i].rank  = sf->remote[i].rank;
1239       remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
1240     }
1241     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1242     sf->multi->multi = sf->multi;
1243     ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1244     if (sf->rankorder) {        /* Sort the ranks */
1245       PetscMPIInt rank;
1246       PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
1247       PetscSFNode *newremote;
1248       ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
1249       for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
1250       ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
1251       for (i=0; i<maxlocal; i++) outranks[i] = rank;
1252       ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPI_REPLACE);CHKERRQ(ierr);
1253       ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPI_REPLACE);CHKERRQ(ierr);
1254       /* Sort the incoming ranks at each vertex, build the inverse map */
1255       for (i=0; i<sf->nroots; i++) {
1256         PetscInt j;
1257         for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
1258         ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
1259         for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
1260       }
1261       ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset,MPI_REPLACE);CHKERRQ(ierr);
1262       ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset,MPI_REPLACE);CHKERRQ(ierr);
1263       ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
1264       for (i=0; i<sf->nleaves; i++) {
1265         newremote[i].rank  = sf->remote[i].rank;
1266         newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
1267       }
1268       ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1269       ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
1270     }
1271     ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
1272   }
1273   *multi = sf->multi;
1274   PetscFunctionReturn(0);
1275 }
1276 
1277 /*@C
1278    PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots, does not remap indices
1279 
1280    Collective
1281 
1282    Input Parameters:
1283 +  sf - original star forest
1284 .  nselected  - number of selected roots on this process
1285 -  selected   - indices of the selected roots on this process
1286 
1287    Output Parameter:
1288 .  esf - new star forest
1289 
1290    Level: advanced
1291 
1292    Note:
1293    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1294    be done by calling PetscSFGetGraph().
1295 
1296 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1297 @*/
1298 PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *esf)
1299 {
1300   PetscInt          i,j,n,nroots,nleaves,esf_nleaves,*new_ilocal,minleaf,maxleaf,maxlocal;
1301   const PetscInt    *ilocal;
1302   signed char       *rootdata,*leafdata,*leafmem;
1303   const PetscSFNode *iremote;
1304   PetscSFNode       *new_iremote;
1305   MPI_Comm          comm;
1306   PetscErrorCode    ierr;
1307 
1308   PetscFunctionBegin;
1309   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1310   PetscSFCheckGraphSet(sf,1);
1311   if (nselected) PetscValidPointer(selected,3);
1312   PetscValidPointer(esf,4);
1313 
1314   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1315   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1316   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1317   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1318 
1319   if (PetscDefined(USE_DEBUG)) {  /* Error out if selected[] has dups or  out of range indices */
1320     PetscBool dups;
1321     ierr = PetscCheckDupsInt(nselected,selected,&dups);CHKERRQ(ierr);
1322     if (dups) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"selected[] has dups");
1323     for (i=0; i<nselected; i++)
1324       if (selected[i] < 0 || selected[i] >= nroots) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"selected root indice %D is out of [0,%D)",selected[i],nroots);
1325   }
1326 
1327   if (sf->ops->CreateEmbeddedRootSF) {
1328     ierr = (*sf->ops->CreateEmbeddedRootSF)(sf,nselected,selected,esf);CHKERRQ(ierr);
1329   } else {
1330     /* A generic version of creating embedded sf */
1331     ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
1332     maxlocal = maxleaf - minleaf + 1;
1333     ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafmem);CHKERRQ(ierr);
1334     leafdata = leafmem - minleaf;
1335     /* Tag selected roots and bcast to leaves */
1336     for (i=0; i<nselected; i++) rootdata[selected[i]] = 1;
1337     ierr = PetscSFBcastBegin(sf,MPI_SIGNED_CHAR,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1338     ierr = PetscSFBcastEnd(sf,MPI_SIGNED_CHAR,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1339 
1340     /* Build esf with leaves that are still connected */
1341     esf_nleaves = 0;
1342     for (i=0; i<nleaves; i++) {
1343       j = ilocal ? ilocal[i] : i;
1344       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1345          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1346       */
1347       esf_nleaves += (leafdata[j] ? 1 : 0);
1348     }
1349     ierr = PetscMalloc1(esf_nleaves,&new_ilocal);CHKERRQ(ierr);
1350     ierr = PetscMalloc1(esf_nleaves,&new_iremote);CHKERRQ(ierr);
1351     for (i=n=0; i<nleaves; i++) {
1352       j = ilocal ? ilocal[i] : i;
1353       if (leafdata[j]) {
1354         new_ilocal[n]        = j;
1355         new_iremote[n].rank  = iremote[i].rank;
1356         new_iremote[n].index = iremote[i].index;
1357         ++n;
1358       }
1359     }
1360     ierr = PetscSFCreate(comm,esf);CHKERRQ(ierr);
1361     ierr = PetscSFSetFromOptions(*esf);CHKERRQ(ierr);
1362     ierr = PetscSFSetGraph(*esf,nroots,esf_nleaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1363     ierr = PetscFree2(rootdata,leafmem);CHKERRQ(ierr);
1364   }
1365   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1366   PetscFunctionReturn(0);
1367 }
1368 
1369 /*@C
1370   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1371 
1372   Collective
1373 
1374   Input Parameters:
1375 + sf - original star forest
1376 . nselected  - number of selected leaves on this process
1377 - selected   - indices of the selected leaves on this process
1378 
1379   Output Parameter:
1380 .  newsf - new star forest
1381 
1382   Level: advanced
1383 
1384 .seealso: PetscSFCreateEmbeddedRootSF(), PetscSFSetGraph(), PetscSFGetGraph()
1385 @*/
1386 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1387 {
1388   const PetscSFNode *iremote;
1389   PetscSFNode       *new_iremote;
1390   const PetscInt    *ilocal;
1391   PetscInt          i,nroots,*leaves,*new_ilocal;
1392   MPI_Comm          comm;
1393   PetscErrorCode    ierr;
1394 
1395   PetscFunctionBegin;
1396   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1397   PetscSFCheckGraphSet(sf,1);
1398   if (nselected) PetscValidPointer(selected,3);
1399   PetscValidPointer(newsf,4);
1400 
1401   /* Uniq selected[] and put results in leaves[] */
1402   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1403   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1404   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1405   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1406   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1407 
1408   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1409   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1410     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1411   } else {
1412     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1413     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1414     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1415     for (i=0; i<nselected; ++i) {
1416       const PetscInt l     = leaves[i];
1417       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1418       new_iremote[i].rank  = iremote[l].rank;
1419       new_iremote[i].index = iremote[l].index;
1420     }
1421     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1422     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1423   }
1424   ierr = PetscFree(leaves);CHKERRQ(ierr);
1425   PetscFunctionReturn(0);
1426 }
1427 
1428 /*@C
1429    PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastEnd()
1430 
1431    Collective on PetscSF
1432 
1433    Input Parameters:
1434 +  sf - star forest on which to communicate
1435 .  unit - data type associated with each node
1436 .  rootdata - buffer to broadcast
1437 -  op - operation to use for reduction
1438 
1439    Output Parameter:
1440 .  leafdata - buffer to be reduced with values from each leaf's respective root
1441 
1442    Level: intermediate
1443 
1444    Notes:
1445     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1446     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1447     use PetscSFBcastWithMemTypeBegin() instead.
1448 .seealso: PetscSFBcastEnd(), PetscSFBcastWithMemTypeBegin()
1449 @*/
1450 PetscErrorCode PetscSFBcastBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1451 {
1452   PetscErrorCode ierr;
1453   PetscMemType   rootmtype,leafmtype;
1454 
1455   PetscFunctionBegin;
1456   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1457   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1458   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
1459   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1460   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1461   ierr = (*sf->ops->BcastBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1462   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
1463   PetscFunctionReturn(0);
1464 }
1465 
/*@C
   PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call to PetscSFBcastEnd()

   Collective on PetscSF

   Input Parameters:
+  sf - star forest on which to communicate
.  unit - data type associated with each node
.  rootmtype - memory type of rootdata
.  rootdata - buffer to broadcast
.  leafmtype - memory type of leafdata
-  op - operation to use for reduction

   Output Parameter:
.  leafdata - buffer to be reduced with values from each leaf's respective root

   Notes:
   Unlike PetscSFBcastBegin(), the caller supplies the memory types, so no host/device
   pointer detection is performed here.

   Level: intermediate

.seealso: PetscSFBcastEnd(), PetscSFBcastBegin()
@*/
PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Skip event logging when driven by a VecScatter (sf->vscat.logging set by that layer) */
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
  ierr = (*sf->ops->BcastBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1498 
1499 /*@C
1500    PetscSFBcastEnd - end a broadcast & reduce operation started with PetscSFBcastBegin()
1501 
1502    Collective
1503 
1504    Input Parameters:
1505 +  sf - star forest
1506 .  unit - data type
1507 .  rootdata - buffer to broadcast
1508 -  op - operation to use for reduction
1509 
1510    Output Parameter:
1511 .  leafdata - buffer to be reduced with values from each leaf's respective root
1512 
1513    Level: intermediate
1514 
1515 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1516 @*/
1517 PetscErrorCode PetscSFBcastEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1518 {
1519   PetscErrorCode ierr;
1520 
1521   PetscFunctionBegin;
1522   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1523   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastEnd,sf,0,0,0);CHKERRQ(ierr);}
1524   ierr = (*sf->ops->BcastEnd)(sf,unit,rootdata,leafdata,op);CHKERRQ(ierr);
1525   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastEnd,sf,0,0,0);CHKERRQ(ierr);}
1526   PetscFunctionReturn(0);
1527 }
1528 
1529 /*@C
1530    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1531 
1532    Collective
1533 
1534    Input Parameters:
1535 +  sf - star forest
1536 .  unit - data type
1537 .  leafdata - values to reduce
1538 -  op - reduction operation
1539 
1540    Output Parameter:
1541 .  rootdata - result of reduction of values from all leaves of each root
1542 
1543    Level: intermediate
1544 
1545    Notes:
1546     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1547     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1548     use PetscSFReduceWithMemTypeBegin() instead.
1549 
1550 .seealso: PetscSFBcastBegin(), PetscSFReduceWithMemTypeBegin()
1551 @*/
1552 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1553 {
1554   PetscErrorCode ierr;
1555   PetscMemType   rootmtype,leafmtype;
1556 
1557   PetscFunctionBegin;
1558   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1559   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1560   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1561   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1562   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1563   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1564   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1565   PetscFunctionReturn(0);
1566 }
1567 
1568 /*@C
1569    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to PetscSFReduceEnd()
1570 
1571    Collective
1572 
1573    Input Parameters:
1574 +  sf - star forest
1575 .  unit - data type
1576 .  leafmtype - memory type of leafdata
1577 .  leafdata - values to reduce
1578 .  rootmtype - memory type of rootdata
1579 -  op - reduction operation
1580 
1581    Output Parameter:
1582 .  rootdata - result of reduction of values from all leaves of each root
1583 
1584    Level: intermediate
1585 
1586 .seealso: PetscSFBcastBegin(), PetscSFReduceBegin()
1587 @*/
1588 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
1589 {
1590   PetscErrorCode ierr;
1591 
1592   PetscFunctionBegin;
1593   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1594   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1595   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1596   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1597   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1598   PetscFunctionReturn(0);
1599 }
1600 
1601 /*@C
1602    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1603 
1604    Collective
1605 
1606    Input Parameters:
1607 +  sf - star forest
1608 .  unit - data type
1609 .  leafdata - values to reduce
1610 -  op - reduction operation
1611 
1612    Output Parameter:
1613 .  rootdata - result of reduction of values from all leaves of each root
1614 
1615    Level: intermediate
1616 
1617 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1618 @*/
1619 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1620 {
1621   PetscErrorCode ierr;
1622 
1623   PetscFunctionBegin;
1624   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1625   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
1626   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
1627   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
1628   PetscFunctionReturn(0);
1629 }
1630 
1631 /*@C
1632    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1633 
1634    Collective
1635 
1636    Input Parameters:
1637 +  sf - star forest
1638 .  unit - data type
1639 .  leafdata - leaf values to use in reduction
1640 -  op - operation to use for reduction
1641 
1642    Output Parameters:
1643 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1644 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1645 
1646    Level: advanced
1647 
1648    Note:
1649    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1650    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1651    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1652    integers.
1653 
1654 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1655 @*/
1656 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1657 {
1658   PetscErrorCode ierr;
1659   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1660 
1661   PetscFunctionBegin;
1662   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1663   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1664   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1665   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1666   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1667   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1668   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1669   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1670   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1671   PetscFunctionReturn(0);
1672 }
1673 
1674 /*@C
1675    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1676 
1677    Collective
1678 
1679    Input Parameters:
1680 +  sf - star forest
1681 .  unit - data type
1682 .  leafdata - leaf values to use in reduction
1683 -  op - operation to use for reduction
1684 
1685    Output Parameters:
1686 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1687 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1688 
1689    Level: advanced
1690 
1691 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1692 @*/
1693 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1694 {
1695   PetscErrorCode ierr;
1696 
1697   PetscFunctionBegin;
1698   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1699   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1700   ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootdata,leafdata,leafupdate,op);CHKERRQ(ierr);
1701   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1702   PetscFunctionReturn(0);
1703 }
1704 
/*@C
   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()

   Collective

   Input Parameter:
.  sf - star forest

   Output Parameter:
.  degree - degree of each root vertex

   Level: advanced

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal;
    /* sf->degree non-NULL while degreeknown is false means a Begin is already in flight */
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    maxlocal = sf->maxleaf-sf->minleaf+1; /* size of the contiguous leaf index range */
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    /* Sum a 1 from every leaf into its root; degreetmp is indexed from minleaf, so shift the
       pointer so that leaf index l maps to degreetmp[l-minleaf] */
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  /* Result is not available until PetscSFComputeDegreeEnd(); hand back NULL for now */
  *degree = NULL;
  PetscFunctionReturn(0);
}
1744 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()

   Collective

   Input Parameter:
.  sf - star forest

   Output Parameter:
.  degree - degree of each root vertex

   Level: developer

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFComputeDegreeBegin()
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    /* degreetmp is allocated (never NULL) by a matching Begin call; see the PetscMax(maxlocal,1) there */
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    /* Same shifted pointer as in Begin: leaf l reads degreetmp[l-minleaf] */
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    sf->degreeknown = PETSC_TRUE; /* cached; subsequent Begin/End pairs return sf->degree directly */
  }
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1780 
1781 /*@C
1782    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1783    Each multi-root is assigned index of the corresponding original root.
1784 
1785    Collective
1786 
1787    Input Parameters:
1788 +  sf - star forest
1789 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1790 
1791    Output Parameters:
1792 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1793 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1794 
1795    Level: developer
1796 
1797    Notes:
1798    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1799 
1800 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1801 @*/
1802 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1803 {
1804   PetscSF             msf;
1805   PetscInt            i, j, k, nroots, nmroots;
1806   PetscErrorCode      ierr;
1807 
1808   PetscFunctionBegin;
1809   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1810   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1811   if (nroots) PetscValidIntPointer(degree,2);
1812   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1813   PetscValidPointer(multiRootsOrigNumbering,4);
1814   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1815   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1816   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1817   for (i=0,j=0,k=0; i<nroots; i++) {
1818     if (!degree[i]) continue;
1819     for (j=0; j<degree[i]; j++,k++) {
1820       (*multiRootsOrigNumbering)[k] = i;
1821     }
1822   }
1823   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1824   if (nMultiRoots) *nMultiRoots = nmroots;
1825   PetscFunctionReturn(0);
1826 }
1827 
1828 /*@C
1829    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1830 
1831    Collective
1832 
1833    Input Parameters:
1834 +  sf - star forest
1835 .  unit - data type
1836 -  leafdata - leaf data to gather to roots
1837 
1838    Output Parameter:
1839 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1840 
1841    Level: intermediate
1842 
1843 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1844 @*/
1845 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1846 {
1847   PetscErrorCode ierr;
1848   PetscSF        multi = NULL;
1849 
1850   PetscFunctionBegin;
1851   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1852   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1853   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1854   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPI_REPLACE);CHKERRQ(ierr);
1855   PetscFunctionReturn(0);
1856 }
1857 
1858 /*@C
1859    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1860 
1861    Collective
1862 
1863    Input Parameters:
1864 +  sf - star forest
1865 .  unit - data type
1866 -  leafdata - leaf data to gather to roots
1867 
1868    Output Parameter:
1869 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1870 
1871    Level: intermediate
1872 
1873 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1874 @*/
1875 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1876 {
1877   PetscErrorCode ierr;
1878   PetscSF        multi = NULL;
1879 
1880   PetscFunctionBegin;
1881   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1882   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1883   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPI_REPLACE);CHKERRQ(ierr);
1884   PetscFunctionReturn(0);
1885 }
1886 
1887 /*@C
1888    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1889 
1890    Collective
1891 
1892    Input Parameters:
1893 +  sf - star forest
1894 .  unit - data type
1895 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1896 
1897    Output Parameter:
1898 .  leafdata - leaf data to be update with personal data from each respective root
1899 
1900    Level: intermediate
1901 
1902 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1903 @*/
1904 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1905 {
1906   PetscErrorCode ierr;
1907   PetscSF        multi = NULL;
1908 
1909   PetscFunctionBegin;
1910   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1911   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1912   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1913   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1914   PetscFunctionReturn(0);
1915 }
1916 
1917 /*@C
1918    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1919 
1920    Collective
1921 
1922    Input Parameters:
1923 +  sf - star forest
1924 .  unit - data type
1925 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1926 
1927    Output Parameter:
1928 .  leafdata - leaf data to be update with personal data from each respective root
1929 
1930    Level: intermediate
1931 
1932 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1933 @*/
1934 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1935 {
1936   PetscErrorCode ierr;
1937   PetscSF        multi = NULL;
1938 
1939   PetscFunctionBegin;
1940   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1941   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1942   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1943   PetscFunctionReturn(0);
1944 }
1945 
1946 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1947 {
1948   PetscInt        i, n, nleaves;
1949   const PetscInt *ilocal = NULL;
1950   PetscHSetI      seen;
1951   PetscErrorCode  ierr;
1952 
1953   PetscFunctionBegin;
1954   if (PetscDefined(USE_DEBUG)) {
1955     ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1956     ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1957     for (i = 0; i < nleaves; i++) {
1958       const PetscInt leaf = ilocal ? ilocal[i] : i;
1959       ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1960     }
1961     ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1962     if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1963     ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1964   }
1965   PetscFunctionReturn(0);
1966 }
1967 
/*@
  PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots.

  sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
  nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
  Bcast on sfA, then a Bcast on sfB, on connected nodes.

.seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  if (localPointsA) {
    /* sfA's leaves are scattered; build a dense array indexed by sfB's root number so that
       sfB's root r holds the (rank,index) of the sfA root feeding it, or (-1,-1) if none does */
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue; /* sfA leaf outside sfB's root space: no path */
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA;
  }
  /* Push each sfB root's (rank,index)-of-sfA-root down to sfB's leaves; leafdataB is indexed
     from minleaf, hence the shifted pointer */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* denseB: every sfB leaf is connected and leaves are contiguous, so leafdataB can be reused verbatim */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    localPointsBA  = NULL;
    remotePointsBA = leafdataB; /* ownership of leafdataB transfers to the new SF below */
  } else {
    /* Compact: keep only connected leaves, recording their local indices */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  /* PETSC_OWN_POINTER: the new SF takes over localPointsBA and remotePointsBA (possibly leafdataB) */
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2057 
/*@
  PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF.

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
  second SF must have a degree of 1, i.e., no roots have more than one leaf connected.

  sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
  roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
  a Reduce on sfB, on connected roots.

.seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
@*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
  MPI_Op            op;
#if defined(PETSC_USE_64BIT_INDICES)
  PetscBool         iswin;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);

  /* TODO: Check roots of sfB have degree of 1 */
  /* Once we implement it, we can replace the MPI_MAXLOC
     with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
     We use MPI_MAXLOC only to have a deterministic output from this routine if
     the root condition is not meet.
   */
  op = MPI_MAXLOC;
#if defined(PETSC_USE_64BIT_INDICES)
  /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
  ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
  if (iswin) op = MPI_REPLACE;
#endif

  /* Lay out sfA's remote roots on sfB's contiguous leaf index range [minleaf,maxleaf];
     slots with no corresponding sfA leaf stay (-1,-1) */
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue; /* sfA leaf outside sfB's leaf range: no path */
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* localPointsA == NULL means sfA's leaves are contiguous 0..numLeavesA-1 */
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Pull each sfB leaf's sfA-root info up to sfB's roots; the shifted pointer lets the
     reduce index reorderedRemotePointsA with absolute leaf numbers */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place: keep only sfB roots that received a connection, recording their indices */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  /* PETSC_OWN_POINTER: the new SF takes over localPointsBA and remotePointsBA */
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2161 
2162 /*
2163   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
2164 
2165   Input Parameters:
2166 . sf - The global PetscSF
2167 
2168   Output Parameters:
2169 . out - The local PetscSF
2170  */
2171 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
2172 {
2173   MPI_Comm           comm;
2174   PetscMPIInt        myrank;
2175   const PetscInt     *ilocal;
2176   const PetscSFNode  *iremote;
2177   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
2178   PetscSFNode        *liremote;
2179   PetscSF            lsf;
2180   PetscErrorCode     ierr;
2181 
2182   PetscFunctionBegin;
2183   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2184   if (sf->ops->CreateLocalSF) {
2185     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
2186   } else {
2187     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2188     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
2189     ierr = MPI_Comm_rank(comm,&myrank);CHKERRMPI(ierr);
2190 
2191     /* Find out local edges and build a local SF */
2192     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
2193     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
2194     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
2195     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
2196 
2197     for (i=j=0; i<nleaves; i++) {
2198       if (iremote[i].rank == (PetscInt)myrank) {
2199         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2200         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
2201         liremote[j].index = iremote[i].index;
2202         j++;
2203       }
2204     }
2205     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
2206     ierr = PetscSFSetFromOptions(lsf);CHKERRQ(ierr);
2207     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
2208     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
2209     *out = lsf;
2210   }
2211   PetscFunctionReturn(0);
2212 }
2213 
2214 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2215 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
2216 {
2217   PetscErrorCode     ierr;
2218   PetscMemType       rootmtype,leafmtype;
2219 
2220   PetscFunctionBegin;
2221   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2222   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
2223   ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);
2224   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
2225   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
2226   if (sf->ops->BcastToZero) {
2227     ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
2228   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
2229   ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);
2230   PetscFunctionReturn(0);
2231 }
2232 
2233