xref: /petsc/src/vec/is/sf/interface/sf.c (revision 0db4d2e0165a0ea245cec0549c2de0bb7b39e2c0)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6   #include <cuda_runtime.h>
7 #endif
8 
9 #if defined(PETSC_HAVE_HIP)
10   #include <hip/hip_runtime.h>
11 #endif
12 
#if defined(PETSC_USE_DEBUG)
/* Error (debug builds only) if sf's graph has not been set with PetscSFSetGraph() or
   PetscSFSetGraphWithPattern(); arg is the 1-based position of sf in the caller's argument list. */
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif

/* String names matching the PetscSFDuplicateOption enum, for use with PetscOptionsEnum() and friends */
const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",NULL};
23 
/* PetscGetMemType - classify a pointer as host or device memory.

   Sets *type to PETSC_MEMTYPE_DEVICE only when a device runtime (CUDA or HIP) is
   initialized and that runtime recognizes data as device memory; in every other
   case (including data == NULL) *type is PETSC_MEMTYPE_HOST. */
PETSC_STATIC_INLINE PetscErrorCode PetscGetMemType(const void *data,PetscMemType *type)
{
  PetscFunctionBegin;
  PetscValidPointer(type,2);
  *type = PETSC_MEMTYPE_HOST; /* default unless a device runtime claims the pointer below */
#if defined(PETSC_HAVE_CUDA)
  if (PetscCUDAInitialized && data) {
    cudaError_t                  cerr;
    struct cudaPointerAttributes attr;
    enum cudaMemoryType          mtype;
    cerr = cudaPointerGetAttributes(&attr,data); /* Do not check error since before CUDA 11.0, passing a host pointer returns cudaErrorInvalidValue */
    cudaGetLastError(); /* Reset the last error */
    #if (CUDART_VERSION < 10000)
      mtype = attr.memoryType; /* the attribute field was renamed to 'type' in CUDA 10 */
    #else
      mtype = attr.type;
    #endif
    if (cerr == cudaSuccess && mtype == cudaMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif

#if defined(PETSC_HAVE_HIP)
  if (PetscHIPInitialized && data) {
    hipError_t                   cerr;
    struct hipPointerAttribute_t attr;
    enum hipMemoryType           mtype;
    cerr = hipPointerGetAttributes(&attr,data); /* error deliberately unchecked, mirroring the CUDA branch */
    hipGetLastError(); /* Reset the last error */
    mtype = attr.memoryType;
    if (cerr == hipSuccess && mtype == hipMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif
  PetscFunctionReturn(0);
}
58 
59 /*@
60    PetscSFCreate - create a star forest communication context
61 
62    Collective
63 
64    Input Arguments:
65 .  comm - communicator on which the star forest will operate
66 
67    Output Arguments:
68 .  sf - new star forest context
69 
70    Options Database Keys:
71 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
72 .  -sf_type window    -Use MPI-3 one-sided window for communication
73 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
74 
75    Level: intermediate
76 
77    Notes:
78    When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
79    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
80    SFs are optimized and they have better performance than general SFs.
81 
82 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
83 @*/
84 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
85 {
86   PetscErrorCode ierr;
87   PetscSF        b;
88 
89   PetscFunctionBegin;
90   PetscValidPointer(sf,2);
91   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
92 
93   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
94 
95   b->nroots    = -1;
96   b->nleaves   = -1;
97   b->minleaf   = PETSC_MAX_INT;
98   b->maxleaf   = PETSC_MIN_INT;
99   b->nranks    = -1;
100   b->rankorder = PETSC_TRUE;
101   b->ingroup   = MPI_GROUP_NULL;
102   b->outgroup  = MPI_GROUP_NULL;
103   b->graphset  = PETSC_FALSE;
104 #if defined(PETSC_HAVE_DEVICE)
105   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
106   b->use_stream_aware_mpi = PETSC_FALSE;
107   b->use_default_stream   = PETSC_TRUE; /* The assumption is true for PETSc internal use of SF */
108   /* Set the default */
109   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
110     b->backend = PETSCSF_BACKEND_KOKKOS;
111   #elif defined(PETSC_HAVE_CUDA)
112     b->backend = PETSCSF_BACKEND_CUDA;
113   #elif defined(PETSC_HAVE_HIP)
114     b->backend = PETSCSF_BACKEND_HIP;
115   #endif
116 #endif
117   b->vscat.from_n = -1;
118   b->vscat.to_n   = -1;
119   b->vscat.unit   = MPIU_SCALAR;
120  *sf = b;
121   PetscFunctionReturn(0);
122 }
123 
/*@
   PetscSFReset - Reset a star forest so that different sizes or neighbors can be used

   Collective

   Input Arguments:
.  sf - star forest

   Level: advanced

.seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
@*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the type-specific implementation release its data first */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Invalidate the graph: sizes go to -1 and the leaf range becomes empty */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
  /* Free the device copies of rmine */
  for (PetscInt i=0; i<2; i++) {ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,sf->rmine_d[i]);CHKERRQ(ierr);}
#endif
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRMPI(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRMPI(ierr);}
  /* multi and map are derived objects; drop them so they are rebuilt on demand */
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
166 
167 /*@C
168    PetscSFSetType - Set the PetscSF communication implementation
169 
170    Collective on PetscSF
171 
172    Input Parameters:
173 +  sf - the PetscSF context
174 -  type - a known method
175 
176    Options Database Key:
177 .  -sf_type <type> - Sets the method; use -help for a list
178    of available methods (for instance, window, basic, neighbor)
179 
180    Notes:
181    See "include/petscsf.h" for available methods (for instance)
182 +    PETSCSFWINDOW - MPI-2/3 one-sided
183 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
184 
185   Level: intermediate
186 
187 .seealso: PetscSFType, PetscSFCreate()
188 @*/
189 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
190 {
191   PetscErrorCode ierr,(*r)(PetscSF);
192   PetscBool      match;
193 
194   PetscFunctionBegin;
195   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
196   PetscValidCharPointer(type,2);
197 
198   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
199   if (match) PetscFunctionReturn(0);
200 
201   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
202   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
203   /* Destroy the previous PetscSF implementation context */
204   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
205   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
206   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
207   ierr = (*r)(sf);CHKERRQ(ierr);
208   PetscFunctionReturn(0);
209 }
210 
211 /*@C
212   PetscSFGetType - Get the PetscSF communication implementation
213 
214   Not Collective
215 
216   Input Parameter:
217 . sf  - the PetscSF context
218 
219   Output Parameter:
220 . type - the PetscSF type name
221 
222   Level: intermediate
223 
224 .seealso: PetscSFSetType(), PetscSFCreate()
225 @*/
226 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
227 {
228   PetscFunctionBegin;
229   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
230   PetscValidPointer(type,2);
231   *type = ((PetscObject)sf)->type_name;
232   PetscFunctionReturn(0);
233 }
234 
235 /*@
236    PetscSFDestroy - destroy star forest
237 
238    Collective
239 
240    Input Arguments:
241 .  sf - address of star forest
242 
243    Level: intermediate
244 
245 .seealso: PetscSFCreate(), PetscSFReset()
246 @*/
247 PetscErrorCode PetscSFDestroy(PetscSF *sf)
248 {
249   PetscErrorCode ierr;
250 
251   PetscFunctionBegin;
252   if (!*sf) PetscFunctionReturn(0);
253   PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
254   if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
255   ierr = PetscSFReset(*sf);CHKERRQ(ierr);
256   if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
257   ierr = PetscSFDestroy(&(*sf)->vscat.lsf);CHKERRQ(ierr);
258   if ((*sf)->vscat.bs > 1) {ierr = MPI_Type_free(&(*sf)->vscat.unit);CHKERRQ(ierr);}
259   ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
260   PetscFunctionReturn(0);
261 }
262 
263 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
264 {
265   PetscInt           i, nleaves;
266   PetscMPIInt        size;
267   const PetscInt    *ilocal;
268   const PetscSFNode *iremote;
269   PetscErrorCode     ierr;
270 
271   PetscFunctionBegin;
272   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
273   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
274   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
275   for (i = 0; i < nleaves; i++) {
276     const PetscInt rank = iremote[i].rank;
277     const PetscInt remote = iremote[i].index;
278     const PetscInt leaf = ilocal ? ilocal[i] : i;
279     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
280     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
281     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
282   }
283   PetscFunctionReturn(0);
284 }
285 
286 
287 
288 
289 /*@
290    PetscSFSetUp - set up communication structures
291 
292    Collective
293 
294    Input Arguments:
295 .  sf - star forest communication object
296 
297    Level: beginner
298 
299 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
300 @*/
301 PetscErrorCode PetscSFSetUp(PetscSF sf)
302 {
303   PetscErrorCode ierr;
304 
305   PetscFunctionBegin;
306   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
307   PetscSFCheckGraphSet(sf,1);
308   if (sf->setupcalled) PetscFunctionReturn(0);
309   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
310   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
311   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);} /* Zero all sf->ops */
312   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
313 #if defined(PETSC_HAVE_CUDA)
314   if (sf->backend == PETSCSF_BACKEND_CUDA) {
315     sf->ops->Malloc = PetscSFMalloc_Cuda;
316     sf->ops->Free   = PetscSFFree_Cuda;
317   }
318 #endif
319 #if defined(PETSC_HAVE_HIP)
320   if (sf->backend == PETSCSF_BACKEND_HIP) {
321     sf->ops->Malloc = PetscSFMalloc_HIP;
322     sf->ops->Free   = PetscSFFree_HIP;
323   }
324 #endif
325 
326 #
327 #if defined(PETSC_HAVE_KOKKOS)
328   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
329     sf->ops->Malloc = PetscSFMalloc_Kokkos;
330     sf->ops->Free   = PetscSFFree_Kokkos;
331   }
332 #endif
333   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
334   sf->setupcalled = PETSC_TRUE;
335   PetscFunctionReturn(0);
336 }
337 
/*@
   PetscSFSetFromOptions - set PetscSF options using the options database

   Logically Collective

   Input Arguments:
.  sf - star forest

   Options Database Keys:
+  -sf_type               - implementation type, see PetscSFSetType()
.  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
.  -sf_use_default_stream - Assume callers of SF computed the input root/leafdata with the default cuda stream. SF will also
                            use the default stream to process data. Therefore, no stream synchronization is needed between SF and its caller (default: true).
                            If true, this option only works with -use_gpu_aware_mpi 1.
.  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
                               If true, this option only works with -use_gpu_aware_mpi 1.
-  -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently SF has these backends: cuda, hip and Kokkos.
                              On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
                              the only available is kokkos.

   Level: intermediate
@*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Keep the current type as the default shown in -help; fall back to basic */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
  {
    char        backendstr[32] = {0};
    PetscBool   isCuda = PETSC_FALSE,isHip = PETSC_FALSE,isKokkos = PETSC_FALSE,set;
    /* Change the defaults set in PetscSFCreate() with command line options */
    ierr = PetscOptionsBool("-sf_use_default_stream","Assume SF's input and output root/leafdata is computed on the default stream","PetscSFSetFromOptions",sf->use_default_stream,&sf->use_default_stream,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-sf_use_stream_aware_mpi","Assume the underlying MPI is cuda-stream aware","PetscSFSetFromOptions",sf->use_stream_aware_mpi,&sf->use_stream_aware_mpi,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsString("-sf_backend","Select the device backend SF uses","PetscSFSetFromOptions",NULL,backendstr,sizeof(backendstr),&set);CHKERRQ(ierr);
    /* Case-insensitive match of the requested backend name */
    ierr = PetscStrcasecmp("cuda",backendstr,&isCuda);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("kokkos",backendstr,&isKokkos);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("hip",backendstr,&isHip);CHKERRQ(ierr);
  #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
    if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
    else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
    else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
    else if (set) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
  #elif defined(PETSC_HAVE_KOKKOS)
    /* Kokkos-only build: any explicit choice other than kokkos is an error */
    if (set && !isKokkos) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You can only choose kokkos", backendstr);
  #endif
  }
#endif
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
400 
401 /*@
402    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
403 
404    Logically Collective
405 
406    Input Arguments:
407 +  sf - star forest
408 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
409 
410    Level: advanced
411 
412 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
413 @*/
414 PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
415 {
416   PetscFunctionBegin;
417   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
418   PetscValidLogicalCollectiveBool(sf,flg,2);
419   if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
420   sf->rankorder = flg;
421   PetscFunctionReturn(0);
422 }
423 
/*@
   PetscSFSetGraph - Set a parallel star forest

   Collective

   Input Arguments:
+  sf - star forest
.  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
.  nleaves - number of leaf vertices on the current process, each of these references a root on any process
.  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
during setup in debug mode)
.  localmode - copy mode for ilocal
.  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
during setup in debug mode)
-  remotemode - copy mode for iremote

   Level: intermediate

   Notes:
    In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode

   Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
   encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
   needed

   Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
   indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).

.seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
@*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    /* Explicit leaf locations: compute the leaf range and detect the identity permutation */
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* Identity ilocal encodes contiguous storage; drop it, freeing it if ownership was transferred */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* Contiguous leaves: the range is [0,nleaves) (empty range when nleaves == 0) */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  if (ilocal) {
    /* Take over ilocal per localmode; mine_alloc tracks what we must free, mine what we read */
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same ownership handling for iremote */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
537 
/*@
  PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern

  Collective

  Input Parameters:
+ sf      - The PetscSF
. map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
- pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL

  Notes:
  It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
  n and N are local and global sizes of x respectively.

  With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
  sequential vectors y on all ranks.

  With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
  sequential vector y on rank 0.

  In above cases, entries of x are roots and entries of y are leaves.

  With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
  creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
  of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one can not send multiple items. One just needs to create a new MPI datatype for the mulptiple data
  items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.

  In this case, roots and leaves are symmetric.

  Level: intermediate
 @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* The user-provided map is ignored; build a layout with 'size' roots per rank */
    type = PETSCSFALLTOALL;
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves in a gather */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
635 
/*@
   PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map

   Collective

   Input Arguments:
.  sf - star forest to invert

   Output Arguments:
.  isf - inverse of sf

   Level: advanced

   Notes:
   All roots must have degree 1.

   The local space may be a permutation, but cannot be sparse.

.seealso: PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Each leaf slot advertises its own (rank,index) so a reduce tells every root who points at it */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* Roots start as (-1,-1); any root left at -1 after the reduce received no leaf */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL;
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* The inverse has maxlocal roots (old leaf space) and 'count' leaves (old roots that were hit) */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
705 
706 /*@
707    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
708 
709    Collective
710 
711    Input Arguments:
712 +  sf - communication object to duplicate
713 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
714 
715    Output Arguments:
716 .  newsf - new communication object
717 
718    Level: beginner
719 
720 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
721 @*/
722 PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
723 {
724   PetscSFType    type;
725   PetscErrorCode ierr;
726   MPI_Datatype   dtype=MPIU_SCALAR;
727 
728   PetscFunctionBegin;
729   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
730   PetscValidLogicalCollectiveEnum(sf,opt,2);
731   PetscValidPointer(newsf,3);
732   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
733   ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
734   if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
735   if (opt == PETSCSF_DUPLICATE_GRAPH) {
736     PetscSFCheckGraphSet(sf,1);
737     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
738       PetscInt          nroots,nleaves;
739       const PetscInt    *ilocal;
740       const PetscSFNode *iremote;
741       ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
742       ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
743     } else {
744       ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
745     }
746   }
747   /* Since oldtype is committed, so is newtype, according to MPI */
748   if (sf->vscat.bs > 1) {ierr = MPI_Type_dup(sf->vscat.unit,&dtype);CHKERRQ(ierr);}
749   (*newsf)->vscat.bs     = sf->vscat.bs;
750   (*newsf)->vscat.unit   = dtype;
751   (*newsf)->vscat.to_n   = sf->vscat.to_n;
752   (*newsf)->vscat.from_n = sf->vscat.from_n;
753   /* Do not copy lsf. Build it on demand since it is rarely used */
754 
755 #if defined(PETSC_HAVE_DEVICE)
756   (*newsf)->backend              = sf->backend;
757   (*newsf)->use_default_stream   = sf->use_default_stream;
758   (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
759   (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
760 #endif
761   if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
762   /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
763   PetscFunctionReturn(0);
764 }
765 
766 /*@C
767    PetscSFGetGraph - Get the graph specifying a parallel star forest
768 
769    Not Collective
770 
771    Input Arguments:
772 .  sf - star forest
773 
774    Output Arguments:
775 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
776 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
777 .  ilocal - locations of leaves in leafdata buffers
778 -  iremote - remote locations of root vertices for each leaf on the current process
779 
780    Notes:
781    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
782 
783    When called from Fortran, the returned iremote array is a copy and must be deallocated after use. Consequently, if you
784    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
785 
786    Level: intermediate
787 
788 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
789 @*/
790 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
791 {
792   PetscErrorCode ierr;
793 
794   PetscFunctionBegin;
795   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
796   if (sf->ops->GetGraph) {
797     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
798   } else {
799     if (nroots) *nroots = sf->nroots;
800     if (nleaves) *nleaves = sf->nleaves;
801     if (ilocal) *ilocal = sf->mine;
802     if (iremote) *iremote = sf->remote;
803   }
804   PetscFunctionReturn(0);
805 }
806 
807 /*@
808    PetscSFGetLeafRange - Get the active leaf ranges
809 
810    Not Collective
811 
812    Input Arguments:
813 .  sf - star forest
814 
815    Output Arguments:
816 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
817 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
818 
819    Level: developer
820 
821 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
822 @*/
823 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
824 {
825   PetscFunctionBegin;
826   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
827   PetscSFCheckGraphSet(sf,1);
828   if (minleaf) *minleaf = sf->minleaf;
829   if (maxleaf) *maxleaf = sf->maxleaf;
830   PetscFunctionReturn(0);
831 }
832 
833 /*@C
834    PetscSFViewFromOptions - View from Options
835 
836    Collective on PetscSF
837 
838    Input Parameters:
839 +  A - the star forest
840 .  obj - Optional object
841 -  name - command line option
842 
843    Level: intermediate
844 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
845 @*/
846 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
847 {
848   PetscErrorCode ierr;
849 
850   PetscFunctionBegin;
851   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
852   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
853   PetscFunctionReturn(0);
854 }
855 
/*@C
   PetscSFView - view a star forest

   Collective

   Input Arguments:
+  sf - star forest
-  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD

   Level: beginner

.seealso: PetscSFCreate(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to stdout on the SF's communicator when no viewer is supplied */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Setup is needed so rank/offset tables exist for the detailed output below */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    /* Give the implementation a chance to print type-specific information first */
    if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      /* Synchronized mode so every rank's lines appear in rank order */
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        /* sf->mine == NULL means leaves are contiguous, so leaf i lives at index i */
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Sort a copy of the rank list and keep a permutation so output is ordered by rank
           without disturbing sf->ranks (whose order matters to roffset/rmine/rremote) */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          /* roffset[i]..roffset[i+1] delimits this rank's slice of rmine/rremote */
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
926 
927 /*@C
928    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
929 
930    Not Collective
931 
932    Input Arguments:
933 .  sf - star forest
934 
935    Output Arguments:
936 +  nranks - number of ranks referenced by local part
937 .  ranks - array of ranks
938 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
939 .  rmine - concatenated array holding local indices referencing each remote rank
940 -  rremote - concatenated array holding remote indices referenced for each remote rank
941 
942    Level: developer
943 
944 .seealso: PetscSFGetLeafRanks()
945 @*/
946 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
947 {
948   PetscErrorCode ierr;
949 
950   PetscFunctionBegin;
951   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
952   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
953   if (sf->ops->GetRootRanks) {
954     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
955   } else {
956     /* The generic implementation */
957     if (nranks)  *nranks  = sf->nranks;
958     if (ranks)   *ranks   = sf->ranks;
959     if (roffset) *roffset = sf->roffset;
960     if (rmine)   *rmine   = sf->rmine;
961     if (rremote) *rremote = sf->rremote;
962   }
963   PetscFunctionReturn(0);
964 }
965 
966 /*@C
967    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
968 
969    Not Collective
970 
971    Input Arguments:
972 .  sf - star forest
973 
974    Output Arguments:
975 +  niranks - number of leaf ranks referencing roots on this process
976 .  iranks - array of ranks
977 .  ioffset - offset in irootloc for each rank (length niranks+1)
978 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
979 
980    Level: developer
981 
982 .seealso: PetscSFGetRootRanks()
983 @*/
984 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
985 {
986   PetscErrorCode ierr;
987 
988   PetscFunctionBegin;
989   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
990   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
991   if (sf->ops->GetLeafRanks) {
992     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
993   } else {
994     PetscSFType type;
995     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
996     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
997   }
998   PetscFunctionReturn(0);
999 }
1000 
1001 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
1002   PetscInt i;
1003   for (i=0; i<n; i++) {
1004     if (needle == list[i]) return PETSC_TRUE;
1005   }
1006   return PETSC_FALSE;
1007 }
1008 
/*@C
   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.

   Collective

   Input Arguments:
+  sf - PetscSF to set up; PetscSFSetGraph() must have been called
-  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)

   Level: developer

.seealso: PetscSFGetRootRanks()
@*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
  /* Count, via a hash table, how many of this process's leaves reference each remote rank */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  /* rmine/rremote are sized by nleaves: every leaf lands in exactly one rank's bucket */
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate the distinguished group's members into rank numbers on sf's communicator */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRMPI(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRMPI(ierr);}
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  /* Two-pointer partition: sf->ndranks advances from the front past distinguished entries,
     i retreats from the back past non-distinguished entries, swapping when both stall */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i;) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for (; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition separately so both sublists are ascending by rank */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Prefix-sum the per-rank counts into roffset[]; reuse rcount[] as per-rank fill cursors */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Bucket each leaf into its owning rank's segment of rmine/rremote */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks */
      /* Binary-search the distinguished sublist first, then the rest (each is sorted) */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1112 
/*@C
   PetscSFGetGroups - gets incoming and outgoing process groups

   Collective

   Input Argument:
.  sf - star forest

   Output Arguments:
+  incoming - group of origin processes for incoming edges (leaves that reference my roots)
-  outgoing - group of destination processes for outgoing edges (roots that I reference)

   Level: developer

.seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
@*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  /* Groups are computed lazily and cached on the SF (sf->ingroup/sf->outgroup) */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks */
    /* Build an auxiliary SF with one root per process and one leaf per rank we reference;
       the degree of our single root then equals the number of ranks referencing us */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
    /* Enumerate the incoming ranks */
    /* Each process sends its own rank; gathering at the root collects the ids of all referencing ranks */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  /* Outgoing group is simply the (already known) list of root ranks we reference */
  if (sf->outgroup == MPI_GROUP_NULL) {
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1175 
1176 /*@
1177    PetscSFGetMultiSF - gets the inner SF implemeting gathers and scatters
1178 
1179    Collective
1180 
1181    Input Argument:
1182 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1183 
1184    Output Arguments:
1185 .  multi - star forest with split roots, such that each root has degree exactly 1
1186 
1187    Level: developer
1188 
1189    Notes:
1190 
1191    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1192    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1193    edge, it is a candidate for future optimization that might involve its removal.
1194 
1195 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1196 @*/
1197 PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
1198 {
1199   PetscErrorCode ierr;
1200 
1201   PetscFunctionBegin;
1202   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1203   PetscValidPointer(multi,2);
1204   if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
1205     ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1206     *multi = sf->multi;
1207     PetscFunctionReturn(0);
1208   }
1209   if (!sf->multi) {
1210     const PetscInt *indegree;
1211     PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
1212     PetscSFNode    *remote;
1213     maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
1214     ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
1215     ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
1216     ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
1217     inoffset[0] = 0;
1218     for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
1219     for (i=0; i<maxlocal; i++) outones[i] = 1;
1220     ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1221     ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1222     for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
1223     if (PetscDefined(USE_DEBUG)) { /* Check that the expected number of increments occurred */
1224       for (i=0; i<sf->nroots; i++) {
1225         if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
1226       }
1227     }
1228     ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
1229     for (i=0; i<sf->nleaves; i++) {
1230       remote[i].rank  = sf->remote[i].rank;
1231       remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
1232     }
1233     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1234     ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1235     if (sf->rankorder) {        /* Sort the ranks */
1236       PetscMPIInt rank;
1237       PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
1238       PetscSFNode *newremote;
1239       ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
1240       for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
1241       ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
1242       for (i=0; i<maxlocal; i++) outranks[i] = rank;
1243       ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
1244       ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
1245       /* Sort the incoming ranks at each vertex, build the inverse map */
1246       for (i=0; i<sf->nroots; i++) {
1247         PetscInt j;
1248         for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
1249         ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
1250         for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
1251       }
1252       ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
1253       ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
1254       ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
1255       for (i=0; i<sf->nleaves; i++) {
1256         newremote[i].rank  = sf->remote[i].rank;
1257         newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
1258       }
1259       ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1260       ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
1261     }
1262     ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
1263   }
1264   *multi = sf->multi;
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 /*@C
1269    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1270 
1271    Collective
1272 
1273    Input Arguments:
1274 +  sf - original star forest
1275 .  nselected  - number of selected roots on this process
1276 -  selected   - indices of the selected roots on this process
1277 
1278    Output Arguments:
1279 .  esf - new star forest
1280 
1281    Level: advanced
1282 
1283    Note:
1284    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1285    be done by calling PetscSFGetGraph().
1286 
1287 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1288 @*/
1289 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *esf)
1290 {
1291   PetscInt          i,j,n,nroots,nleaves,esf_nleaves,*new_ilocal,minleaf,maxleaf,maxlocal;
1292   const PetscInt    *ilocal;
1293   signed char       *rootdata,*leafdata,*leafmem;
1294   const PetscSFNode *iremote;
1295   PetscSFNode       *new_iremote;
1296   MPI_Comm          comm;
1297   PetscErrorCode    ierr;
1298 
1299   PetscFunctionBegin;
1300   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1301   PetscSFCheckGraphSet(sf,1);
1302   if (nselected) PetscValidPointer(selected,3);
1303   PetscValidPointer(esf,4);
1304 
1305   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1306   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1307   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1308   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1309 
1310   if (PetscDefined(USE_DEBUG)) {  /* Error out if selected[] has dups or  out of range indices */
1311     PetscBool dups;
1312     ierr = PetscCheckDupsInt(nselected,selected,&dups);CHKERRQ(ierr);
1313     if (dups) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"selected[] has dups");
1314     for (i=0; i<nselected; i++)
1315       if (selected[i] < 0 || selected[i] >= nroots) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"selected root indice %D is out of [0,%D)",selected[i],nroots);
1316   }
1317 
1318   if (sf->ops->CreateEmbeddedSF) {
1319     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,selected,esf);CHKERRQ(ierr);
1320   } else {
1321     /* A generic version of creating embedded sf */
1322     ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
1323     maxlocal = maxleaf - minleaf + 1;
1324     ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafmem);CHKERRQ(ierr);
1325     leafdata = leafmem - minleaf;
1326     /* Tag selected roots and bcast to leaves */
1327     for (i=0; i<nselected; i++) rootdata[selected[i]] = 1;
1328     ierr = PetscSFBcastBegin(sf,MPI_SIGNED_CHAR,rootdata,leafdata);CHKERRQ(ierr);
1329     ierr = PetscSFBcastEnd(sf,MPI_SIGNED_CHAR,rootdata,leafdata);CHKERRQ(ierr);
1330 
1331     /* Build esf with leaves that are still connected */
1332     esf_nleaves = 0;
1333     for (i=0; i<nleaves; i++) {
1334       j = ilocal ? ilocal[i] : i;
1335       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1336          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1337       */
1338       esf_nleaves += (leafdata[j] ? 1 : 0);
1339     }
1340     ierr = PetscMalloc1(esf_nleaves,&new_ilocal);CHKERRQ(ierr);
1341     ierr = PetscMalloc1(esf_nleaves,&new_iremote);CHKERRQ(ierr);
1342     for (i=n=0; i<nleaves; i++) {
1343       j = ilocal ? ilocal[i] : i;
1344       if (leafdata[j]) {
1345         new_ilocal[n]        = j;
1346         new_iremote[n].rank  = iremote[i].rank;
1347         new_iremote[n].index = iremote[i].index;
1348         ++n;
1349       }
1350     }
1351     ierr = PetscSFCreate(comm,esf);CHKERRQ(ierr);
1352     ierr = PetscSFSetFromOptions(*esf);CHKERRQ(ierr);
1353     ierr = PetscSFSetGraph(*esf,nroots,esf_nleaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1354     ierr = PetscFree2(rootdata,leafmem);CHKERRQ(ierr);
1355   }
1356   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1357   PetscFunctionReturn(0);
1358 }
1359 
1360 /*@C
1361   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1362 
1363   Collective
1364 
1365   Input Arguments:
1366 + sf - original star forest
1367 . nselected  - number of selected leaves on this process
1368 - selected   - indices of the selected leaves on this process
1369 
1370   Output Arguments:
1371 .  newsf - new star forest
1372 
1373   Level: advanced
1374 
1375 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1376 @*/
1377 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1378 {
1379   const PetscSFNode *iremote;
1380   PetscSFNode       *new_iremote;
1381   const PetscInt    *ilocal;
1382   PetscInt          i,nroots,*leaves,*new_ilocal;
1383   MPI_Comm          comm;
1384   PetscErrorCode    ierr;
1385 
1386   PetscFunctionBegin;
1387   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1388   PetscSFCheckGraphSet(sf,1);
1389   if (nselected) PetscValidPointer(selected,3);
1390   PetscValidPointer(newsf,4);
1391 
1392   /* Uniq selected[] and put results in leaves[] */
1393   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1394   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1395   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1396   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1397   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1398 
1399   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1400   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1401     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1402   } else {
1403     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1404     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1405     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1406     for (i=0; i<nselected; ++i) {
1407       const PetscInt l     = leaves[i];
1408       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1409       new_iremote[i].rank  = iremote[l].rank;
1410       new_iremote[i].index = iremote[l].index;
1411     }
1412     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1413     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1414   }
1415   ierr = PetscFree(leaves);CHKERRQ(ierr);
1416   PetscFunctionReturn(0);
1417 }
1418 
1419 /*@C
1420    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1421 
1422    Collective on PetscSF
1423 
1424    Input Arguments:
1425 +  sf - star forest on which to communicate
1426 .  unit - data type associated with each node
1427 .  rootdata - buffer to broadcast
1428 -  op - operation to use for reduction
1429 
1430    Output Arguments:
1431 .  leafdata - buffer to be reduced with values from each leaf's respective root
1432 
1433    Level: intermediate
1434 
1435    Notes:
1436     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1437     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1438     use PetscSFBcastAndOpWithMemTypeBegin() instead.
1439 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd(), PetscSFBcastAndOpWithMemTypeBegin()
1440 @*/
1441 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1442 {
1443   PetscErrorCode ierr;
1444   PetscMemType   rootmtype,leafmtype;
1445 
1446   PetscFunctionBegin;
1447   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1448   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1449   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1450   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1451   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1452   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1453   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1454   PetscFunctionReturn(0);
1455 }
1456 
1457 /*@C
1458    PetscSFBcastAndOpWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call to PetscSFBcastAndOpEnd()
1459 
1460    Collective on PetscSF
1461 
1462    Input Arguments:
1463 +  sf - star forest on which to communicate
1464 .  unit - data type associated with each node
1465 .  rootmtype - memory type of rootdata
1466 .  rootdata - buffer to broadcast
1467 .  leafmtype - memory type of leafdata
1468 -  op - operation to use for reduction
1469 
1470    Output Arguments:
1471 .  leafdata - buffer to be reduced with values from each leaf's respective root
1472 
1473    Level: intermediate
1474 
1475 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd(),PetscSFBcastAndOpBegin()
1476 @*/
1477 PetscErrorCode PetscSFBcastAndOpWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
1478 {
1479   PetscErrorCode ierr;
1480 
1481   PetscFunctionBegin;
1482   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1483   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1484   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1485   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1486   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1487   PetscFunctionReturn(0);
1488 }
1489 
1490 /*@C
1491    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1492 
1493    Collective
1494 
1495    Input Arguments:
1496 +  sf - star forest
1497 .  unit - data type
1498 .  rootdata - buffer to broadcast
1499 -  op - operation to use for reduction
1500 
1501    Output Arguments:
1502 .  leafdata - buffer to be reduced with values from each leaf's respective root
1503 
1504    Level: intermediate
1505 
1506 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1507 @*/
1508 PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1509 {
1510   PetscErrorCode ierr;
1511 
1512   PetscFunctionBegin;
1513   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1514   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);}
1515   ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootdata,leafdata,op);CHKERRQ(ierr);
1516   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);}
1517   PetscFunctionReturn(0);
1518 }
1519 
1520 /*@C
1521    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1522 
1523    Collective
1524 
1525    Input Arguments:
1526 +  sf - star forest
1527 .  unit - data type
1528 .  leafdata - values to reduce
1529 -  op - reduction operation
1530 
1531    Output Arguments:
1532 .  rootdata - result of reduction of values from all leaves of each root
1533 
1534    Level: intermediate
1535 
1536    Notes:
1537     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1538     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1539     use PetscSFReduceWithMemTypeBegin() instead.
1540 
1541 .seealso: PetscSFBcastBegin(), PetscSFReduceWithMemTypeBegin()
1542 @*/
1543 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1544 {
1545   PetscErrorCode ierr;
1546   PetscMemType   rootmtype,leafmtype;
1547 
1548   PetscFunctionBegin;
1549   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1550   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1551   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1552   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1553   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1554   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1555   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1556   PetscFunctionReturn(0);
1557 }
1558 
1559 /*@C
1560    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to PetscSFReduceEnd()
1561 
1562    Collective
1563 
1564    Input Arguments:
1565 +  sf - star forest
1566 .  unit - data type
1567 .  leafmtype - memory type of leafdata
1568 .  leafdata - values to reduce
1569 .  rootmtype - memory type of rootdata
1570 -  op - reduction operation
1571 
1572    Output Arguments:
1573 .  rootdata - result of reduction of values from all leaves of each root
1574 
1575    Level: intermediate
1576 
1577 .seealso: PetscSFBcastBegin(), PetscSFReduceBegin()
1578 @*/
1579 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
1580 {
1581   PetscErrorCode ierr;
1582 
1583   PetscFunctionBegin;
1584   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1585   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1586   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1587   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1588   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1589   PetscFunctionReturn(0);
1590 }
1591 
1592 /*@C
1593    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1594 
1595    Collective
1596 
1597    Input Arguments:
1598 +  sf - star forest
1599 .  unit - data type
1600 .  leafdata - values to reduce
1601 -  op - reduction operation
1602 
1603    Output Arguments:
1604 .  rootdata - result of reduction of values from all leaves of each root
1605 
1606    Level: intermediate
1607 
1608 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1609 @*/
1610 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1611 {
1612   PetscErrorCode ierr;
1613 
1614   PetscFunctionBegin;
1615   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1616   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
1617   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
1618   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
1619   PetscFunctionReturn(0);
1620 }
1621 
1622 /*@C
1623    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1624 
1625    Collective
1626 
1627    Input Arguments:
1628 +  sf - star forest
1629 .  unit - data type
1630 .  leafdata - leaf values to use in reduction
1631 -  op - operation to use for reduction
1632 
1633    Output Arguments:
1634 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1635 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1636 
1637    Level: advanced
1638 
1639    Note:
1640    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1641    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1642    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1643    integers.
1644 
1645 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1646 @*/
1647 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1648 {
1649   PetscErrorCode ierr;
1650   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1651 
1652   PetscFunctionBegin;
1653   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1654   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1655   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1656   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1657   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1658   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1659   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1660   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1661   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1662   PetscFunctionReturn(0);
1663 }
1664 
1665 /*@C
1666    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1667 
1668    Collective
1669 
1670    Input Arguments:
1671 +  sf - star forest
1672 .  unit - data type
1673 .  leafdata - leaf values to use in reduction
1674 -  op - operation to use for reduction
1675 
1676    Output Arguments:
1677 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1678 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1679 
1680    Level: advanced
1681 
1682 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1683 @*/
1684 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1685 {
1686   PetscErrorCode ierr;
1687 
1688   PetscFunctionBegin;
1689   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1690   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1691   ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootdata,leafdata,leafupdate,op);CHKERRQ(ierr);
1692   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1693   PetscFunctionReturn(0);
1694 }
1695 
/*@C
   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex; set to NULL here, the actual array is made available by PetscSFComputeDegreeEnd()

   Level: advanced

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal;
    /* degreetmp doubles as the in-flight marker: non-NULL means a Begin is pending */
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    maxlocal = sf->maxleaf-sf->minleaf+1; /* size of the dense local leaf range [minleaf,maxleaf] */
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    /* Degree = number of leaves per root: reduce the all-ones leaf array with MPI_SUM.
       degreetmp is offset by -minleaf so it can be indexed by global leaf number. */
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  *degree = NULL; /* result is only valid after PetscSFComputeDegreeEnd() */
  PetscFunctionReturn(0);
}
1735 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: developer

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFComputeDegreeBegin()
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    /* degreetmp is allocated by the matching Begin; its absence means Begin was never called */
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    /* Same shifted leaf buffer as in Begin; completes the MPI_SUM reduction into sf->degree */
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    sf->degreeknown = PETSC_TRUE; /* cache: subsequent Begin/End pairs return the stored array */
  }
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1771 
1772 
1773 /*@C
1774    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1775    Each multi-root is assigned index of the corresponding original root.
1776 
1777    Collective
1778 
1779    Input Arguments:
1780 +  sf - star forest
1781 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1782 
1783    Output Arguments:
1784 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1785 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1786 
1787    Level: developer
1788 
1789    Notes:
1790    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1791 
1792 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1793 @*/
1794 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1795 {
1796   PetscSF             msf;
1797   PetscInt            i, j, k, nroots, nmroots;
1798   PetscErrorCode      ierr;
1799 
1800   PetscFunctionBegin;
1801   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1802   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1803   if (nroots) PetscValidIntPointer(degree,2);
1804   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1805   PetscValidPointer(multiRootsOrigNumbering,4);
1806   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1807   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1808   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1809   for (i=0,j=0,k=0; i<nroots; i++) {
1810     if (!degree[i]) continue;
1811     for (j=0; j<degree[i]; j++,k++) {
1812       (*multiRootsOrigNumbering)[k] = i;
1813     }
1814   }
1815   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1816   if (nMultiRoots) *nMultiRoots = nmroots;
1817   PetscFunctionReturn(0);
1818 }
1819 
1820 /*@C
1821    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1822 
1823    Collective
1824 
1825    Input Arguments:
1826 +  sf - star forest
1827 .  unit - data type
1828 -  leafdata - leaf data to gather to roots
1829 
1830    Output Argument:
1831 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1832 
1833    Level: intermediate
1834 
1835 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1836 @*/
1837 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1838 {
1839   PetscErrorCode ierr;
1840   PetscSF        multi = NULL;
1841 
1842   PetscFunctionBegin;
1843   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1844   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1845   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1846   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1847   PetscFunctionReturn(0);
1848 }
1849 
1850 /*@C
1851    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1852 
1853    Collective
1854 
1855    Input Arguments:
1856 +  sf - star forest
1857 .  unit - data type
1858 -  leafdata - leaf data to gather to roots
1859 
1860    Output Argument:
1861 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1862 
1863    Level: intermediate
1864 
1865 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1866 @*/
1867 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1868 {
1869   PetscErrorCode ierr;
1870   PetscSF        multi = NULL;
1871 
1872   PetscFunctionBegin;
1873   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1874   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1875   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1876   PetscFunctionReturn(0);
1877 }
1878 
/*@C
   PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeBegin(), PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Scatter = broadcast over the multi-SF, in which each root appears once per connected leaf */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1908 
/*@C
   PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeEnd(), PetscSFGatherEnd()
@*/
PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Complete the broadcast over the multi-SF started by PetscSFScatterBegin() */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1937 
1938 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1939 {
1940   PetscInt        i, n, nleaves;
1941   const PetscInt *ilocal = NULL;
1942   PetscHSetI      seen;
1943   PetscErrorCode  ierr;
1944 
1945   PetscFunctionBegin;
1946   if (PetscDefined(USE_DEBUG)) {
1947     ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1948     ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1949     for (i = 0; i < nleaves; i++) {
1950       const PetscInt leaf = ilocal ? ilocal[i] : i;
1951       ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1952     }
1953     ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1954     if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1955     ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1956   }
1957   PetscFunctionReturn(0);
1958 }
1959 
/*@
  PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots.

  sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
  nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
  Bcast on sfA, then a Bcast on sfB, on connected nodes.

.seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  /* Both inputs must be true star forests: no leaf may repeat (debug-only check) */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  if (localPointsA) {
    /* Scatter sfA's remote roots into a dense array indexed by sfA leaf number (== sfB root number),
       marking unconnected entries with rank = index = -1 */
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue; /* sfA leaf has no matching sfB root */
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA;
  }
  /* Push sfA's root info down through sfB: each sfB leaf learns which sfA root (if any) reaches it.
     MPIU_2INT matches PetscSFNode's two-PetscInt layout -- TODO confirm for 64-bit indices */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr); /* leafdataB is offset to be indexable by global leaf number */
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* denseB: sfB's leaves are contiguous AND every one of them is connected to an sfA root,
     in which case leafdataB can be used directly as the remote array of the result */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    localPointsBA  = NULL;
    remotePointsBA = leafdataB; /* ownership passes to the new SF below */
  } else {
    /* Compact: keep only leaves that are connected (rank != -1) */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  /* PETSC_OWN_POINTER: the new SF takes over both arrays, no copies are made */
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2049 
/*@
  PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF.

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
  second SF must have a degree of 1, i.e., no roots have more than one leaf connected.

  sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
  roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
  a Reduce on sfB, on connected roots.

.seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
@*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
  MPI_Op            op;
#if defined(PETSC_USE_64BIT_INDICES)
  PetscBool         iswin;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  /* Both inputs must be true star forests: no leaf may repeat (debug-only check) */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);

  /* TODO: Check roots of sfB have degree of 1 */
  /* Once we implement it, we can replace the MPI_MAXLOC
     with MPIU_REPLACE. In that case, MPI_MAXLOC and MPIU_REPLACE have the same effect.
     We use MPI_MAXLOC only to have a deterministic output from this routine if
     the root condition is not meet.
   */
  op = MPI_MAXLOC;
#if defined(PETSC_USE_64BIT_INDICES)
  /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
  ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
  if (iswin) op = MPIU_REPLACE;
#endif

  /* Scatter sfA's remote roots into a dense array covering sfB's leaf range [minleaf,maxleaf],
     marking entries with no sfA leaf as rank = index = -1 */
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue; /* sfA leaf outside sfB's leaf range */
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* NULL localPointsA means contiguous leaves 0..numLeavesA-1 */
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Pull sfA's root info up through sfB (inverse direction): each sfB root learns which
     sfA root (if any) reaches it; MPIU_2INT matches PetscSFNode's two-PetscInt layout */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place: keep only connected roots (rank != -1); reads are always at or ahead of writes */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  /* PETSC_OWN_POINTER: the new SF takes over both arrays, no copies are made */
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2153 
/*
  PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF

  Input Parameters:
. sf - The global PetscSF

  Output Parameters:
. out - The local PetscSF
 */
PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
{
  MPI_Comm           comm;
  PetscMPIInt        myrank;
  const PetscInt     *ilocal;
  const PetscSFNode  *iremote;
  PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
  PetscSFNode        *liremote;
  PetscSF            lsf;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the implementation build the local SF if it knows how; otherwise use the generic path below */
  if (sf->ops->CreateLocalSF) {
    ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
  } else {
    /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
    ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(comm,&myrank);CHKERRMPI(ierr);

    /* Find out local edges and build a local SF */
    ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
    /* First pass: count edges whose root lives on this rank */
    for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
    ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
    ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);

    /* Second pass: copy the intra-process edges, remapping the remote rank to 0 */
    for (i=j=0; i<nleaves; i++) {
      if (iremote[i].rank == (PetscInt)myrank) {
        lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
        liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
        liremote[j].index = iremote[i].index;
        j++;
      }
    }
    ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
    ierr = PetscSFSetFromOptions(lsf);CHKERRQ(ierr);
    /* PETSC_OWN_POINTER: lsf takes over lilocal and liremote, no copies are made */
    ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
    *out = lsf;
  }
  PetscFunctionReturn(0);
}
2205 
2206 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2207 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
2208 {
2209   PetscErrorCode     ierr;
2210   PetscMemType       rootmtype,leafmtype;
2211 
2212   PetscFunctionBegin;
2213   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2214   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
2215   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2216   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
2217   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
2218   if (sf->ops->BcastToZero) {
2219     ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
2220   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
2221   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2222   PetscFunctionReturn(0);
2223 }
2224 
2225