xref: /petsc/src/vec/is/sf/interface/sf.c (revision e19f88df7cd0bcfe73faf98683db6f77794e28aa)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
#if defined(PETSC_USE_DEBUG)
/* Debug-only guard: raise an error if the graph of sf has not been set with
   PetscSFSetGraph() or PetscSFSetGraphWithPattern(); arg is the argument number
   reported in the error message. Expands to a no-op in optimized builds. */
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif
13 
14 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",0};
15 
16 /*@
17    PetscSFCreate - create a star forest communication context
18 
19    Collective
20 
21    Input Arguments:
22 .  comm - communicator on which the star forest will operate
23 
24    Output Arguments:
25 .  sf - new star forest context
26 
27    Options Database Keys:
28 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
29 .  -sf_type window    -Use MPI-3 one-sided window for communication
30 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
31 
32    Level: intermediate
33 
34    Notes:
35    When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
36    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
37    SFs are optimized and they have better performance than general SFs.
38 
39 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
40 @*/
41 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
42 {
43   PetscErrorCode ierr;
44   PetscSF        b;
45 
46   PetscFunctionBegin;
47   PetscValidPointer(sf,2);
48   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
49 
50   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
51 
52   b->nroots    = -1;
53   b->nleaves   = -1;
54   b->minleaf   = PETSC_MAX_INT;
55   b->maxleaf   = PETSC_MIN_INT;
56   b->nranks    = -1;
57   b->rankorder = PETSC_TRUE;
58   b->ingroup   = MPI_GROUP_NULL;
59   b->outgroup  = MPI_GROUP_NULL;
60   b->graphset  = PETSC_FALSE;
61 
62   *sf = b;
63   PetscFunctionReturn(0);
64 }
65 
/*@
   PetscSFReset - Reset a star forest so that different sizes or neighbors can be used

   Collective

   Input Arguments:
.  sf - star forest

   Level: advanced

.seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
@*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the type-specific implementation release its own data first */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Return the graph description to its just-created (unset) state */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  /* Free owned copies of ilocal/iremote (no-ops when the user retained ownership) */
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  {
  /* Free any device copies of rmine and mark the slots empty */
  PetscInt  i;
  for (i=0; i<2; i++) {if (sf->rmine_d[i]) {cudaError_t err = cudaFree(sf->rmine_d[i]);CHKERRCUDA(err);sf->rmine_d[i]=NULL;}}
  }
#endif
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  /* MPI groups are created lazily elsewhere; only free them if they exist */
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRQ(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRQ(ierr);}
  /* Derived objects (multi-SF, patterned-graph layout) are dropped as well */
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
111 
112 /*@C
113    PetscSFSetType - Set the PetscSF communication implementation
114 
115    Collective on PetscSF
116 
117    Input Parameters:
118 +  sf - the PetscSF context
119 -  type - a known method
120 
121    Options Database Key:
122 .  -sf_type <type> - Sets the method; use -help for a list
123    of available methods (for instance, window, basic, neighbor)
124 
125    Notes:
126    See "include/petscsf.h" for available methods (for instance)
127 +    PETSCSFWINDOW - MPI-2/3 one-sided
128 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
129 
130   Level: intermediate
131 
132 .seealso: PetscSFType, PetscSFCreate()
133 @*/
134 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
135 {
136   PetscErrorCode ierr,(*r)(PetscSF);
137   PetscBool      match;
138 
139   PetscFunctionBegin;
140   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
141   PetscValidCharPointer(type,2);
142 
143   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
144   if (match) PetscFunctionReturn(0);
145 
146   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
147   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
148   /* Destroy the previous PetscSF implementation context */
149   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
150   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
151   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
152   ierr = (*r)(sf);CHKERRQ(ierr);
153   PetscFunctionReturn(0);
154 }
155 
/*@C
  PetscSFGetType - Get the PetscSF communication implementation

  Not Collective

  Input Parameter:
. sf  - the PetscSF context

  Output Parameter:
. type - the PetscSF type name

  Level: intermediate

.seealso: PetscSFSetType(), PetscSFCreate()
@*/
PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
  PetscValidPointer(type,2);
  /* Returns NULL if no type has been set yet; callers (e.g. PetscSFDuplicate) check for that */
  *type = ((PetscObject)sf)->type_name;
  PetscFunctionReturn(0);
}
179 
/*@
   PetscSFDestroy - destroy star forest

   Collective

   Input Arguments:
.  sf - address of star forest; set to NULL on return

   Level: intermediate

.seealso: PetscSFCreate(), PetscSFReset()
@*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Destroying a NULL SF is a no-op, matching PETSc destroy semantics */
  if (!*sf) PetscFunctionReturn(0);
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Only the holder of the last reference performs the actual destruction */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  /* Give the implementation a chance to free its context before the header goes away */
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
205 
206 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
207 {
208 #if defined(PETSC_USE_DEBUG)
209   PetscInt           i, nleaves;
210   PetscMPIInt        size;
211   const PetscInt    *ilocal;
212   const PetscSFNode *iremote;
213   PetscErrorCode     ierr;
214 
215   PetscFunctionBegin;
216   if (!sf->graphset) PetscFunctionReturn(0);
217   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
218   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
219   for (i = 0; i < nleaves; i++) {
220     const PetscInt rank = iremote[i].rank;
221     const PetscInt remote = iremote[i].index;
222     const PetscInt leaf = ilocal ? ilocal[i] : i;
223     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
224     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
225     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
226   }
227   PetscFunctionReturn(0);
228 #else
229   PetscFunctionBegin;
230   PetscFunctionReturn(0);
231 #endif
232 }
233 
/*@
   PetscSFSetUp - set up communication structures

   Collective

   Input Arguments:
.  sf - star forest communication object

   Level: beginner

.seealso: PetscSFSetFromOptions(), PetscSFSetType()
@*/
PetscErrorCode PetscSFSetUp(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  /* Idempotent: a second call is a no-op until PetscSFReset() clears setupcalled */
  if (sf->setupcalled) PetscFunctionReturn(0);
  /* Validate graph entries (debug builds only) before committing to setup */
  ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
  /* Latch the global GPU-aware-MPI flag at setup time */
  sf->use_gpu_aware_mpi = use_gpu_aware_mpi;
  /* Default to the basic implementation if the user never chose a type */
  if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);}
  ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
  ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  sf->setupcalled = PETSC_TRUE;
  PetscFunctionReturn(0);
}
263 
/*@
   PetscSFSetFromOptions - set PetscSF options using the options database

   Logically Collective

   Input Arguments:
.  sf - star forest

   Options Database Keys:
+  -sf_type               - implementation type, see PetscSFSetType()
.  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
.  -sf_use_default_stream - Assume callers of SF computed the input root/leafdata with the default cuda stream. SF will also
                            use the default stream to process data. Therefore, no stream synchronization is needed between SF and its caller (default: true).
                            If true, this option only works with -use_cuda_aware_mpi 1.
-  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
                               If true, this option only works with -use_cuda_aware_mpi 1.

   Level: intermediate
@*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Default to the current type if one is set, otherwise to basic */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);

#if defined(PETSC_HAVE_CUDA)
  sf->use_default_stream = PETSC_TRUE; /* The assumption is true for PETSc internal use of SF */
  ierr = PetscOptionsBool("-sf_use_default_stream","Assume SF's input and output root/leafdata is computed on the default stream","PetscSFSetFromOptions",sf->use_default_stream,&sf->use_default_stream,NULL);CHKERRQ(ierr);
  sf->use_stream_aware_mpi = PETSC_FALSE;
  ierr = PetscOptionsBool("-sf_use_stream_aware_mpi","Assume the underlying MPI is cuda-stream aware","PetscSFSetFromOptions",sf->use_stream_aware_mpi,&sf->use_stream_aware_mpi,NULL);CHKERRQ(ierr);
#endif
  /* PetscOptionsObject is declared by the PetscObjectOptionsBegin() macro above */
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
308 
/*@
   PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order

   Logically Collective

   Input Arguments:
+  sf - star forest
-  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)

   Level: advanced

.seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
@*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveBool(sf,flg,2);
  /* sf->multi is created on first gather/scatter; once it exists the ordering is baked in */
  if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(0);
}
331 
/*@
   PetscSFSetGraph - Set a parallel star forest

   Collective

   Input Arguments:
+  sf - star forest
.  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
.  nleaves - number of leaf vertices on the current process, each of these references a root on any process
.  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
during setup in debug mode)
.  localmode - copy mode for ilocal
.  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
during setup in debug mode)
-  remotemode - copy mode for iremote

   Level: intermediate

   Notes:
    In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode

   Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
   encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
   needed

   Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
   indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).

.seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
@*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    /* Compute the leaf range and detect the identity permutation, which encodes
       contiguous storage and lets us drop ilocal entirely */
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* With PETSC_OWN_POINTER ownership was transferred to us, so free the now-unneeded array */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* Contiguous leaves: range is [0, nleaves-1] (empty range when nleaves == 0) */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  /* Store ilocal according to the requested copy mode; mine_alloc tracks what we own */
  if (ilocal) {
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      /* Caller retains ownership; we must not free it in PetscSFReset() */
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Store iremote the same way; remote_alloc tracks ownership */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
445 
446 /*@
447   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
448 
449   Collective
450 
451   Input Parameters:
452 + sf      - The PetscSF
453 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
454 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
455 
456   Notes:
457   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
458   n and N are local and global sizes of x respectively.
459 
460   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
461   sequential vectors y on all ranks.
462 
463   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
464   sequential vector y on rank 0.
465 
466   In above cases, entries of x are roots and entries of y are leaves.
467 
468   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
469   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
470   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
471   not mean one can not send multiple items. One just needs to create a new MPI datatype for the mulptiple data
472   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
473 
474   In this case, roots and leaves are symmetric.
475 
476   Level: intermediate
477  @*/
478 PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
479 {
480   MPI_Comm       comm;
481   PetscInt       n,N,res[2];
482   PetscMPIInt    rank,size;
483   PetscSFType    type;
484   PetscErrorCode ierr;
485 
486   PetscFunctionBegin;
487   ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
488   if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
489   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
490   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
491 
492   if (pattern == PETSCSF_PATTERN_ALLTOALL) {
493     type = PETSCSFALLTOALL;
494     ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
495     ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
496     ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
497     ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
498   } else {
499     ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
500     ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
501     res[0] = n;
502     res[1] = -n;
503     /* Check if n are same over all ranks so that we can optimize it */
504     ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
505     if (res[0] == -res[1]) { /* same n */
506       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
507     } else {
508       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
509     }
510     ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
511   }
512   ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);
513 
514   sf->pattern = pattern;
515   sf->mine    = NULL; /* Contiguous */
516 
517   /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
518      Also set other easy stuff.
519    */
520   if (pattern == PETSCSF_PATTERN_ALLGATHER) {
521     sf->nleaves      = N;
522     sf->nroots       = n;
523     sf->nranks       = size;
524     sf->minleaf      = 0;
525     sf->maxleaf      = N - 1;
526   } else if (pattern == PETSCSF_PATTERN_GATHER) {
527     sf->nleaves      = rank ? 0 : N;
528     sf->nroots       = n;
529     sf->nranks       = rank ? 0 : size;
530     sf->minleaf      = 0;
531     sf->maxleaf      = rank ? -1 : N - 1;
532   } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
533     sf->nleaves      = size;
534     sf->nroots       = size;
535     sf->nranks       = size;
536     sf->minleaf      = 0;
537     sf->maxleaf      = size - 1;
538   }
539   sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
540   sf->graphset = PETSC_TRUE;
541   PetscFunctionReturn(0);
542 }
543 
/*@
   PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map

   Collective

   Input Arguments:
.  sf - star forest to invert

   Output Arguments:
.  isf - inverse of sf

   Level: advanced

   Notes:
   All roots must have degree 1.

   The local space may be a permutation, but cannot be sparse.

.seealso: PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Each leaf slot records its own (rank, index) so the reduction tells every
     root which leaf points at it */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* Roots start as (-1,-1); any root still negative after the reduction was unreferenced */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* every root referenced: inverse leaves are contiguous */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* newilocal ownership transfers to the new SF (PETSC_OWN_POINTER); roots is copied */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
613 
614 /*@
615    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
616 
617    Collective
618 
619    Input Arguments:
620 +  sf - communication object to duplicate
621 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
622 
623    Output Arguments:
624 .  newsf - new communication object
625 
626    Level: beginner
627 
628 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
629 @*/
630 PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
631 {
632   PetscSFType    type;
633   PetscErrorCode ierr;
634 
635   PetscFunctionBegin;
636   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
637   PetscValidLogicalCollectiveEnum(sf,opt,2);
638   PetscValidPointer(newsf,3);
639   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
640   ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
641   if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
642   if (opt == PETSCSF_DUPLICATE_GRAPH) {
643     PetscSFCheckGraphSet(sf,1);
644     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
645       PetscInt          nroots,nleaves;
646       const PetscInt    *ilocal;
647       const PetscSFNode *iremote;
648       ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
649       ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
650     } else {
651       ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
652     }
653   }
654   if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
655   PetscFunctionReturn(0);
656 }
657 
658 /*@C
659    PetscSFGetGraph - Get the graph specifying a parallel star forest
660 
661    Not Collective
662 
663    Input Arguments:
664 .  sf - star forest
665 
666    Output Arguments:
667 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
668 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
669 .  ilocal - locations of leaves in leafdata buffers
670 -  iremote - remote locations of root vertices for each leaf on the current process
671 
672    Notes:
673    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
674 
675    When called from Fortran, the returned iremote array is a copy and must be deallocated after use. Consequently, if you
676    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
677 
678    Level: intermediate
679 
680 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
681 @*/
682 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
683 {
684   PetscErrorCode ierr;
685 
686   PetscFunctionBegin;
687   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
688   if (sf->ops->GetGraph) {
689     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
690   } else {
691     if (nroots) *nroots = sf->nroots;
692     if (nleaves) *nleaves = sf->nleaves;
693     if (ilocal) *ilocal = sf->mine;
694     if (iremote) *iremote = sf->remote;
695   }
696   PetscFunctionReturn(0);
697 }
698 
/*@
   PetscSFGetLeafRange - Get the active leaf ranges

   Not Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
+  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
-  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.

   Level: developer

.seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
@*/
PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
{

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  /* Ranges were computed when the graph was set; both outputs are optional */
  if (minleaf) *minleaf = sf->minleaf;
  if (maxleaf) *maxleaf = sf->maxleaf;
  PetscFunctionReturn(0);
}
725 
/*@C
   PetscSFViewFromOptions - View the star forest based on values in the options database

   Collective on PetscSF

   Input Parameters:
+  A - the star forest
.  obj - Optional object that provides the options prefix
-  name - command line option

   Level: intermediate
.seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
@*/
PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
  /* Thin wrapper: delegate to the generic PetscObject options-based viewing */
  ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
748 
/*@C
   PetscSFView - view a star forest

   Collective

   Input Arguments:
+  sf - star forest
-  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD

   Level: beginner

.seealso: PetscSFCreate(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to stdout on the SF's communicator when no viewer is given */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Setup fills in the per-rank data (ranks/roffset/rmine/rremote) printed below */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      /* Synchronized printing keeps per-rank output ordered by rank */
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      /* One line per leaf: local location <- (remote rank, remote index) */
      for (i=0; i<sf->nleaves; i++) {
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        /* Sort a copy of the rank list so output is in ascending rank order;
           perm maps sorted positions back to the original arrays */
        PetscMPIInt *tmpranks,*perm;
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
819 
820 /*@C
821    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
822 
823    Not Collective
824 
825    Input Arguments:
826 .  sf - star forest
827 
828    Output Arguments:
829 +  nranks - number of ranks referenced by local part
830 .  ranks - array of ranks
831 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
832 .  rmine - concatenated array holding local indices referencing each remote rank
833 -  rremote - concatenated array holding remote indices referenced for each remote rank
834 
835    Level: developer
836 
837 .seealso: PetscSFGetLeafRanks()
838 @*/
839 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
840 {
841   PetscErrorCode ierr;
842 
843   PetscFunctionBegin;
844   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
845   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
846   if (sf->ops->GetRootRanks) {
847     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
848   } else {
849     /* The generic implementation */
850     if (nranks)  *nranks  = sf->nranks;
851     if (ranks)   *ranks   = sf->ranks;
852     if (roffset) *roffset = sf->roffset;
853     if (rmine)   *rmine   = sf->rmine;
854     if (rremote) *rremote = sf->rremote;
855   }
856   PetscFunctionReturn(0);
857 }
858 
859 /*@C
860    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
861 
862    Not Collective
863 
864    Input Arguments:
865 .  sf - star forest
866 
867    Output Arguments:
868 +  niranks - number of leaf ranks referencing roots on this process
869 .  iranks - array of ranks
870 .  ioffset - offset in irootloc for each rank (length niranks+1)
871 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
872 
873    Level: developer
874 
875 .seealso: PetscSFGetRootRanks()
876 @*/
877 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
878 {
879   PetscErrorCode ierr;
880 
881   PetscFunctionBegin;
882   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
883   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
884   if (sf->ops->GetLeafRanks) {
885     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
886   } else {
887     PetscSFType type;
888     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
889     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
890   }
891   PetscFunctionReturn(0);
892 }
893 
894 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
895   PetscInt i;
896   for (i=0; i<n; i++) {
897     if (needle == list[i]) return PETSC_TRUE;
898   }
899   return PETSC_FALSE;
900 }
901 
/*@C
   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.

   Collective

   Input Arguments:
+  sf - PetscSF to set up; PetscSFSetGraph() must have been called
-  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)

   Level: developer

   Notes:
   Fills sf->nranks, sf->ranks, sf->roffset, sf->rmine and sf->rremote: the leaf entries are grouped
   by owning root rank, with the ranks that belong to dgroup placed first (sf->ndranks of them) and
   each of the two partitions sorted by rank.

.seealso: PetscSFGetRootRanks()
@*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
  /* Count, per remote rank, how many of our leaves reference it */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank (presumably because the table cannot store the key 0 — the -1 below undoes it) */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  /* Drain the table into ranks[] (remote rank ids) and rcount[] (leaf counts per rank) */
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate the members of dgroup into ranks of the SF's communicator (groupranks[]) */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRQ(ierr);}
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished.
     Two-pointer swap scheme: sf->ndranks advances over distinguished entries from the front,
     i retreats over non-distinguished entries from the back; mismatched pairs are swapped. */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i; ) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for ( ; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition by rank, keeping the counts attached */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Turn the per-rank counts into CSR-style offsets; rcount[] is reset for reuse as a fill cursor */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Bucket every leaf into rmine/rremote by its owning rank; orank caches the last rank
     looked up so consecutive leaves with the same owner skip the binary search */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks: first among the distinguished
         ranks, then among the rest (each partition is sorted separately above) */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1005 
/*@C
   PetscSFGetGroups - gets incoming and outgoing process groups

   Collective

   Input Argument:
.  sf - star forest

   Output Arguments:
+  incoming - group of origin processes for incoming edges (leaves that reference my roots)
-  outgoing - group of destination processes for outgoing edges (roots that I reference)

   Level: developer

   Notes:
   Both groups are computed lazily on first call, cached on the SF (sf->ingroup, sf->outgroup) and
   owned by it; callers must not free them. PetscSFSetUpRanks() must have been called first.

.seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
@*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks: build an auxiliary SF with a single root here and
       one leaf per rank we communicate with; the root's degree is then the incoming count */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
    /* Enumerate the incoming ranks: every process gathers its own rank id to the roots it references */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) {
    /* Outgoing group is simply the already-computed list of root ranks we reference */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1068 
/*@
   PetscSFGetMultiSF - gets the inner SF implemeting gathers and scatters

   Collective

   Input Argument:
.  sf - star forest that may contain roots with 0 or with more than 1 vertex

   Output Arguments:
.  multi - star forest with split roots, such that each root has degree exactly 1

   Level: developer

   Notes:
   The multi-SF is computed lazily on first call, cached in sf->multi and owned by the SF.

   In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
   directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
   edge, it is a candidate for future optimization that might involve its removal.

.seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
@*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    PetscFunctionReturn(0);
  }
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    /* inoffset[] is the prefix sum of root degrees: the range of multi-root slots for each root */
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    /* Each leaf atomically fetches its root's current slot counter and increments it by 1,
       so outoffset[] ends up holding a distinct multi-root slot for every leaf */
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
#if defined(PETSC_USE_DEBUG)                                 /* Check that the expected number of increments occurred */
    for (i=0; i<sf->nroots; i++) {
      if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
    }
#endif
    /* Build the multi-SF graph: same leaves/ranks as sf, but each leaf points at its private slot */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      /* Collect the sending rank of every multi-root slot, sort the slots of each root by that
         rank, and broadcast the permuted slot numbers back to the leaves */
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      /* Replace the multi-SF graph with the rank-ordered slot assignment */
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1160 
1161 /*@C
1162    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1163 
1164    Collective
1165 
1166    Input Arguments:
1167 +  sf - original star forest
1168 .  nselected  - number of selected roots on this process
1169 -  selected   - indices of the selected roots on this process
1170 
1171    Output Arguments:
1172 .  esf - new star forest
1173 
1174    Level: advanced
1175 
1176    Note:
1177    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1178    be done by calling PetscSFGetGraph().
1179 
1180 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1181 @*/
1182 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *esf)
1183 {
1184   PetscInt          i,j,n,nroots,nleaves,esf_nleaves,*new_ilocal,minleaf,maxleaf,maxlocal;
1185   const PetscInt    *ilocal;
1186   signed char       *rootdata,*leafdata,*leafmem;
1187   const PetscSFNode *iremote;
1188   PetscSFNode       *new_iremote;
1189   MPI_Comm          comm;
1190   PetscErrorCode    ierr;
1191 
1192   PetscFunctionBegin;
1193   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1194   PetscSFCheckGraphSet(sf,1);
1195   if (nselected) PetscValidPointer(selected,3);
1196   PetscValidPointer(esf,4);
1197 
1198   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1199 
1200   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1201   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1202   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1203 
1204 #if defined(PETSC_USE_DEBUG)
1205   /* Error out if selected[] has dups or  out of range indices */
1206   {
1207     PetscBool dups;
1208     ierr = PetscCheckDupsInt(nselected,selected,&dups);CHKERRQ(ierr);
1209     if (dups) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"selected[] has dups");
1210     for (i=0; i<nselected; i++)
1211       if (selected[i] < 0 || selected[i] >= nroots) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"selected root indice %D is out of [0,%D)",selected[i],nroots);
1212   }
1213 #endif
1214 
1215   if (sf->ops->CreateEmbeddedSF) {
1216     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,selected,esf);CHKERRQ(ierr);
1217   } else {
1218     /* A generic version of creating embedded sf */
1219     ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
1220     maxlocal = maxleaf - minleaf + 1;
1221     ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafmem);CHKERRQ(ierr);
1222     leafdata = leafmem - minleaf;
1223     /* Tag selected roots and bcast to leaves */
1224     for (i=0; i<nselected; i++) rootdata[selected[i]] = 1;
1225     ierr = PetscSFBcastBegin(sf,MPI_SIGNED_CHAR,rootdata,leafdata);CHKERRQ(ierr);
1226     ierr = PetscSFBcastEnd(sf,MPI_SIGNED_CHAR,rootdata,leafdata);CHKERRQ(ierr);
1227 
1228     /* Build esf with leaves that are still connected */
1229     esf_nleaves = 0;
1230     for (i=0; i<nleaves; i++) {
1231       j = ilocal ? ilocal[i] : i;
1232       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1233          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1234       */
1235       esf_nleaves += (leafdata[j] ? 1 : 0);
1236     }
1237     ierr = PetscMalloc1(esf_nleaves,&new_ilocal);CHKERRQ(ierr);
1238     ierr = PetscMalloc1(esf_nleaves,&new_iremote);CHKERRQ(ierr);
1239     for (i=n=0; i<nleaves; i++) {
1240       j = ilocal ? ilocal[i] : i;
1241       if (leafdata[j]) {
1242         new_ilocal[n]        = j;
1243         new_iremote[n].rank  = iremote[i].rank;
1244         new_iremote[n].index = iremote[i].index;
1245         ++n;
1246       }
1247     }
1248     ierr = PetscSFCreate(comm,esf);CHKERRQ(ierr);
1249     ierr = PetscSFSetFromOptions(*esf);CHKERRQ(ierr);
1250     ierr = PetscSFSetGraph(*esf,nroots,esf_nleaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1251     ierr = PetscFree2(rootdata,leafmem);CHKERRQ(ierr);
1252   }
1253   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1254   PetscFunctionReturn(0);
1255 }
1256 
1257 /*@C
1258   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1259 
1260   Collective
1261 
1262   Input Arguments:
1263 + sf - original star forest
1264 . nselected  - number of selected leaves on this process
1265 - selected   - indices of the selected leaves on this process
1266 
1267   Output Arguments:
1268 .  newsf - new star forest
1269 
1270   Level: advanced
1271 
1272 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1273 @*/
1274 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1275 {
1276   const PetscSFNode *iremote;
1277   PetscSFNode       *new_iremote;
1278   const PetscInt    *ilocal;
1279   PetscInt          i,nroots,*leaves,*new_ilocal;
1280   MPI_Comm          comm;
1281   PetscErrorCode    ierr;
1282 
1283   PetscFunctionBegin;
1284   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1285   PetscSFCheckGraphSet(sf,1);
1286   if (nselected) PetscValidPointer(selected,3);
1287   PetscValidPointer(newsf,4);
1288 
1289   /* Uniq selected[] and put results in leaves[] */
1290   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1291   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1292   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1293   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1294   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1295 
1296   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1297   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1298     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1299   } else {
1300     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1301     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1302     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1303     for (i=0; i<nselected; ++i) {
1304       const PetscInt l     = leaves[i];
1305       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1306       new_iremote[i].rank  = iremote[l].rank;
1307       new_iremote[i].index = iremote[l].index;
1308     }
1309     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1310     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1311   }
1312   ierr = PetscFree(leaves);CHKERRQ(ierr);
1313   PetscFunctionReturn(0);
1314 }
1315 
1316 /*@C
1317    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1318 
1319    Collective on PetscSF
1320 
1321    Input Arguments:
1322 +  sf - star forest on which to communicate
1323 .  unit - data type associated with each node
1324 .  rootdata - buffer to broadcast
1325 -  op - operation to use for reduction
1326 
1327    Output Arguments:
1328 .  leafdata - buffer to be reduced with values from each leaf's respective root
1329 
1330    Level: intermediate
1331 
1332 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd()
1333 @*/
1334 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1335 {
1336   PetscErrorCode ierr;
1337   PetscMemType   rootmtype,leafmtype;
1338 
1339   PetscFunctionBegin;
1340   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1341   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1342   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1343   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1344   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1345   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1346   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 /*@C
1351    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1352 
1353    Collective
1354 
1355    Input Arguments:
1356 +  sf - star forest
1357 .  unit - data type
1358 .  rootdata - buffer to broadcast
1359 -  op - operation to use for reduction
1360 
1361    Output Arguments:
1362 .  leafdata - buffer to be reduced with values from each leaf's respective root
1363 
1364    Level: intermediate
1365 
1366 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1367 @*/
1368 PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1369 {
1370   PetscErrorCode ierr;
1371 
1372   PetscFunctionBegin;
1373   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1374   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1375   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1376   ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootdata,leafdata,op);CHKERRQ(ierr);
1377   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1378   PetscFunctionReturn(0);
1379 }
1380 
1381 /*@C
1382    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1383 
1384    Collective
1385 
1386    Input Arguments:
1387 +  sf - star forest
1388 .  unit - data type
1389 .  leafdata - values to reduce
1390 -  op - reduction operation
1391 
1392    Output Arguments:
1393 .  rootdata - result of reduction of values from all leaves of each root
1394 
1395    Level: intermediate
1396 
1397 .seealso: PetscSFBcastBegin()
1398 @*/
1399 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1400 {
1401   PetscErrorCode ierr;
1402   PetscMemType   rootmtype,leafmtype;
1403 
1404   PetscFunctionBegin;
1405   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1406   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1407   ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1408   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1409   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1410   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1411   ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1412   PetscFunctionReturn(0);
1413 }
1414 
1415 /*@C
1416    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1417 
1418    Collective
1419 
1420    Input Arguments:
1421 +  sf - star forest
1422 .  unit - data type
1423 .  leafdata - values to reduce
1424 -  op - reduction operation
1425 
1426    Output Arguments:
1427 .  rootdata - result of reduction of values from all leaves of each root
1428 
1429    Level: intermediate
1430 
1431 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1432 @*/
1433 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1434 {
1435   PetscErrorCode ierr;
1436 
1437   PetscFunctionBegin;
1438   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1439   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1440   ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1441   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
1442   ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1443   PetscFunctionReturn(0);
1444 }
1445 
1446 /*@C
1447    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1448 
1449    Collective
1450 
1451    Input Arguments:
1452 +  sf - star forest
1453 .  unit - data type
1454 .  leafdata - leaf values to use in reduction
1455 -  op - operation to use for reduction
1456 
1457    Output Arguments:
1458 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1459 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1460 
1461    Level: advanced
1462 
1463    Note:
1464    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1465    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1466    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1467    integers.
1468 
1469 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1470 @*/
1471 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1472 {
1473   PetscErrorCode ierr;
1474   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1475 
1476   PetscFunctionBegin;
1477   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1478   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1479   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1480   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1481   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1482   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1483   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1484   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1485   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1486   PetscFunctionReturn(0);
1487 }
1488 
1489 /*@C
1490    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1491 
1492    Collective
1493 
1494    Input Arguments:
1495 +  sf - star forest
1496 .  unit - data type
1497 .  leafdata - leaf values to use in reduction
1498 -  op - operation to use for reduction
1499 
1500    Output Arguments:
1501 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1502 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1503 
1504    Level: advanced
1505 
1506 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1507 @*/
1508 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1509 {
1510   PetscErrorCode ierr;
1511 
1512   PetscFunctionBegin;
1513   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1514   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1515   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1516   ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootdata,leafdata,leafupdate,op);CHKERRQ(ierr);
1517   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1518   PetscFunctionReturn(0);
1519 }
1520 
/*@C
   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: advanced

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

   *degree is set to NULL here; the actual array only becomes available from the matching
   PetscSFComputeDegreeEnd().

.seealso: PetscSFGatherBegin(), PetscSFComputeDegreeEnd()
@*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal;
    /* sf->degree non-NULL means a Begin is already outstanding (cached results set degreeknown instead) */
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    maxlocal = sf->maxleaf-sf->minleaf+1;
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    /* Every leaf contributes 1 to its root; the sum at each root is its degree.
       degreetmp is indexed from minleaf, hence the -sf->minleaf pointer shift. */
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  *degree = NULL;
  PetscFunctionReturn(0);
}
1560 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: developer

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFComputeDegreeBegin(), PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    /* degreetmp is allocated by PetscSFComputeDegreeBegin(); its absence means Begin was never called */
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    sf->degreeknown = PETSC_TRUE; /* cache: subsequent Begin/End pairs return sf->degree directly */
  }
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1596 
1597 
/*@C
   PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
   Each multi-root is assigned index of the corresponding original root.

   Collective

   Input Arguments:
+  sf - star forest
-  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()

   Output Arguments:
+  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
-  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots

   Level: developer

   Notes:
   The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.

.seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
@*/
PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
{
  PetscSF             msf;
  PetscInt            i, j, k, nroots, nmroots;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
  if (nroots) PetscValidIntPointer(degree,2);
  if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
  PetscValidPointer(multiRootsOrigNumbering,4);
  ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
  ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
  /* Root i with degree[i] > 0 owns degree[i] consecutive multi-roots; record i for each of them.
     k counts the multi-roots emitted so far. */
  for (i=0,j=0,k=0; i<nroots; i++) {
    if (!degree[i]) continue;
    for (j=0; j<degree[i]; j++,k++) {
      (*multiRootsOrigNumbering)[k] = i;
    }
  }
#if defined(PETSC_USE_DEBUG)
  /* Total emitted must equal the multi-SF's root count */
  if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
#endif
  if (nMultiRoots) *nMultiRoots = nmroots;
  PetscFunctionReturn(0);
}
1646 
1647 /*@C
1648    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1649 
1650    Collective
1651 
1652    Input Arguments:
1653 +  sf - star forest
1654 .  unit - data type
1655 -  leafdata - leaf data to gather to roots
1656 
1657    Output Argument:
1658 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1659 
1660    Level: intermediate
1661 
1662 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1663 @*/
1664 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1665 {
1666   PetscErrorCode ierr;
1667   PetscSF        multi = NULL;
1668 
1669   PetscFunctionBegin;
1670   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1671   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1672   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1673   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1674   PetscFunctionReturn(0);
1675 }
1676 
1677 /*@C
1678    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1679 
1680    Collective
1681 
1682    Input Arguments:
1683 +  sf - star forest
1684 .  unit - data type
1685 -  leafdata - leaf data to gather to roots
1686 
1687    Output Argument:
1688 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1689 
1690    Level: intermediate
1691 
1692 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1693 @*/
1694 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1695 {
1696   PetscErrorCode ierr;
1697   PetscSF        multi = NULL;
1698 
1699   PetscFunctionBegin;
1700   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1701   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1702   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1703   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1704   PetscFunctionReturn(0);
1705 }
1706 
1707 /*@C
1708    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1709 
1710    Collective
1711 
1712    Input Arguments:
1713 +  sf - star forest
1714 .  unit - data type
1715 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1716 
1717    Output Argument:
1718 .  leafdata - leaf data to be update with personal data from each respective root
1719 
1720    Level: intermediate
1721 
1722 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1723 @*/
1724 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1725 {
1726   PetscErrorCode ierr;
1727   PetscSF        multi = NULL;
1728 
1729   PetscFunctionBegin;
1730   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1731   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1732   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1733   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1734   PetscFunctionReturn(0);
1735 }
1736 
1737 /*@C
1738    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1739 
1740    Collective
1741 
1742    Input Arguments:
1743 +  sf - star forest
1744 .  unit - data type
1745 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1746 
1747    Output Argument:
1748 .  leafdata - leaf data to be update with personal data from each respective root
1749 
1750    Level: intermediate
1751 
1752 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1753 @*/
1754 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1755 {
1756   PetscErrorCode ierr;
1757   PetscSF        multi = NULL;
1758 
1759   PetscFunctionBegin;
1760   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1761   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1762   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1763   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1764   PetscFunctionReturn(0);
1765 }
1766 
1767 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1768 {
1769 #if defined(PETSC_USE_DEBUG)
1770   PetscInt        i, n, nleaves;
1771   const PetscInt *ilocal = NULL;
1772   PetscHSetI      seen;
1773   PetscErrorCode  ierr;
1774 
1775   PetscFunctionBegin;
1776   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1777   ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1778   for (i = 0; i < nleaves; i++) {
1779     const PetscInt leaf = ilocal ? ilocal[i] : i;
1780     ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1781   }
1782   ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1783   if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1784   ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1785   PetscFunctionReturn(0);
1786 #else
1787   PetscFunctionBegin;
1788   PetscFunctionReturn(0);
1789 #endif
1790 }
1791 
/*@
  PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots.

  sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
  nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
  Bcast on sfA, then a Bcast on sfB, on connected nodes.

.seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  if (localPointsA) {
    /* Re-index sfA's remote roots by sfA's leaf index (== sfB's root index); slots with no
       incoming sfA edge are marked with rank -1 so they can be filtered out below */
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue; /* leaf of sfA outside sfB's root space: no path */
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA;
  }
  /* Push sfA's (rank,index) root info down through sfB; leafdataB[l] then holds, for each sfB
     leaf l, the sfA root it is ultimately connected to (or rank -1 if unconnected) */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* denseB: sfB's leaves are contiguous AND every one is connected; then leafdataB can be used
     directly as the composite's remote points with no ilocal array */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    localPointsBA  = NULL;
    remotePointsBA = leafdataB; /* ownership passes to sfBA via PETSC_OWN_POINTER below */
  } else {
    /* Compact: keep only connected leaves, recording their local indices explicitly */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1880 
/*@
  PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF.

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
  second SF must have a degree of 1, i.e., no roots have more than one leaf connected.

  sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
  roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
  a Reduce on sfB, on connected roots.

.seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
@*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
  MPI_Op            op;
#if defined(PETSC_USE_64BIT_INDICES)
  PetscBool         iswin;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);

  /* TODO: Check roots of sfB have degree of 1 */
  /* Once we implement it, we can replace the MPI_MAXLOC
     with MPIU_REPLACE. In that case, MPI_MAXLOC and MPIU_REPLACE have the same effect.
     We use MPI_MAXLOC only to have a deterministic output from this routine if
     the root condition is not met.
   */
  op = MPI_MAXLOC;
#if defined(PETSC_USE_64BIT_INDICES)
  /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
  ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
  if (iswin) op = MPIU_REPLACE;
#endif

  /* Lay out sfA's remote root info over sfB's leaf range, indexed by leaf number; slots with no
     matching sfA leaf keep rank -1 and are filtered out after the reduction */
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue; /* sfA leaf outside sfB's leaf range: no path */
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* Contiguous sfA leaves: leaf index is i itself */
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Reduce over sfB pushes sfA's root info from sfB's leaves up to sfB's roots */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place: keep only connected roots (rank != -1), recording their indices */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1983 
/*
  PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF

  Input Parameters:
. sf - The global PetscSF

  Output Parameters:
. out - The local PetscSF

  Notes:
  The caller owns the returned SF. If the SF type provides its own CreateLocalSF, that is used;
  otherwise the local edges are extracted by scanning for leaves whose remote rank equals this
  process's rank in sf's communicator.
 */
PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
{
  MPI_Comm           comm;
  PetscMPIInt        myrank;
  const PetscInt     *ilocal;
  const PetscSFNode  *iremote;
  PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
  PetscSFNode        *liremote;
  PetscSF            lsf;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (sf->ops->CreateLocalSF) {
    ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
  } else {
    /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
    ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(comm,&myrank);CHKERRQ(ierr);

    /* Find out local edges and build a local SF */
    ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
    /* First pass: count the intra-process edges so buffers can be sized exactly */
    for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
    ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
    ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);

    /* Second pass: copy the intra-process edges, remapping the remote rank to 0 (self) */
    for (i=j=0; i<nleaves; i++) {
      if (iremote[i].rank == (PetscInt)myrank) {
        lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
        liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
        liremote[j].index = iremote[i].index;
        j++;
      }
    }
    ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
    *out = lsf;
  }
  PetscFunctionReturn(0);
}
2034 
/* Similar to PetscSFBcast, but only Bcast to leaves on rank 0.

   Blocking (Begin and End are fused); requires the SF type to implement BcastToZero.
   Logged under the PETSCSF_BcastAndOpBegin event since there is no dedicated event for it. */
PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
{
  PetscErrorCode     ierr;
  PetscMemType       rootmtype,leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  /* Detect host vs device buffers so the implementation can pick the right pack/unpack path */
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  if (sf->ops->BcastToZero) {
    ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
  ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2053 
2054