xref: /petsc/src/vec/is/sf/interface/sf.c (revision b85e67b730cce8d297c053b3c61d3977f008dd9a)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
#if defined(PETSC_USE_DEBUG)
/* Debug-only guard: raise PETSC_ERR_ARG_WRONGSTATE if sf's graph has not been set yet.
   Expands to a no-op in optimized builds. arg is the 1-based argument index for the error text. */
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif
13 
14 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",0};
15 
16 /*@
17    PetscSFCreate - create a star forest communication context
18 
19    Collective
20 
21    Input Arguments:
22 .  comm - communicator on which the star forest will operate
23 
24    Output Arguments:
25 .  sf - new star forest context
26 
27    Options Database Keys:
28 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
29 .  -sf_type window    -Use MPI-3 one-sided window for communication
30 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
31 
32    Level: intermediate
33 
34    Notes:
35    When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
36    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
37    SFs are optimized and they have better performance than general SFs.
38 
39 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
40 @*/
41 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
42 {
43   PetscErrorCode ierr;
44   PetscSF        b;
45 
46   PetscFunctionBegin;
47   PetscValidPointer(sf,2);
48   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
49 
50   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
51 
52   b->nroots    = -1;
53   b->nleaves   = -1;
54   b->minleaf   = PETSC_MAX_INT;
55   b->maxleaf   = PETSC_MIN_INT;
56   b->nranks    = -1;
57   b->rankorder = PETSC_TRUE;
58   b->ingroup   = MPI_GROUP_NULL;
59   b->outgroup  = MPI_GROUP_NULL;
60   b->graphset  = PETSC_FALSE;
61 
62   *sf = b;
63   PetscFunctionReturn(0);
64 }
65 
66 /*@
67    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
68 
69    Collective
70 
71    Input Arguments:
72 .  sf - star forest
73 
74    Level: advanced
75 
76 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
77 @*/
78 PetscErrorCode PetscSFReset(PetscSF sf)
79 {
80   PetscErrorCode ierr;
81 
82   PetscFunctionBegin;
83   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
84   if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
85   sf->nroots   = -1;
86   sf->nleaves  = -1;
87   sf->minleaf  = PETSC_MAX_INT;
88   sf->maxleaf  = PETSC_MIN_INT;
89   sf->mine     = NULL;
90   sf->remote   = NULL;
91   sf->graphset = PETSC_FALSE;
92   ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
93   ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
94   sf->nranks = -1;
95   ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
96 #if defined(PETSC_HAVE_CUDA)
97   {
98   PetscInt  i;
99   for (i=0; i<2; i++) {if (sf->rmine_d[i]) {cudaError_t err = cudaFree(sf->rmine_d[i]);CHKERRCUDA(err);sf->rmine_d[i]=NULL;}}
100   }
101 #endif
102   sf->degreeknown = PETSC_FALSE;
103   ierr = PetscFree(sf->degree);CHKERRQ(ierr);
104   if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRQ(ierr);}
105   if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRQ(ierr);}
106   ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
107   ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
108   sf->setupcalled = PETSC_FALSE;
109   PetscFunctionReturn(0);
110 }
111 
112 /*@C
113    PetscSFSetType - Set the PetscSF communication implementation
114 
115    Collective on PetscSF
116 
117    Input Parameters:
118 +  sf - the PetscSF context
119 -  type - a known method
120 
121    Options Database Key:
122 .  -sf_type <type> - Sets the method; use -help for a list
123    of available methods (for instance, window, basic, neighbor)
124 
125    Notes:
126    See "include/petscsf.h" for available methods (for instance)
127 +    PETSCSFWINDOW - MPI-2/3 one-sided
128 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
129 
130   Level: intermediate
131 
132 .seealso: PetscSFType, PetscSFCreate()
133 @*/
134 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
135 {
136   PetscErrorCode ierr,(*r)(PetscSF);
137   PetscBool      match;
138 
139   PetscFunctionBegin;
140   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
141   PetscValidCharPointer(type,2);
142 
143   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
144   if (match) PetscFunctionReturn(0);
145 
146   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
147   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
148   /* Destroy the previous PetscSF implementation context */
149   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
150   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
151   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
152   ierr = (*r)(sf);CHKERRQ(ierr);
153   PetscFunctionReturn(0);
154 }
155 
156 /*@C
157   PetscSFGetType - Get the PetscSF communication implementation
158 
159   Not Collective
160 
161   Input Parameter:
162 . sf  - the PetscSF context
163 
164   Output Parameter:
165 . type - the PetscSF type name
166 
167   Level: intermediate
168 
169 .seealso: PetscSFSetType(), PetscSFCreate()
170 @*/
171 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
172 {
173   PetscFunctionBegin;
174   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
175   PetscValidPointer(type,2);
176   *type = ((PetscObject)sf)->type_name;
177   PetscFunctionReturn(0);
178 }
179 
180 /*@
181    PetscSFDestroy - destroy star forest
182 
183    Collective
184 
185    Input Arguments:
186 .  sf - address of star forest
187 
188    Level: intermediate
189 
190 .seealso: PetscSFCreate(), PetscSFReset()
191 @*/
192 PetscErrorCode PetscSFDestroy(PetscSF *sf)
193 {
194   PetscErrorCode ierr;
195 
196   PetscFunctionBegin;
197   if (!*sf) PetscFunctionReturn(0);
198   PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
199   if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
200   ierr = PetscSFReset(*sf);CHKERRQ(ierr);
201   if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
202   ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
203   PetscFunctionReturn(0);
204 }
205 
206 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
207 {
208 #if defined(PETSC_USE_DEBUG)
209   PetscInt           i, nleaves;
210   PetscMPIInt        size;
211   const PetscInt    *ilocal;
212   const PetscSFNode *iremote;
213   PetscErrorCode     ierr;
214 
215   PetscFunctionBegin;
216   if (!sf->graphset) PetscFunctionReturn(0);
217   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
218   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
219   for (i = 0; i < nleaves; i++) {
220     const PetscInt rank = iremote[i].rank;
221     const PetscInt remote = iremote[i].index;
222     const PetscInt leaf = ilocal ? ilocal[i] : i;
223     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
224     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
225     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
226   }
227   PetscFunctionReturn(0);
228 #else
229   PetscFunctionBegin;
230   PetscFunctionReturn(0);
231 #endif
232 }
233 
234 /*@
235    PetscSFSetUp - set up communication structures
236 
237    Collective
238 
239    Input Arguments:
240 .  sf - star forest communication object
241 
242    Level: beginner
243 
244 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
245 @*/
246 PetscErrorCode PetscSFSetUp(PetscSF sf)
247 {
248   PetscErrorCode ierr;
249 
250   PetscFunctionBegin;
251   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
252   PetscSFCheckGraphSet(sf,1);
253   if (sf->setupcalled) PetscFunctionReturn(0);
254   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
255   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);}
256   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
257   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
258   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
259   sf->setupcalled = PETSC_TRUE;
260   PetscFunctionReturn(0);
261 }
262 
263 /*@
264    PetscSFSetFromOptions - set PetscSF options using the options database
265 
266    Logically Collective
267 
268    Input Arguments:
269 .  sf - star forest
270 
271    Options Database Keys:
272 +  -sf_type               - implementation type, see PetscSFSetType()
273 .  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
274 .  -sf_use_default_stream - Assume callers of SF computed the input root/leafdata with the default cuda stream. SF will also
275                             use the default stream to process data. Therefore, no stream synchronization is needed between SF and its caller.
276 -  -sf_use_stream_aware_mpi  - Assumes the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI.
277 
278    Level: intermediate
279 @*/
280 PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
281 {
282   PetscSFType    deft;
283   char           type[256];
284   PetscErrorCode ierr;
285   PetscBool      flg;
286 
287   PetscFunctionBegin;
288   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
289   ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
290   deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
291   ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
292   ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
293   ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
294   sf->use_default_stream = PETSC_FALSE;
295   ierr = PetscOptionsBool("-sf_use_default_stream","Assume SF's input and output root/leafdata is computed on the default stream","PetscSFSetFromOptions",sf->use_default_stream,&sf->use_default_stream,NULL);CHKERRQ(ierr);
296   sf->use_stream_aware_mpi = PETSC_FALSE;
297   ierr = PetscOptionsBool("-sf_use_stream_aware_mpi","Assume the underlying MPI is cuda-stream aware","PetscSFSetFromOptions",sf->use_stream_aware_mpi,&sf->use_stream_aware_mpi,NULL);CHKERRQ(ierr);
298   if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
299   ierr = PetscOptionsEnd();CHKERRQ(ierr);
300   PetscFunctionReturn(0);
301 }
302 
303 /*@
304    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
305 
306    Logically Collective
307 
308    Input Arguments:
309 +  sf - star forest
310 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
311 
312    Level: advanced
313 
314 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
315 @*/
316 PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
317 {
318   PetscFunctionBegin;
319   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
320   PetscValidLogicalCollectiveBool(sf,flg,2);
321   if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
322   sf->rankorder = flg;
323   PetscFunctionReturn(0);
324 }
325 
326 /*@
327    PetscSFSetGraph - Set a parallel star forest
328 
329    Collective
330 
331    Input Arguments:
332 +  sf - star forest
333 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
334 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
335 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
336 during setup in debug mode)
337 .  localmode - copy mode for ilocal
338 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
339 during setup in debug mode)
340 -  remotemode - copy mode for iremote
341 
342    Level: intermediate
343 
344    Notes:
345     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
346 
347    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
348    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
349    needed
350 
351    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
352    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
353 
354 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
355 @*/
356 PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
357 {
358   PetscErrorCode ierr;
359 
360   PetscFunctionBegin;
361   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
362   if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
363   if (nleaves > 0) PetscValidPointer(iremote,6);
364   if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
365   if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);
366 
367   if (sf->nroots >= 0) { /* Reset only if graph already set */
368     ierr = PetscSFReset(sf);CHKERRQ(ierr);
369   }
370 
371   ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
372 
373   sf->nroots  = nroots;
374   sf->nleaves = nleaves;
375 
376   if (nleaves && ilocal) {
377     PetscInt i;
378     PetscInt minleaf = PETSC_MAX_INT;
379     PetscInt maxleaf = PETSC_MIN_INT;
380     int      contiguous = 1;
381     for (i=0; i<nleaves; i++) {
382       minleaf = PetscMin(minleaf,ilocal[i]);
383       maxleaf = PetscMax(maxleaf,ilocal[i]);
384       contiguous &= (ilocal[i] == i);
385     }
386     sf->minleaf = minleaf;
387     sf->maxleaf = maxleaf;
388     if (contiguous) {
389       if (localmode == PETSC_OWN_POINTER) {
390         ierr = PetscFree(ilocal);CHKERRQ(ierr);
391       }
392       ilocal = NULL;
393     }
394   } else {
395     sf->minleaf = 0;
396     sf->maxleaf = nleaves - 1;
397   }
398 
399   if (ilocal) {
400     switch (localmode) {
401     case PETSC_COPY_VALUES:
402       ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
403       ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
404       sf->mine = sf->mine_alloc;
405       break;
406     case PETSC_OWN_POINTER:
407       sf->mine_alloc = (PetscInt*)ilocal;
408       sf->mine       = sf->mine_alloc;
409       break;
410     case PETSC_USE_POINTER:
411       sf->mine_alloc = NULL;
412       sf->mine       = (PetscInt*)ilocal;
413       break;
414     default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
415     }
416   }
417 
418   switch (remotemode) {
419   case PETSC_COPY_VALUES:
420     ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
421     ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
422     sf->remote = sf->remote_alloc;
423     break;
424   case PETSC_OWN_POINTER:
425     sf->remote_alloc = (PetscSFNode*)iremote;
426     sf->remote       = sf->remote_alloc;
427     break;
428   case PETSC_USE_POINTER:
429     sf->remote_alloc = NULL;
430     sf->remote       = (PetscSFNode*)iremote;
431     break;
432   default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
433   }
434 
435   ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
436   sf->graphset = PETSC_TRUE;
437   PetscFunctionReturn(0);
438 }
439 
440 /*@
441   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
442 
443   Collective
444 
445   Input Parameters:
446 + sf      - The PetscSF
447 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
448 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
449 
450   Notes:
451   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
452   n and N are local and global sizes of x respectively.
453 
454   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
455   sequential vectors y on all ranks.
456 
457   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
458   sequential vector y on rank 0.
459 
460   In above cases, entries of x are roots and entries of y are leaves.
461 
462   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
463   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
464   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
465   not mean one can not send multiple items. One just needs to create a new MPI datatype for the mulptiple data
466   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
467 
468   In this case, roots and leaves are symmetric.
469 
470   Level: intermediate
471  @*/
472 PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
473 {
474   MPI_Comm       comm;
475   PetscInt       n,N,res[2];
476   PetscMPIInt    rank,size;
477   PetscSFType    type;
478   PetscErrorCode ierr;
479 
480   PetscFunctionBegin;
481   ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
482   if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
483   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
484   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
485 
486   if (pattern == PETSCSF_PATTERN_ALLTOALL) {
487     type = PETSCSFALLTOALL;
488     ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
489     ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
490     ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
491     ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
492   } else {
493     ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
494     ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
495     res[0] = n;
496     res[1] = -n;
497     /* Check if n are same over all ranks so that we can optimize it */
498     ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
499     if (res[0] == -res[1]) { /* same n */
500       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
501     } else {
502       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
503     }
504     ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
505   }
506   ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);
507 
508   sf->pattern = pattern;
509   sf->mine    = NULL; /* Contiguous */
510 
511   /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
512      Also set other easy stuff.
513    */
514   if (pattern == PETSCSF_PATTERN_ALLGATHER) {
515     sf->nleaves      = N;
516     sf->nroots       = n;
517     sf->nranks       = size;
518     sf->minleaf      = 0;
519     sf->maxleaf      = N - 1;
520   } else if (pattern == PETSCSF_PATTERN_GATHER) {
521     sf->nleaves      = rank ? 0 : N;
522     sf->nroots       = n;
523     sf->nranks       = rank ? 0 : size;
524     sf->minleaf      = 0;
525     sf->maxleaf      = rank ? -1 : N - 1;
526   } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
527     sf->nleaves      = size;
528     sf->nroots       = size;
529     sf->nranks       = size;
530     sf->minleaf      = 0;
531     sf->maxleaf      = size - 1;
532   }
533   sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
534   sf->graphset = PETSC_TRUE;
535   PetscFunctionReturn(0);
536 }
537 
538 /*@
539    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
540 
541    Collective
542 
543    Input Arguments:
544 
545 .  sf - star forest to invert
546 
547    Output Arguments:
548 .  isf - inverse of sf
549    Level: advanced
550 
551    Notes:
552    All roots must have degree 1.
553 
554    The local space may be a permutation, but cannot be sparse.
555 
556 .seealso: PetscSFSetGraph()
557 @*/
558 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
559 {
560   PetscErrorCode ierr;
561   PetscMPIInt    rank;
562   PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
563   const PetscInt *ilocal;
564   PetscSFNode    *roots,*leaves;
565 
566   PetscFunctionBegin;
567   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
568   PetscSFCheckGraphSet(sf,1);
569   PetscValidPointer(isf,2);
570 
571   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
572   maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
573 
574   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
575   ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
576   for (i=0; i<maxlocal; i++) {
577     leaves[i].rank  = rank;
578     leaves[i].index = i;
579   }
580   for (i=0; i <nroots; i++) {
581     roots[i].rank  = -1;
582     roots[i].index = -1;
583   }
584   ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
585   ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
586 
587   /* Check whether our leaves are sparse */
588   for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
589   if (count == nroots) newilocal = NULL;
590   else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
591     ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
592     for (i=0,count=0; i<nroots; i++) {
593       if (roots[i].rank >= 0) {
594         newilocal[count]   = i;
595         roots[count].rank  = roots[i].rank;
596         roots[count].index = roots[i].index;
597         count++;
598       }
599     }
600   }
601 
602   ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
603   ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
604   ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
605   PetscFunctionReturn(0);
606 }
607 
608 /*@
609    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
610 
611    Collective
612 
613    Input Arguments:
614 +  sf - communication object to duplicate
615 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
616 
617    Output Arguments:
618 .  newsf - new communication object
619 
620    Level: beginner
621 
622 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
623 @*/
624 PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
625 {
626   PetscSFType    type;
627   PetscErrorCode ierr;
628 
629   PetscFunctionBegin;
630   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
631   PetscValidLogicalCollectiveEnum(sf,opt,2);
632   PetscValidPointer(newsf,3);
633   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
634   ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
635   if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
636   if (opt == PETSCSF_DUPLICATE_GRAPH) {
637     PetscSFCheckGraphSet(sf,1);
638     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
639       PetscInt          nroots,nleaves;
640       const PetscInt    *ilocal;
641       const PetscSFNode *iremote;
642       ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
643       ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
644     } else {
645       ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
646     }
647   }
648   if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
649   PetscFunctionReturn(0);
650 }
651 
652 /*@C
653    PetscSFGetGraph - Get the graph specifying a parallel star forest
654 
655    Not Collective
656 
657    Input Arguments:
658 .  sf - star forest
659 
660    Output Arguments:
661 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
662 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
663 .  ilocal - locations of leaves in leafdata buffers
664 -  iremote - remote locations of root vertices for each leaf on the current process
665 
666    Notes:
667    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
668 
669    When called from Fortran, the returned iremote array is a copy and must be deallocated after use. Consequently, if you
670    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
671 
672    Level: intermediate
673 
674 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
675 @*/
676 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
677 {
678   PetscErrorCode ierr;
679 
680   PetscFunctionBegin;
681   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
682   if (sf->ops->GetGraph) {
683     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
684   } else {
685     if (nroots) *nroots = sf->nroots;
686     if (nleaves) *nleaves = sf->nleaves;
687     if (ilocal) *ilocal = sf->mine;
688     if (iremote) *iremote = sf->remote;
689   }
690   PetscFunctionReturn(0);
691 }
692 
693 /*@
694    PetscSFGetLeafRange - Get the active leaf ranges
695 
696    Not Collective
697 
698    Input Arguments:
699 .  sf - star forest
700 
701    Output Arguments:
702 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
703 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
704 
705    Level: developer
706 
707 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
708 @*/
709 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
710 {
711 
712   PetscFunctionBegin;
713   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
714   PetscSFCheckGraphSet(sf,1);
715   if (minleaf) *minleaf = sf->minleaf;
716   if (maxleaf) *maxleaf = sf->maxleaf;
717   PetscFunctionReturn(0);
718 }
719 
720 /*@C
721    PetscSFViewFromOptions - View from Options
722 
723    Collective on PetscSF
724 
725    Input Parameters:
726 +  A - the star forest
727 .  obj - Optional object
728 -  name - command line option
729 
730    Level: intermediate
731 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
732 @*/
733 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
734 {
735   PetscErrorCode ierr;
736 
737   PetscFunctionBegin;
738   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
739   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
740   PetscFunctionReturn(0);
741 }
742 
743 /*@C
744    PetscSFView - view a star forest
745 
746    Collective
747 
748    Input Arguments:
749 +  sf - star forest
750 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
751 
752    Level: beginner
753 
754 .seealso: PetscSFCreate(), PetscSFSetGraph()
755 @*/
756 PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
757 {
758   PetscErrorCode    ierr;
759   PetscBool         iascii;
760   PetscViewerFormat format;
761 
762   PetscFunctionBegin;
763   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
764   if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
765   PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
766   PetscCheckSameComm(sf,1,viewer,2);
767   if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
768   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
769   if (iascii) {
770     PetscMPIInt rank;
771     PetscInt    ii,i,j;
772 
773     ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
774     ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
775     if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
776     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
777       if (!sf->graphset) {
778         ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
779         ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
780         PetscFunctionReturn(0);
781       }
782       ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
783       ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
784       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
785       for (i=0; i<sf->nleaves; i++) {
786         ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
787       }
788       ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
789       ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
790       if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
791         PetscMPIInt *tmpranks,*perm;
792         ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
793         ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
794         for (i=0; i<sf->nranks; i++) perm[i] = i;
795         ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
796         ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
797         for (ii=0; ii<sf->nranks; ii++) {
798           i = perm[ii];
799           ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
800           for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
801             ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
802           }
803         }
804         ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
805       }
806       ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
807       ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
808     }
809     ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
810   }
811   PetscFunctionReturn(0);
812 }
813 
814 /*@C
815    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
816 
817    Not Collective
818 
819    Input Arguments:
820 .  sf - star forest
821 
822    Output Arguments:
823 +  nranks - number of ranks referenced by local part
824 .  ranks - array of ranks
825 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
826 .  rmine - concatenated array holding local indices referencing each remote rank
827 -  rremote - concatenated array holding remote indices referenced for each remote rank
828 
829    Level: developer
830 
831 .seealso: PetscSFGetLeafRanks()
832 @*/
833 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
834 {
835   PetscErrorCode ierr;
836 
837   PetscFunctionBegin;
838   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
839   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
840   if (sf->ops->GetRootRanks) {
841     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
842   } else {
843     /* The generic implementation */
844     if (nranks)  *nranks  = sf->nranks;
845     if (ranks)   *ranks   = sf->ranks;
846     if (roffset) *roffset = sf->roffset;
847     if (rmine)   *rmine   = sf->rmine;
848     if (rremote) *rremote = sf->rremote;
849   }
850   PetscFunctionReturn(0);
851 }
852 
853 /*@C
854    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
855 
856    Not Collective
857 
858    Input Arguments:
859 .  sf - star forest
860 
861    Output Arguments:
862 +  niranks - number of leaf ranks referencing roots on this process
863 .  iranks - array of ranks
864 .  ioffset - offset in irootloc for each rank (length niranks+1)
865 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
866 
867    Level: developer
868 
869 .seealso: PetscSFGetRootRanks()
870 @*/
871 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
872 {
873   PetscErrorCode ierr;
874 
875   PetscFunctionBegin;
876   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
877   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
878   if (sf->ops->GetLeafRanks) {
879     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
880   } else {
881     PetscSFType type;
882     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
883     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
884   }
885   PetscFunctionReturn(0);
886 }
887 
888 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
889   PetscInt i;
890   for (i=0; i<n; i++) {
891     if (needle == list[i]) return PETSC_TRUE;
892   }
893   return PETSC_FALSE;
894 }
895 
896 /*@C
897    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.
898 
899    Collective
900 
901    Input Arguments:
902 +  sf - PetscSF to set up; PetscSFSetGraph() must have been called
903 -  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)
904 
905    Level: developer
906 
907 .seealso: PetscSFGetRootRanks()
908 @*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;       /* maps (rank+1) -> number of my leaves referencing that rank */
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks; /* scratch copies: per-rank leaf counts and the ranks themselves */
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
  /* Count how many of my leaves reference each remote rank */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank (keys are shifted by one here and shifted back below) */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    /* Translate dgroup's members into rank numbers of sf's communicator (groupranks[]) */
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRQ(ierr);}
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished.
     Two cursors close in on each other, swapping misplaced entries, so the pass is O(nranks)
     swaps plus the InList() membership tests. */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i; ) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for ( ; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort the two partitions independently so each is in ascending rank order */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Build CSR-style offsets: leaves targeting sf->ranks[i] occupy [roffset[i],roffset[i+1]) in rmine/rremote */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;   /* reused below as a per-rank fill cursor */
  }
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks; try the distinguished prefix first */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
999 
1000 /*@C
1001    PetscSFGetGroups - gets incoming and outgoing process groups
1002 
1003    Collective
1004 
1005    Input Argument:
1006 .  sf - star forest
1007 
1008    Output Arguments:
1009 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1010 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1011 
1012    Level: developer
1013 
1014 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
1015 @*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  /* Both groups are computed lazily and cached on the SF; later calls return the cache */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;      /* auxiliary SF: one root per process, one leaf per connected rank */

    /* Compute the number of incoming ranks: the degree of my single root in bgcount
       is the number of processes whose leaves reference my roots */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
    /* Enumerate the incoming ranks: each process gathers its own rank onto the roots it references */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  /* The outgoing group is simply the ranks my leaves reference, already cached in sf->ranks */
  if (sf->outgroup == MPI_GROUP_NULL) {
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1062 
1063 /*@
   PetscSFGetMultiSF - gets the inner SF implementing gathers and scatters
1065 
1066    Collective
1067 
1068    Input Argument:
1069 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1070 
1071    Output Arguments:
1072 .  multi - star forest with split roots, such that each root has degree exactly 1
1073 
1074    Level: developer
1075 
1076    Notes:
1077 
1078    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1079    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1080    edge, it is a candidate for future optimization that might involve its removal.
1081 
1082 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1083 @*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    PetscFunctionReturn(0);
  }
  /* Build and cache the multi-SF on first use: each root of sf is split into
     indegree[root] multi-roots so that every multi-root has exactly one leaf */
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    /* inoffset[] is the prefix sum of indegree[]: the first multi-root slot of each root */
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    /* Fetch-and-add of ones: each leaf receives in outoffset[] a distinct slot among its root's slots */
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
#if defined(PETSC_USE_DEBUG)                                 /* Check that the expected number of increments occurred */
    for (i=0; i<sf->nroots; i++) {
      if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
    }
#endif
    /* Each leaf now points at its private multi-root slot on the same remote rank */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      /* Each leaf reports its owning rank into its multi-root slot */
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Tell each leaf its slot's new rank-sorted position and rebuild the multi graph */
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1154 
1155 /*@C
1156    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1157 
1158    Collective
1159 
1160    Input Arguments:
1161 +  sf - original star forest
1162 .  nselected  - number of selected roots on this process
1163 -  selected   - indices of the selected roots on this process
1164 
1165    Output Arguments:
1166 .  esf - new star forest
1167 
1168    Level: advanced
1169 
1170    Note:
1171    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1172    be done by calling PetscSFGetGraph().
1173 
1174 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1175 @*/
PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *esf)
{
  PetscInt          i,j,n,nroots,nleaves,esf_nleaves,*new_ilocal,minleaf,maxleaf,maxlocal;
  const PetscInt    *ilocal;
  signed char       *rootdata,*leafdata,*leafmem;
  const PetscSFNode *iremote;
  PetscSFNode       *new_iremote;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  if (nselected) PetscValidPointer(selected,3);
  PetscValidPointer(esf,4);

  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);

  ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);

#if defined(PETSC_USE_DEBUG)
  /* Error out if selected[] has dups or out-of-range indices */
  {
    PetscBool dups;
    ierr = PetscCheckDupsInt(nselected,selected,&dups);CHKERRQ(ierr);
    if (dups) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"selected[] has dups");
    for (i=0; i<nselected; i++)
      if (selected[i] < 0 || selected[i] >= nroots) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"selected root indice %D is out of [0,%D)",selected[i],nroots);
  }
#endif

  if (sf->ops->CreateEmbeddedSF) {
    ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,selected,esf);CHKERRQ(ierr);
  } else {
    /* A generic version of creating embedded sf */
    /* leafdata is leafmem shifted by minleaf so it can be indexed directly by leaf index */
    ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
    maxlocal = maxleaf - minleaf + 1;
    ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafmem);CHKERRQ(ierr);
    leafdata = leafmem - minleaf;
    /* Tag selected roots and bcast to leaves */
    for (i=0; i<nselected; i++) rootdata[selected[i]] = 1;
    ierr = PetscSFBcastBegin(sf,MPI_SIGNED_CHAR,rootdata,leafdata);CHKERRQ(ierr);
    ierr = PetscSFBcastEnd(sf,MPI_SIGNED_CHAR,rootdata,leafdata);CHKERRQ(ierr);

    /* Build esf with leaves that are still connected */
    esf_nleaves = 0;
    for (i=0; i<nleaves; i++) {
      j = ilocal ? ilocal[i] : i;
      /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
         with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
      */
      esf_nleaves += (leafdata[j] ? 1 : 0);
    }
    /* Second pass: copy the surviving edges, keeping original local indices and remote targets */
    ierr = PetscMalloc1(esf_nleaves,&new_ilocal);CHKERRQ(ierr);
    ierr = PetscMalloc1(esf_nleaves,&new_iremote);CHKERRQ(ierr);
    for (i=n=0; i<nleaves; i++) {
      j = ilocal ? ilocal[i] : i;
      if (leafdata[j]) {
        new_ilocal[n]        = j;
        new_iremote[n].rank  = iremote[i].rank;
        new_iremote[n].index = iremote[i].index;
        ++n;
      }
    }
    ierr = PetscSFCreate(comm,esf);CHKERRQ(ierr);
    ierr = PetscSFSetFromOptions(*esf);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(*esf,nroots,esf_nleaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscFree2(rootdata,leafmem);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1250 
1251 /*@C
1252   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1253 
1254   Collective
1255 
1256   Input Arguments:
1257 + sf - original star forest
1258 . nselected  - number of selected leaves on this process
1259 - selected   - indices of the selected leaves on this process
1260 
1261   Output Arguments:
1262 .  newsf - new star forest
1263 
1264   Level: advanced
1265 
1266 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1267 @*/
1268 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1269 {
1270   const PetscSFNode *iremote;
1271   PetscSFNode       *new_iremote;
1272   const PetscInt    *ilocal;
1273   PetscInt          i,nroots,*leaves,*new_ilocal;
1274   MPI_Comm          comm;
1275   PetscErrorCode    ierr;
1276 
1277   PetscFunctionBegin;
1278   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1279   PetscSFCheckGraphSet(sf,1);
1280   if (nselected) PetscValidPointer(selected,3);
1281   PetscValidPointer(newsf,4);
1282 
1283   /* Uniq selected[] and put results in leaves[] */
1284   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1285   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1286   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1287   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1288   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1289 
1290   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1291   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1292     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1293   } else {
1294     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1295     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1296     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1297     for (i=0; i<nselected; ++i) {
1298       const PetscInt l     = leaves[i];
1299       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1300       new_iremote[i].rank  = iremote[l].rank;
1301       new_iremote[i].index = iremote[l].index;
1302     }
1303     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1304     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1305   }
1306   ierr = PetscFree(leaves);CHKERRQ(ierr);
1307   PetscFunctionReturn(0);
1308 }
1309 
1310 /*@C
1311    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1312 
1313    Collective on PetscSF
1314 
1315    Input Arguments:
1316 +  sf - star forest on which to communicate
1317 .  unit - data type associated with each node
1318 .  rootdata - buffer to broadcast
1319 -  op - operation to use for reduction
1320 
1321    Output Arguments:
1322 .  leafdata - buffer to be reduced with values from each leaf's respective root
1323 
1324    Level: intermediate
1325 
1326 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd()
1327 @*/
1328 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1329 {
1330   PetscErrorCode ierr;
1331   PetscMemType   rootmtype,leafmtype;
1332 
1333   PetscFunctionBegin;
1334   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1335   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1336   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1337   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1338   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1339   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1340   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1341   PetscFunctionReturn(0);
1342 }
1343 
1344 /*@C
1345    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1346 
1347    Collective
1348 
1349    Input Arguments:
1350 +  sf - star forest
1351 .  unit - data type
1352 .  rootdata - buffer to broadcast
1353 -  op - operation to use for reduction
1354 
1355    Output Arguments:
1356 .  leafdata - buffer to be reduced with values from each leaf's respective root
1357 
1358    Level: intermediate
1359 
1360 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1361 @*/
PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Thin logging wrapper: delegate completion of the bcast-and-op to the implementation */
  ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootdata,leafdata,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1374 
1375 /*@C
1376    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1377 
1378    Collective
1379 
1380    Input Arguments:
1381 +  sf - star forest
1382 .  unit - data type
1383 .  leafdata - values to reduce
1384 -  op - reduction operation
1385 
1386    Output Arguments:
1387 .  rootdata - result of reduction of values from all leaves of each root
1388 
1389    Level: intermediate
1390 
1391 .seealso: PetscSFBcastBegin()
1392 @*/
1393 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1394 {
1395   PetscErrorCode ierr;
1396   PetscMemType   rootmtype,leafmtype;
1397 
1398   PetscFunctionBegin;
1399   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1400   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1401   ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1402   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1403   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1404   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1405   ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1406   PetscFunctionReturn(0);
1407 }
1408 
1409 /*@C
1410    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1411 
1412    Collective
1413 
1414    Input Arguments:
1415 +  sf - star forest
1416 .  unit - data type
1417 .  leafdata - values to reduce
1418 -  op - reduction operation
1419 
1420    Output Arguments:
1421 .  rootdata - result of reduction of values from all leaves of each root
1422 
1423    Level: intermediate
1424 
1425 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1426 @*/
PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Thin logging wrapper: delegate completion of the reduction to the implementation */
  ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
  ierr = (*sf->ops->ReduceEnd)(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1439 
1440 /*@C
1441    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1442 
1443    Collective
1444 
1445    Input Arguments:
1446 +  sf - star forest
1447 .  unit - data type
1448 .  leafdata - leaf values to use in reduction
1449 -  op - operation to use for reduction
1450 
1451    Output Arguments:
1452 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1453 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1454 
1455    Level: advanced
1456 
1457    Note:
1458    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1459    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1460    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1461    integers.
1462 
1463 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1464 @*/
1465 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1466 {
1467   PetscErrorCode ierr;
1468   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1469 
1470   PetscFunctionBegin;
1471   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1472   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1473   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1474   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1475   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1476   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1477   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1478   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1479   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1480   PetscFunctionReturn(0);
1481 }
1482 
1483 /*@C
1484    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1485 
1486    Collective
1487 
1488    Input Arguments:
1489 +  sf - star forest
1490 .  unit - data type
1491 .  leafdata - leaf values to use in reduction
1492 -  op - operation to use for reduction
1493 
1494    Output Arguments:
1495 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1496 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1497 
1498    Level: advanced
1499 
1500 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1501 @*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Thin logging wrapper: delegate completion of the fetch-and-op to the implementation */
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootdata,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1514 
1515 /*@C
1516    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()
1517 
1518    Collective
1519 
1520    Input Arguments:
1521 .  sf - star forest
1522 
1523    Output Arguments:
1524 .  degree - degree of each root vertex
1525 
1526    Level: advanced
1527 
1528    Notes:
1529    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1530 
1531 .seealso: PetscSFGatherBegin()
1532 @*/
1533 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
1534 {
1535   PetscErrorCode ierr;
1536 
1537   PetscFunctionBegin;
1538   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1539   PetscSFCheckGraphSet(sf,1);
1540   PetscValidPointer(degree,2);
1541   if (!sf->degreeknown) {
1542     PetscInt i, nroots = sf->nroots, maxlocal;
1543     if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1544     maxlocal = sf->maxleaf-sf->minleaf+1;
1545     ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
1546     ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1547     for (i=0; i<nroots; i++) sf->degree[i] = 0;
1548     for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
1549     ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
1550   }
1551   *degree = NULL;
1552   PetscFunctionReturn(0);
1553 }
1554 
1555 /*@C
1556    PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()
1557 
1558    Collective
1559 
1560    Input Arguments:
1561 .  sf - star forest
1562 
1563    Output Arguments:
1564 .  degree - degree of each root vertex
1565 
1566    Level: developer
1567 
1568    Notes:
1569    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1570 
1571 .seealso:
1572 @*/
1573 PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
1574 {
1575   PetscErrorCode ierr;
1576 
1577   PetscFunctionBegin;
1578   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1579   PetscSFCheckGraphSet(sf,1);
1580   PetscValidPointer(degree,2);
1581   if (!sf->degreeknown) {
1582     if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
1583     ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
1584     ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
1585     sf->degreeknown = PETSC_TRUE;
1586   }
1587   *degree = sf->degree;
1588   PetscFunctionReturn(0);
1589 }
1590 
1591 
1592 /*@C
1593    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1594    Each multi-root is assigned index of the corresponding original root.
1595 
1596    Collective
1597 
1598    Input Arguments:
1599 +  sf - star forest
1600 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1601 
1602    Output Arguments:
1603 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1604 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1605 
1606    Level: developer
1607 
1608    Notes:
1609    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1610 
1611 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1612 @*/
1613 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1614 {
1615   PetscSF             msf;
1616   PetscInt            i, j, k, nroots, nmroots;
1617   PetscErrorCode      ierr;
1618 
1619   PetscFunctionBegin;
1620   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1621   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1622   if (nroots) PetscValidIntPointer(degree,2);
1623   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1624   PetscValidPointer(multiRootsOrigNumbering,4);
1625   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1626   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1627   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1628   for (i=0,j=0,k=0; i<nroots; i++) {
1629     if (!degree[i]) continue;
1630     for (j=0; j<degree[i]; j++,k++) {
1631       (*multiRootsOrigNumbering)[k] = i;
1632     }
1633   }
1634 #if defined(PETSC_USE_DEBUG)
1635   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1636 #endif
1637   if (nMultiRoots) *nMultiRoots = nmroots;
1638   PetscFunctionReturn(0);
1639 }
1640 
1641 /*@C
1642    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1643 
1644    Collective
1645 
1646    Input Arguments:
1647 +  sf - star forest
1648 .  unit - data type
1649 -  leafdata - leaf data to gather to roots
1650 
1651    Output Argument:
1652 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1653 
1654    Level: intermediate
1655 
1656 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1657 @*/
1658 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1659 {
1660   PetscErrorCode ierr;
1661   PetscSF        multi;
1662 
1663   PetscFunctionBegin;
1664   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1665   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1666   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1667   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1668   PetscFunctionReturn(0);
1669 }
1670 
1671 /*@C
1672    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1673 
1674    Collective
1675 
1676    Input Arguments:
1677 +  sf - star forest
1678 .  unit - data type
1679 -  leafdata - leaf data to gather to roots
1680 
1681    Output Argument:
1682 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1683 
1684    Level: intermediate
1685 
1686 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1687 @*/
1688 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1689 {
1690   PetscErrorCode ierr;
1691   PetscSF        multi;
1692 
1693   PetscFunctionBegin;
1694   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1695   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1696   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1697   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1698   PetscFunctionReturn(0);
1699 }
1700 
1701 /*@C
1702    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1703 
1704    Collective
1705 
1706    Input Arguments:
1707 +  sf - star forest
1708 .  unit - data type
1709 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1710 
1711    Output Argument:
1712 .  leafdata - leaf data to be update with personal data from each respective root
1713 
1714    Level: intermediate
1715 
1716 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1717 @*/
1718 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1719 {
1720   PetscErrorCode ierr;
1721   PetscSF        multi;
1722 
1723   PetscFunctionBegin;
1724   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1725   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1726   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1727   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1728   PetscFunctionReturn(0);
1729 }
1730 
1731 /*@C
1732    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1733 
1734    Collective
1735 
1736    Input Arguments:
1737 +  sf - star forest
1738 .  unit - data type
1739 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1740 
1741    Output Argument:
1742 .  leafdata - leaf data to be update with personal data from each respective root
1743 
1744    Level: intermediate
1745 
1746 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1747 @*/
1748 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1749 {
1750   PetscErrorCode ierr;
1751   PetscSF        multi;
1752 
1753   PetscFunctionBegin;
1754   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1755   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1756   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1757   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1758   PetscFunctionReturn(0);
1759 }
1760 
1761 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1762 {
1763 #if defined(PETSC_USE_DEBUG)
1764   PetscInt        i, n, nleaves;
1765   const PetscInt *ilocal = NULL;
1766   PetscHSetI      seen;
1767   PetscErrorCode  ierr;
1768 
1769   PetscFunctionBegin;
1770   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1771   ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1772   for (i = 0; i < nleaves; i++) {
1773     const PetscInt leaf = ilocal ? ilocal[i] : i;
1774     ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1775   }
1776   ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1777   if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1778   ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1779   PetscFunctionReturn(0);
1780 #else
1781   PetscFunctionBegin;
1782   PetscFunctionReturn(0);
1783 #endif
1784 }
1785 
/*@
  PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots.

  sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
  nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
  Bcast on sfA, then a Bcast on sfB, on connected nodes.

.seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  /* both inputs must be true star forests (each leaf appears once); debug-only check */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  if (localPointsA) {
    /* sfA's leaves are not contiguous: build a dense array indexed by sfB's root numbering,
       with (rank,index)=(-1,-1) marking sfB roots that are not leaves of sfA */
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue; /* leaf of sfA outside sfB's root space: no path */
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA;
  }
  /* push the (rank,index) of sfA's roots down sfB to its leaves; leafdataB is shifted by minleaf
     so it can be indexed directly with sfB leaf numbers */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* denseB: every sfB leaf is connected (no rank==-1) and leaves are contiguous, so the
     composite can reuse leafdataB directly with ilocal==NULL */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    localPointsBA  = NULL;
    remotePointsBA = leafdataB; /* ownership of leafdataB passes to the new SF below */
  } else {
    /* compact: keep only connected leaves, recording their local indices */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1874 
/*@
  PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF.

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
  second SF must have a degree of 1, i.e., no roots have more than one leaf connected.

  sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
  roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
  a Reduce on sfB, on connected roots.

.seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
@*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
  MPI_Op            op;
#if defined(PETSC_USE_64BIT_INDICES)
  PetscBool         iswin;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  /* both inputs must be true star forests (each leaf appears once); debug-only check */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);

  /* TODO: Check roots of sfB have degree of 1 */
  /* Once we implement it, we can replace the MPI_MAXLOC
     with MPIU_REPLACE. In that case, MPI_MAXLOC and MPIU_REPLACE have the same effect.
     We use MPI_MAXLOC only to have a deterministic output from this routine if
     the root condition is not meet.
   */
  op = MPI_MAXLOC;
#if defined(PETSC_USE_64BIT_INDICES)
  /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
  ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
  if (iswin) op = MPIU_REPLACE;
#endif

  /* Stage sfA's remote roots in a buffer indexed by sfB's leaf numbers (shifted by minleaf);
     entries with (rank,index)=(-1,-1) mark sfB leaves that are not leaves of sfA */
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue; /* outside sfB's leaf range: no path */
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* localPointsA == NULL: sfA's leaves are the contiguous range 0..numLeavesA-1 */
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Pull sfA's root info up sfB (leaves -> roots); untouched entries keep (-1,-1) */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place: keep only connected roots of sfB, recording their indices as local points */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1977 
1978 /*
1979   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
1980 
1981   Input Parameters:
1982 . sf - The global PetscSF
1983 
1984   Output Parameters:
1985 . out - The local PetscSF
1986  */
1987 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
1988 {
1989   MPI_Comm           comm;
1990   PetscMPIInt        myrank;
1991   const PetscInt     *ilocal;
1992   const PetscSFNode  *iremote;
1993   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
1994   PetscSFNode        *liremote;
1995   PetscSF            lsf;
1996   PetscErrorCode     ierr;
1997 
1998   PetscFunctionBegin;
1999   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2000   if (sf->ops->CreateLocalSF) {
2001     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
2002   } else {
2003     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2004     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
2005     ierr = MPI_Comm_rank(comm,&myrank);CHKERRQ(ierr);
2006 
2007     /* Find out local edges and build a local SF */
2008     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
2009     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
2010     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
2011     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
2012 
2013     for (i=j=0; i<nleaves; i++) {
2014       if (iremote[i].rank == (PetscInt)myrank) {
2015         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2016         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
2017         liremote[j].index = iremote[i].index;
2018         j++;
2019       }
2020     }
2021     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
2022     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
2023     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
2024     *out = lsf;
2025   }
2026   PetscFunctionReturn(0);
2027 }
2028 
2029 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2030 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
2031 {
2032   PetscErrorCode     ierr;
2033   PetscMemType       rootmtype,leafmtype;
2034 
2035   PetscFunctionBegin;
2036   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2037   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
2038   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2039   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
2040   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
2041   if (sf->ops->BcastToZero) {
2042     ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
2043   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
2044   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2045   PetscFunctionReturn(0);
2046 }
2047 
2048