xref: /petsc/src/vec/is/sf/interface/sf.c (revision 4cc19a0c3736bb6cc12f018d87ef63c2c74635cc)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
#if defined(PETSC_USE_DEBUG)
/* Debug builds: error out if the SF's graph has not been set with
   PetscSFSetGraph()/PetscSFSetGraphWithPattern(). "arg" is the 1-based
   argument number of "sf" in the calling function (used in the message). */
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
/* Optimized builds: the check compiles away entirely */
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif
13 
14 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",0};
15 
16 /*@
17    PetscSFCreate - create a star forest communication context
18 
19    Collective
20 
21    Input Arguments:
22 .  comm - communicator on which the star forest will operate
23 
24    Output Arguments:
25 .  sf - new star forest context
26 
27    Options Database Keys:
28 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
29 .  -sf_type window    -Use MPI-3 one-sided window for communication
30 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
31 
32    Level: intermediate
33 
34    Notes:
   When one knows the communication graph is one of the predefined graphs, such as MPI_Alltoall, MPI_Allgatherv,
36    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
37    SFs are optimized and they have better performance than general SFs.
38 
39 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
40 @*/
41 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
42 {
43   PetscErrorCode ierr;
44   PetscSF        b;
45 
46   PetscFunctionBegin;
47   PetscValidPointer(sf,2);
48   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
49 
50   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
51 
52   b->nroots    = -1;
53   b->nleaves   = -1;
54   b->minleaf   = PETSC_MAX_INT;
55   b->maxleaf   = PETSC_MIN_INT;
56   b->nranks    = -1;
57   b->rankorder = PETSC_TRUE;
58   b->ingroup   = MPI_GROUP_NULL;
59   b->outgroup  = MPI_GROUP_NULL;
60   b->graphset  = PETSC_FALSE;
61 
62   *sf = b;
63   PetscFunctionReturn(0);
64 }
65 
66 /*@
67    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
68 
69    Collective
70 
71    Input Arguments:
72 .  sf - star forest
73 
74    Level: advanced
75 
76 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
77 @*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the implementation release its own communication state first */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Return graph fields to the same "unset" state PetscSFCreate() establishes */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  /* mine/remote may alias mine_alloc/remote_alloc (owned copies, see PetscSFSetGraph());
     only the owned storage is freed here */
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  /* rmine_d appears to be a device-side mirror of rmine — freed with cudaFree and nulled */
  if (sf->rmine_d) {cudaError_t err = cudaFree(sf->rmine_d);CHKERRCUDA(err);sf->rmine_d=NULL;}
#endif
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRQ(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRQ(ierr);}
  /* Drop the derived multi-SF and the layout so they can be rebuilt for a new graph */
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
108 
109 /*@C
110    PetscSFSetType - Set the PetscSF communication implementation
111 
112    Collective on PetscSF
113 
114    Input Parameters:
115 +  sf - the PetscSF context
116 -  type - a known method
117 
118    Options Database Key:
119 .  -sf_type <type> - Sets the method; use -help for a list
120    of available methods (for instance, window, basic, neighbor)
121 
122    Notes:
123    See "include/petscsf.h" for available methods (for instance)
124 +    PETSCSFWINDOW - MPI-2/3 one-sided
125 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
126 
127   Level: intermediate
128 
129 .seealso: PetscSFType, PetscSFCreate()
130 @*/
131 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
132 {
133   PetscErrorCode ierr,(*r)(PetscSF);
134   PetscBool      match;
135 
136   PetscFunctionBegin;
137   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
138   PetscValidCharPointer(type,2);
139 
140   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
141   if (match) PetscFunctionReturn(0);
142 
143   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
144   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
145   /* Destroy the previous PetscSF implementation context */
146   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
147   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
148   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
149   ierr = (*r)(sf);CHKERRQ(ierr);
150   PetscFunctionReturn(0);
151 }
152 
153 /*@C
154   PetscSFGetType - Get the PetscSF communication implementation
155 
156   Not Collective
157 
158   Input Parameter:
159 . sf  - the PetscSF context
160 
161   Output Parameter:
162 . type - the PetscSF type name
163 
164   Level: intermediate
165 
166 .seealso: PetscSFSetType(), PetscSFCreate()
167 @*/
168 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
169 {
170   PetscFunctionBegin;
171   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
172   PetscValidPointer(type,2);
173   *type = ((PetscObject)sf)->type_name;
174   PetscFunctionReturn(0);
175 }
176 
177 /*@
178    PetscSFDestroy - destroy star forest
179 
180    Collective
181 
182    Input Arguments:
183 .  sf - address of star forest
184 
185    Level: intermediate
186 
187 .seealso: PetscSFCreate(), PetscSFReset()
188 @*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0);
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Decrement the reference count; only the last reference performs the actual teardown */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  /* Free graph/rank storage, then give the implementation its chance, then destroy the header */
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
202 
/* Debug-only sanity check of a set graph: every remote rank must lie in [0,size),
   and remote root indices and leaf locations must be nonnegative.
   No-op in optimized builds or when no graph has been set. */
static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
{
#if defined(PETSC_USE_DEBUG)
  PetscInt           i, nleaves;
  PetscMPIInt        size;
  const PetscInt    *ilocal;
  const PetscSFNode *iremote;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!sf->graphset) PetscFunctionReturn(0);
  ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
  for (i = 0; i < nleaves; i++) {
    const PetscInt rank = iremote[i].rank;
    const PetscInt remote = iremote[i].index;
    const PetscInt leaf = ilocal ? ilocal[i] : i; /* NULL ilocal encodes contiguous leaves 0..nleaves-1 */
    if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
    if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
    if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
  }
  PetscFunctionReturn(0);
#else
  PetscFunctionBegin;
  PetscFunctionReturn(0);
#endif
}
230 
231 /*@
232    PetscSFSetUp - set up communication structures
233 
234    Collective
235 
236    Input Arguments:
237 .  sf - star forest communication object
238 
239    Level: beginner
240 
241 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
242 @*/
PetscErrorCode PetscSFSetUp(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  if (sf->setupcalled) PetscFunctionReturn(0); /* idempotent: only the first call does work */
  ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
  /* Default to the basic implementation if the user never chose a type */
  if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);}
  ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
  ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  sf->setupcalled = PETSC_TRUE;
  PetscFunctionReturn(0);
}
259 
260 /*@
261    PetscSFSetFromOptions - set PetscSF options using the options database
262 
263    Logically Collective
264 
265    Input Arguments:
266 .  sf - star forest
267 
268    Options Database Keys:
269 +  -sf_type              - implementation type, see PetscSFSetType()
270 .  -sf_rank_order        - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
271 -  -sf_use_pinned_buffer - use pinned (nonpagable) memory for send/recv buffers on host when communicating GPU data but GPU-aware MPI is not used.
272                            Only available for SF types of basic and neighbor.
273 
274    Level: intermediate
275 @*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Present the current type as the default in -help; fall back to basic if no type set yet */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
  /* PetscOptionsObject is declared by the PetscObjectOptionsBegin() macro above */
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
294 
295 /*@
296    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
297 
298    Logically Collective
299 
300    Input Arguments:
301 +  sf - star forest
302 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
303 
304    Level: advanced
305 
306 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
307 @*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
{

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveBool(sf,flg,2);
  /* The multi-SF is built using the ordering choice in effect at build time,
     so the choice cannot change once sf->multi exists */
  if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(0);
}
318 
319 /*@
320    PetscSFSetGraph - Set a parallel star forest
321 
322    Collective
323 
324    Input Arguments:
325 +  sf - star forest
326 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
327 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
328 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
329 during setup in debug mode)
330 .  localmode - copy mode for ilocal
331 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
332 during setup in debug mode)
333 -  remotemode - copy mode for iremote
334 
335    Level: intermediate
336 
337    Notes:
338     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
339 
340    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
341    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
342    needed
343 
344    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
345    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
346 
347 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
348 @*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    /* Compute the leaf range and detect the identity permutation, which encodes
       contiguous storage and lets us drop ilocal entirely (see Developers Note above) */
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      if (localmode == PETSC_OWN_POINTER) {
        /* We were handed ownership but do not need the array; free it now */
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* NULL ilocal means leaves are 0..nleaves-1; the range is empty ([0,-1]) when nleaves==0 */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  if (ilocal) {
    /* mine_alloc holds owned storage (NULL for PETSC_USE_POINTER); mine is always what we read */
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same ownership scheme for the remote (root) side */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
432 
433 /*@
434   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
435 
436   Collective
437 
438   Input Parameters:
439 + sf      - The PetscSF
440 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
441 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
442 
443   Notes:
444   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
445   n and N are local and global sizes of x respectively.
446 
447   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
448   sequential vectors y on all ranks.
449 
450   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
451   sequential vector y on rank 0.
452 
453   In above cases, entries of x are roots and entries of y are leaves.
454 
455   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
456   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
457   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one cannot send multiple items. One just needs to create a new MPI datatype for the multiple data
459   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
460 
461   In this case, roots and leaves are symmetric.
462 
463   Level: intermediate
464  @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* map is ignored for ALLTOALL; build an internal layout with "size" roots per rank */
    type = PETSCSFALLTOALL;
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it: max(n) == -max(-n)
       means every rank has the same local size, so the non-"v" variants can be used */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves (it receives the whole vector) */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
530 
531 /*@
532    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
533 
534    Collective
535 
536    Input Arguments:
537 
538 .  sf - star forest to invert
539 
540    Output Arguments:
.  isf - inverse of sf

   Level: advanced
543 
544    Notes:
545    All roots must have degree 1.
546 
547    The local space may be a permutation, but cannot be sparse.
548 
549 .seealso: PetscSFSetGraph()
550 @*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Tag every local leaf slot with its own (rank,index) identity ... */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* ... and mark roots as "unreferenced" so sparse roots can be detected after the reduce */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  /* Reduce with REPLACE so each root learns which (rank,leaf) references it;
     relies on every root having degree at most 1 (see Notes) */
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* dense: contiguous leaf numbering suffices */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* The inverse has maxlocal roots (old leaf space) and count leaves (referenced old roots);
     newilocal ownership transfers to the new SF */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
600 
601 /*@
602    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
603 
604    Collective
605 
606    Input Arguments:
607 +  sf - communication object to duplicate
608 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
609 
610    Output Arguments:
611 .  newsf - new communication object
612 
613    Level: beginner
614 
615 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
616 @*/
PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
{
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,opt,2);
  PetscValidPointer(newsf,3);
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
  /* Copy the type if one has been set; type may be NULL before PetscSFSetUp */
  ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
  if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf,1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graph: deep-copy the ilocal/iremote arrays */
      PetscInt          nroots,nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
      ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
    } else {
      /* Patterned graph: recreate it from the pattern and layout */
      ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
    }
  }
  /* Let the implementation copy any type-specific state */
  if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
644 
645 /*@C
646    PetscSFGetGraph - Get the graph specifying a parallel star forest
647 
648    Not Collective
649 
650    Input Arguments:
651 .  sf - star forest
652 
653    Output Arguments:
654 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
655 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
656 .  ilocal - locations of leaves in leafdata buffers
657 -  iremote - remote locations of root vertices for each leaf on the current process
658 
659    Notes:
660    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
661 
662    When called from Fortran, the returned iremote array is a copy and must deallocated after use. Consequently, if you
663    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
664 
665    Level: intermediate
666 
667 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
668 @*/
669 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
670 {
671   PetscErrorCode ierr;
672 
673   PetscFunctionBegin;
674   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
675   if (sf->ops->GetGraph) {
676     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
677   } else {
678     if (nroots) *nroots = sf->nroots;
679     if (nleaves) *nleaves = sf->nleaves;
680     if (ilocal) *ilocal = sf->mine;
681     if (iremote) *iremote = sf->remote;
682   }
683   PetscFunctionReturn(0);
684 }
685 
686 /*@
687    PetscSFGetLeafRange - Get the active leaf ranges
688 
689    Not Collective
690 
691    Input Arguments:
692 .  sf - star forest
693 
694    Output Arguments:
695 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
696 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
697 
698    Level: developer
699 
700 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
701 @*/
702 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
703 {
704 
705   PetscFunctionBegin;
706   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
707   PetscSFCheckGraphSet(sf,1);
708   if (minleaf) *minleaf = sf->minleaf;
709   if (maxleaf) *maxleaf = sf->maxleaf;
710   PetscFunctionReturn(0);
711 }
712 
713 /*@C
714    PetscSFViewFromOptions - View from Options
715 
716    Collective on PetscSF
717 
718    Input Parameters:
719 +  A - the star forest
720 .  obj - Optional object
721 -  name - command line option
722 
723    Level: intermediate
724 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
725 @*/
726 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
727 {
728   PetscErrorCode ierr;
729 
730   PetscFunctionBegin;
731   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
732   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
733   PetscFunctionReturn(0);
734 }
735 
736 /*@C
737    PetscSFView - view a star forest
738 
739    Collective
740 
741    Input Arguments:
742 +  sf - star forest
743 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
744 
745    Level: beginner
746 
747 .seealso: PetscSFCreate(), PetscSFSetGraph()
748 @*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to stdout on the SF's communicator when no viewer is given */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Viewing rank data requires setup; only possible once a graph is set */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      /* Synchronized printing so per-rank output appears in rank order */
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        /* Detailed view: show per-rank edge lists, with ranks sorted for readability.
           Sort a copy of sf->ranks and keep a permutation to index the offset arrays. */
        PetscMPIInt *tmpranks,*perm;
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
806 
807 /*@C
808    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
809 
810    Not Collective
811 
812    Input Arguments:
813 .  sf - star forest
814 
815    Output Arguments:
816 +  nranks - number of ranks referenced by local part
817 .  ranks - array of ranks
818 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
819 .  rmine - concatenated array holding local indices referencing each remote rank
820 -  rremote - concatenated array holding remote indices referenced for each remote rank
821 
822    Level: developer
823 
824 .seealso: PetscSFGetLeafRanks()
825 @*/
826 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
827 {
828   PetscErrorCode ierr;
829 
830   PetscFunctionBegin;
831   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
832   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
833   if (sf->ops->GetRootRanks) {
834     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
835   } else {
836     /* The generic implementation */
837     if (nranks)  *nranks  = sf->nranks;
838     if (ranks)   *ranks   = sf->ranks;
839     if (roffset) *roffset = sf->roffset;
840     if (rmine)   *rmine   = sf->rmine;
841     if (rremote) *rremote = sf->rremote;
842   }
843   PetscFunctionReturn(0);
844 }
845 
846 /*@C
847    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
848 
849    Not Collective
850 
851    Input Arguments:
852 .  sf - star forest
853 
854    Output Arguments:
855 +  niranks - number of leaf ranks referencing roots on this process
856 .  iranks - array of ranks
857 .  ioffset - offset in irootloc for each rank (length niranks+1)
858 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
859 
860    Level: developer
861 
862 .seealso: PetscSFGetRootRanks()
863 @*/
864 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
865 {
866   PetscErrorCode ierr;
867 
868   PetscFunctionBegin;
869   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
870   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
871   if (sf->ops->GetLeafRanks) {
872     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
873   } else {
874     PetscSFType type;
875     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
876     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
877   }
878   PetscFunctionReturn(0);
879 }
880 
881 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
882   PetscInt i;
883   for (i=0; i<n; i++) {
884     if (needle == list[i]) return PETSC_TRUE;
885   }
886   return PETSC_FALSE;
887 }
888 
/*@C
   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.

   Collective

   Input Arguments:
+  sf - PetscSF to set up; PetscSFSetGraph() must have been called
-  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)

   Level: developer

   Notes:
   Fills sf->nranks, sf->ndranks, sf->ranks, sf->roffset, sf->rmine, and sf->rremote; ranks that are members of
   dgroup are moved to the front of sf->ranks (the first sf->ndranks entries), and each of the two segments is
   sorted by rank.

.seealso: PetscSFGetRootRanks()
@*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
  /* Count, per remote rank, how many leaves reference it; keys are stored 1-based
     because PetscTable cannot hold the key 0 */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  /* Drain the table into parallel arrays: ranks[] (remote rank) and rcount[] (leaf count for that rank) */
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate every member of dgroup into its rank in sf's communicator; groupranks[] survives this scope */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRQ(ierr);}
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  /* Two-pointer partition: sf->ndranks advances over distinguished entries from the left,
     i retreats over non-distinguished entries from the right, swapping when both are misplaced */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i; ) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for ( ; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort the two segments (distinguished, then the rest) independently by rank */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Build CSR-style offsets into rmine/rremote; rcount[] is recycled as a per-rank fill cursor below */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Scatter each leaf's (local index, remote index) pair into the slot for its remote rank */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
992 
/*@C
   PetscSFGetGroups - gets incoming and outgoing process groups

   Collective

   Input Argument:
.  sf - star forest

   Output Arguments:
+  incoming - group of origin processes for incoming edges (leaves that reference my roots)
-  outgoing - group of destination processes for outgoing edges (roots that I reference)

   Level: developer

   Notes:
   Both groups are computed lazily on first call and cached on the SF (sf->ingroup, sf->outgroup);
   the caller must not free them.

.seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
@*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  /* sf->nranks is set by PetscSFSetUpRanks(); a negative value means it has not run yet */
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks */
    /* Build a one-root-per-process auxiliary SF whose leaves point at the ranks we reference;
       the degree of our single root is then the number of processes that reference us */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);

    /* Enumerate the incoming ranks */
    /* Each process sends its own rank to every root it references; gathering at our root
       yields the list of ranks whose leaves reference our roots */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) {
    /* Outgoing group is immediate: sf->ranks already lists the ranks we reference */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1056 
/*@
   PetscSFGetMultiSF - gets the inner SF implemeting gathers and scatters

   Collective

   Input Argument:
.  sf - star forest that may contain roots with 0 or with more than 1 vertex

   Output Arguments:
.  multi - star forest with split roots, such that each root has degree exactly 1

   Level: developer

   Notes:

   In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
   directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
   edge, it is a candidate for future optimization that might involve its removal.

.seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
@*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    PetscFunctionReturn(0);
  }
  /* Build the multi-SF lazily and cache it on sf->multi */
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    /* inoffset[] is the prefix sum of root degrees: each root i owns multi-root slots
       inoffset[i]..inoffset[i+1]-1.  The fetch-and-add below hands each leaf a distinct
       slot index (outoffset) within its root's range. */
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
#if 0
#if defined(PETSC_USE_DEBUG)                                 /* Check that the expected number of increments occurred */
    for (i=0; i<sf->nroots; i++) {
      if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
    }
#endif
#endif
    /* Each leaf now points at its private multi-root slot */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      /* Deliver each contributing leaf's rank to its multi-root slot */
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Broadcast the rank-sorted slot numbers back to the leaves and rebuild the graph with them */
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1150 
1151 /*@C
1152    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1153 
1154    Collective
1155 
1156    Input Arguments:
1157 +  sf - original star forest
1158 .  nselected  - number of selected roots on this process
1159 -  selected   - indices of the selected roots on this process
1160 
1161    Output Arguments:
1162 .  newsf - new star forest
1163 
1164    Level: advanced
1165 
1166    Note:
1167    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1168    be done by calling PetscSFGetGraph().
1169 
1170 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1171 @*/
1172 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1173 {
1174   PetscInt          i,n,*roots,*rootdata,*leafdata,nroots,nleaves,connected_leaves,*new_ilocal;
1175   const PetscSFNode *iremote;
1176   PetscSFNode       *new_iremote;
1177   PetscSF           tmpsf;
1178   MPI_Comm          comm;
1179   PetscErrorCode    ierr;
1180 
1181   PetscFunctionBegin;
1182   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1183   PetscSFCheckGraphSet(sf,1);
1184   if (nselected) PetscValidPointer(selected,3);
1185   PetscValidPointer(newsf,4);
1186 
1187   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1188   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1189   /* Uniq selected[] and put results in roots[] */
1190   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1191   ierr = PetscMalloc1(nselected,&roots);CHKERRQ(ierr);
1192   ierr = PetscArraycpy(roots,selected,nselected);CHKERRQ(ierr);
1193   ierr = PetscSortedRemoveDupsInt(&nselected,roots);CHKERRQ(ierr);
1194   if (nselected && (roots[0] < 0 || roots[nselected-1] >= sf->nroots)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max root indices %D/%D are not in [0,%D)",roots[0],roots[nselected-1],sf->nroots);
1195 
1196   if (sf->ops->CreateEmbeddedSF) {
1197     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,roots,newsf);CHKERRQ(ierr);
1198   } else {
1199     /* A generic version of creating embedded sf. Note that we called PetscSFSetGraph() twice, which is certainly expensive */
1200     /* Find out which leaves (not leaf data items) are still connected to roots in the embedded sf */
1201     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,NULL,&iremote);CHKERRQ(ierr);
1202     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&tmpsf);CHKERRQ(ierr);
1203     ierr = PetscSFSetGraph(tmpsf,nroots,nleaves,NULL/*contiguous*/,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);CHKERRQ(ierr);
1204     ierr = PetscCalloc2(nroots,&rootdata,nleaves,&leafdata);CHKERRQ(ierr);
1205     for (i=0; i<nselected; ++i) rootdata[roots[i]] = 1;
1206     ierr = PetscSFBcastBegin(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1207     ierr = PetscSFBcastEnd(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1208     ierr = PetscSFDestroy(&tmpsf);CHKERRQ(ierr);
1209 
1210     /* Build newsf with leaves that are still connected */
1211     connected_leaves = 0;
1212     for (i=0; i<nleaves; ++i) connected_leaves += leafdata[i];
1213     ierr = PetscMalloc1(connected_leaves,&new_ilocal);CHKERRQ(ierr);
1214     ierr = PetscMalloc1(connected_leaves,&new_iremote);CHKERRQ(ierr);
1215     for (i=0, n=0; i<nleaves; ++i) {
1216       if (leafdata[i]) {
1217         new_ilocal[n]        = sf->mine ? sf->mine[i] : i;
1218         new_iremote[n].rank  = sf->remote[i].rank;
1219         new_iremote[n].index = sf->remote[i].index;
1220         ++n;
1221       }
1222     }
1223 
1224     if (n != connected_leaves) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"There is a size mismatch in the SF embedding, %d != %d",n,connected_leaves);
1225     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1226     ierr = PetscSFSetGraph(*newsf,nroots,connected_leaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1227     ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
1228   }
1229   ierr = PetscFree(roots);CHKERRQ(ierr);
1230   ierr = PetscSFSetUp(*newsf);CHKERRQ(ierr);
1231   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1232   PetscFunctionReturn(0);
1233 }
1234 
1235 /*@C
1236   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1237 
1238   Collective
1239 
1240   Input Arguments:
1241 + sf - original star forest
1242 . nselected  - number of selected leaves on this process
1243 - selected   - indices of the selected leaves on this process
1244 
1245   Output Arguments:
1246 .  newsf - new star forest
1247 
1248   Level: advanced
1249 
1250 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1251 @*/
1252 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1253 {
1254   const PetscSFNode *iremote;
1255   PetscSFNode       *new_iremote;
1256   const PetscInt    *ilocal;
1257   PetscInt          i,nroots,*leaves,*new_ilocal;
1258   MPI_Comm          comm;
1259   PetscErrorCode    ierr;
1260 
1261   PetscFunctionBegin;
1262   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1263   PetscSFCheckGraphSet(sf,1);
1264   if (nselected) PetscValidPointer(selected,3);
1265   PetscValidPointer(newsf,4);
1266 
1267   /* Uniq selected[] and put results in leaves[] */
1268   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1269   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1270   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1271   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1272   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1273 
1274   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1275   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1276     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1277   } else {
1278     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1279     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1280     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1281     for (i=0; i<nselected; ++i) {
1282       const PetscInt l     = leaves[i];
1283       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1284       new_iremote[i].rank  = iremote[l].rank;
1285       new_iremote[i].index = iremote[l].index;
1286     }
1287     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1288     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1289   }
1290   ierr = PetscFree(leaves);CHKERRQ(ierr);
1291   PetscFunctionReturn(0);
1292 }
1293 
1294 /*@C
1295    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1296 
1297    Collective on PetscSF
1298 
1299    Input Arguments:
1300 +  sf - star forest on which to communicate
1301 .  unit - data type associated with each node
1302 .  rootdata - buffer to broadcast
1303 -  op - operation to use for reduction
1304 
1305    Output Arguments:
1306 .  leafdata - buffer to be reduced with values from each leaf's respective root
1307 
1308    Level: intermediate
1309 
1310 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd()
1311 @*/
1312 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1313 {
1314   PetscErrorCode ierr;
1315   PetscMemType   rootmtype,leafmtype;
1316 
1317   PetscFunctionBegin;
1318   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1319   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1320   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1321   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1322   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1323 #if defined(PETSC_HAVE_CUDA)
1324   /*  Shall we assume rootdata, leafdata are ready to use (instead of being computed by some asynchronous kernels)?
1325     To be similar to MPI, I'd like to have this assumption, since MPI does not have a concept of stream.
1326     But currently this assumption is not enforecd in Petsc. To be safe, I do synchronization here. Otherwise, if
1327     we do not sync now and call the Pack kernel directly on the default NULL stream (assume petsc objects are also
1328     computed on it), we have to sync the NULL stream before calling MPI routines. So, it looks a cudaDeviceSynchronize
1329     is inevitable. We do it now and put pack/unpack kernels to non-NULL streams.
1330    */
1331   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1332 #endif
1333   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1334   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1335   PetscFunctionReturn(0);
1336 }
1337 
1338 /*@C
1339    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1340 
1341    Collective
1342 
1343    Input Arguments:
1344 +  sf - star forest
1345 .  unit - data type
1346 .  rootdata - buffer to broadcast
1347 -  op - operation to use for reduction
1348 
1349    Output Arguments:
1350 .  leafdata - buffer to be reduced with values from each leaf's respective root
1351 
1352    Level: intermediate
1353 
1354 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1355 @*/
1356 PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1357 {
1358   PetscErrorCode ierr;
1359   PetscMemType   rootmtype,leafmtype;
1360 
1361   PetscFunctionBegin;
1362   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1363   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1364   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1365   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1366   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1367   ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1368   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1369   PetscFunctionReturn(0);
1370 }
1371 
1372 /*@C
1373    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1374 
1375    Collective
1376 
1377    Input Arguments:
1378 +  sf - star forest
1379 .  unit - data type
1380 .  leafdata - values to reduce
1381 -  op - reduction operation
1382 
1383    Output Arguments:
1384 .  rootdata - result of reduction of values from all leaves of each root
1385 
1386    Level: intermediate
1387 
1388 .seealso: PetscSFBcastBegin()
1389 @*/
1390 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1391 {
1392   PetscErrorCode ierr;
1393   PetscMemType   rootmtype,leafmtype;
1394 
1395   PetscFunctionBegin;
1396   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1397   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1398   ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1399   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1400   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1401 #if defined(PETSC_HAVE_CUDA)
1402   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1403 #endif
1404   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1405   ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1406   PetscFunctionReturn(0);
1407 }
1408 
1409 /*@C
1410    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1411 
1412    Collective
1413 
1414    Input Arguments:
1415 +  sf - star forest
1416 .  unit - data type
1417 .  leafdata - values to reduce
1418 -  op - reduction operation
1419 
1420    Output Arguments:
1421 .  rootdata - result of reduction of values from all leaves of each root
1422 
1423    Level: intermediate
1424 
1425 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1426 @*/
1427 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1428 {
1429   PetscErrorCode ierr;
1430   PetscMemType   rootmtype,leafmtype;
1431 
1432   PetscFunctionBegin;
1433   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1434   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1435   ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1436   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1437   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1438   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1439   ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1440   PetscFunctionReturn(0);
1441 }
1442 
1443 /*@C
1444    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1445 
1446    Collective
1447 
1448    Input Arguments:
1449 +  sf - star forest
1450 .  unit - data type
1451 .  leafdata - leaf values to use in reduction
1452 -  op - operation to use for reduction
1453 
1454    Output Arguments:
1455 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1456 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1457 
1458    Level: advanced
1459 
1460    Note:
1461    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1462    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1463    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1464    integers.
1465 
1466 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1467 @*/
1468 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1469 {
1470   PetscErrorCode ierr;
1471   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1472 
1473   PetscFunctionBegin;
1474   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1475   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1476   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1477   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1478   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1479   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1480   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1481 #if defined(PETSC_HAVE_CUDA)
1482   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1483 #endif
1484   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1485   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1486   PetscFunctionReturn(0);
1487 }
1488 
1489 /*@C
1490    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1491 
1492    Collective
1493 
1494    Input Arguments:
1495 +  sf - star forest
1496 .  unit - data type
1497 .  leafdata - leaf values to use in reduction
1498 -  op - operation to use for reduction
1499 
1500    Output Arguments:
1501 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1502 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1503 
1504    Level: advanced
1505 
1506 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1507 @*/
1508 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1509 {
1510   PetscErrorCode ierr;
1511   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1512 
1513   PetscFunctionBegin;
1514   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1515   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1516   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1517   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1518   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1519   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1520   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1521   ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1522   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1523   PetscFunctionReturn(0);
1524 }
1525 
1526 /*@C
1527    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()
1528 
1529    Collective
1530 
1531    Input Arguments:
1532 .  sf - star forest
1533 
1534    Output Arguments:
1535 .  degree - degree of each root vertex
1536 
1537    Level: advanced
1538 
1539    Notes:
1540    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1541 
1542 .seealso: PetscSFGatherBegin()
1543 @*/
1544 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
1545 {
1546   PetscErrorCode ierr;
1547 
1548   PetscFunctionBegin;
1549   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1550   PetscSFCheckGraphSet(sf,1);
1551   PetscValidPointer(degree,2);
1552   if (!sf->degreeknown) {
1553     PetscInt i, nroots = sf->nroots, maxlocal = sf->maxleaf+1;  /* TODO: We should use PetscSFGetLeafRange() */
1554     if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1555     ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
1556     ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1557     for (i=0; i<nroots; i++) sf->degree[i] = 0;
1558     for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
1559     ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
1560   }
1561   *degree = NULL;
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 /*@C
1566    PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()
1567 
1568    Collective
1569 
1570    Input Arguments:
1571 .  sf - star forest
1572 
1573    Output Arguments:
1574 .  degree - degree of each root vertex
1575 
1576    Level: developer
1577 
1578    Notes:
1579    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1580 
.seealso: PetscSFComputeDegreeBegin()
1582 @*/
1583 PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
1584 {
1585   PetscErrorCode ierr;
1586 
1587   PetscFunctionBegin;
1588   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1589   PetscSFCheckGraphSet(sf,1);
1590   PetscValidPointer(degree,2);
1591   if (!sf->degreeknown) {
1592     if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
1593     ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
1594     ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
1595     sf->degreeknown = PETSC_TRUE;
1596   }
1597   *degree = sf->degree;
1598   PetscFunctionReturn(0);
1599 }
1600 
1601 
1602 /*@C
1603    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1604    Each multi-root is assigned index of the corresponding original root.
1605 
1606    Collective
1607 
1608    Input Arguments:
1609 +  sf - star forest
1610 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1611 
1612    Output Arguments:
1613 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1614 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1615 
1616    Level: developer
1617 
1618    Notes:
1619    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1620 
1621 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1622 @*/
1623 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1624 {
1625   PetscSF             msf;
1626   PetscInt            i, j, k, nroots, nmroots;
1627   PetscErrorCode      ierr;
1628 
1629   PetscFunctionBegin;
1630   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1631   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1632   if (nroots) PetscValidIntPointer(degree,2);
1633   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1634   PetscValidPointer(multiRootsOrigNumbering,4);
1635   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1636   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1637   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1638   for (i=0,j=0,k=0; i<nroots; i++) {
1639     if (!degree[i]) continue;
1640     for (j=0; j<degree[i]; j++,k++) {
1641       (*multiRootsOrigNumbering)[k] = i;
1642     }
1643   }
1644 #if defined(PETSC_USE_DEBUG)
1645   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1646 #endif
1647   if (nMultiRoots) *nMultiRoots = nmroots;
1648   PetscFunctionReturn(0);
1649 }
1650 
1651 /*@C
1652    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1653 
1654    Collective
1655 
1656    Input Arguments:
1657 +  sf - star forest
1658 .  unit - data type
1659 -  leafdata - leaf data to gather to roots
1660 
1661    Output Argument:
1662 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1663 
1664    Level: intermediate
1665 
1666 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1667 @*/
1668 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1669 {
1670   PetscErrorCode ierr;
1671   PetscSF        multi;
1672 
1673   PetscFunctionBegin;
1674   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1675   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1676   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1677   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1678   PetscFunctionReturn(0);
1679 }
1680 
1681 /*@C
1682    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1683 
1684    Collective
1685 
1686    Input Arguments:
1687 +  sf - star forest
1688 .  unit - data type
1689 -  leafdata - leaf data to gather to roots
1690 
1691    Output Argument:
1692 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1693 
1694    Level: intermediate
1695 
1696 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1697 @*/
1698 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1699 {
1700   PetscErrorCode ierr;
1701   PetscSF        multi;
1702 
1703   PetscFunctionBegin;
1704   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1705   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1706   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1707   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1708   PetscFunctionReturn(0);
1709 }
1710 
1711 /*@C
1712    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1713 
1714    Collective
1715 
1716    Input Arguments:
1717 +  sf - star forest
1718 .  unit - data type
1719 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1720 
1721    Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root
1723 
1724    Level: intermediate
1725 
.seealso: PetscSFComputeDegreeBegin(), PetscSFGatherBegin()
1727 @*/
1728 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1729 {
1730   PetscErrorCode ierr;
1731   PetscSF        multi;
1732 
1733   PetscFunctionBegin;
1734   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1735   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1736   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1737   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1738   PetscFunctionReturn(0);
1739 }
1740 
1741 /*@C
1742    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1743 
1744    Collective
1745 
1746    Input Arguments:
1747 +  sf - star forest
1748 .  unit - data type
1749 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1750 
1751    Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root
1753 
1754    Level: intermediate
1755 
.seealso: PetscSFComputeDegreeEnd(), PetscSFGatherEnd()
1757 @*/
1758 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1759 {
1760   PetscErrorCode ierr;
1761   PetscSF        multi;
1762 
1763   PetscFunctionBegin;
1764   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1765   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1766   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1767   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1768   PetscFunctionReturn(0);
1769 }
1770 
1771 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1772 {
1773 #if defined(PETSC_USE_DEBUG)
1774   PetscInt        i, n, nleaves;
1775   const PetscInt *ilocal = NULL;
1776   PetscHSetI      seen;
1777   PetscErrorCode  ierr;
1778 
1779   PetscFunctionBegin;
1780   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1781   ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1782   for (i = 0; i < nleaves; i++) {
1783     const PetscInt leaf = ilocal ? ilocal[i] : i;
1784     ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1785   }
1786   ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1787   if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1788   ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1789   PetscFunctionReturn(0);
1790 #else
1791   PetscFunctionBegin;
1792   PetscFunctionReturn(0);
1793 #endif
1794 }
1795 
1796 /*@
1797   PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view
1798 
1799   Input Parameters:
1800 + sfA - The first PetscSF
1801 - sfB - The second PetscSF
1802 
1803   Output Parameters:
1804 . sfBA - The composite SF
1805 
1806   Level: developer
1807 
1808   Notes:
1809   Currently, the two SFs must be defined on congruent communicators and they must be true star
1810   forests, i.e. the same leaf is not connected with different roots.
1811 
1812   sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
1813   a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
1814   nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
1815   Bcast on sfA, then a Bcast on sfB, on connected nodes.
1816 
1817 .seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
1818 @*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  /* Both inputs must be true star forests: no leaf may appear twice (checked in debug builds) */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  if (localPointsA) {
    /* Scatter sfA's remote roots into an array indexed by sfB's root numbering; slots with
       no incoming sfA leaf are marked (rank,index) = (-1,-1) so they can be filtered later */
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue; /* sfA leaf outside sfB's root space: no path through it */
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA; /* alias; original graph array is untouched */
  }
  /* Push (rank,index) pairs root-to-leaf through sfB; leafdataB is addressed relative to minleaf,
     hence the leafdataB-minleaf offset. NOTE(review): MPIU_2INT assumes PetscSFNode is exactly two
     PetscInts — consistent with its use elsewhere in this file, but worth keeping in mind. */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* Count connected leaves; denseB stays true only if sfB has contiguous leaves and every one
     received a valid root, in which case leafdataB can be used directly with ilocal = NULL */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    localPointsBA  = NULL;
    remotePointsBA = leafdataB; /* ownership of leafdataB transfers to the new SF below */
  } else {
    /* Compact: keep only leaves that reached a root of sfA */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  /* The new SF owns localPointsBA/remotePointsBA (PETSC_OWN_POINTER) */
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1884 
1885 /*@
1886   PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one
1887 
1888   Input Parameters:
1889 + sfA - The first PetscSF
1890 - sfB - The second PetscSF
1891 
1892   Output Parameters:
1893 . sfBA - The composite SF.
1894 
1895   Level: developer
1896 
1897   Notes:
1898   Currently, the two SFs must be defined on congruent communicators and they must be true star
1899   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
1900   second SF must have a degree of 1, i.e., no roots have more than one leaf connected.
1901 
1902   sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
1903   a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
1904   roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
1905   a Reduce on sfB, on connected roots.
1906 
1907 .seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
1908 @*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  /* Both inputs must be true star forests: no leaf may appear twice (checked in debug builds) */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
  /* TODO: Check roots of sfB have degree of 1 */

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);
  /* Lay out sfA's remote roots in a buffer indexed by leaf number relative to sfB's leaf range;
     unmatched slots carry the sentinel (rank,index) = (-1,-1) */
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue; /* sfA leaf outside sfB's leaf range */
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* Contiguous sfA leaves: leaf index is just i */
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  /* Pre-fill with the sentinel so roots of sfB that receive nothing can be filtered below */
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Once we implement the TODO above (check all roots of sfB have degree of 1), we can replace the MPI_MAXLOC
     with MPIU_REPLACE. In that case, MPI_MAXLOC and MPIU_REPLACE have the same effect.
   */
  /* Pull (rank,index) pairs leaf-to-root through sfB; the -minleaf offset matches the buffer layout above */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,MPI_MAXLOC);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,MPI_MAXLOC);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place: keep only sfB roots that received a valid sfA root */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  /* The new SF owns localPointsBA/remotePointsBA (PETSC_OWN_POINTER) */
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1973 
1974 /*
1975   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
1976 
1977   Input Parameters:
1978 . sf - The global PetscSF
1979 
1980   Output Parameters:
1981 . out - The local PetscSF
1982  */
1983 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
1984 {
1985   MPI_Comm           comm;
1986   PetscMPIInt        myrank;
1987   const PetscInt     *ilocal;
1988   const PetscSFNode  *iremote;
1989   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
1990   PetscSFNode        *liremote;
1991   PetscSF            lsf;
1992   PetscErrorCode     ierr;
1993 
1994   PetscFunctionBegin;
1995   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1996   if (sf->ops->CreateLocalSF) {
1997     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
1998   } else {
1999     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2000     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
2001     ierr = MPI_Comm_rank(comm,&myrank);CHKERRQ(ierr);
2002 
2003     /* Find out local edges and build a local SF */
2004     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
2005     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
2006     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
2007     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
2008 
2009     for (i=j=0; i<nleaves; i++) {
2010       if (iremote[i].rank == (PetscInt)myrank) {
2011         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2012         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
2013         liremote[j].index = iremote[i].index;
2014         j++;
2015       }
2016     }
2017     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
2018     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
2019     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
2020     *out = lsf;
2021   }
2022   PetscFunctionReturn(0);
2023 }
2024 
2025 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2026 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
2027 {
2028   PetscErrorCode     ierr;
2029   PetscMemType       rootmtype,leafmtype;
2030 
2031   PetscFunctionBegin;
2032   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2033   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
2034   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2035   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
2036   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
2037   if (sf->ops->BcastToZero) {
2038     ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
2039   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
2040   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2041   PetscFunctionReturn(0);
2042 }
2043 
2044