xref: /petsc/src/vec/is/sf/interface/sf.c (revision 203a8786a33c3e4e033bac3ac4db6d2ca3bac443)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
5 #if defined(PETSC_USE_DEBUG)
6 #  define PetscSFCheckGraphSet(sf,arg) do {                          \
7     if (PetscUnlikely(!(sf)->graphset))                              \
8       SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
9   } while (0)
10 #else
11 #  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
12 #endif
13 
14 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",0};
15 
16 /*@
17    PetscSFCreate - create a star forest communication context
18 
19    Collective
20 
21    Input Arguments:
22 .  comm - communicator on which the star forest will operate
23 
24    Output Arguments:
25 .  sf - new star forest context
26 
27    Options Database Keys:
28 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
29 .  -sf_type window    -Use MPI-3 one-sided window for communication
30 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
31 
32    Level: intermediate
33 
34    Notes:
35    When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
36    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
37    SFs are optimized and they have better performance than general SFs.
38 
39 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
40 @*/
41 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
42 {
43   PetscErrorCode ierr;
44   PetscSF        b;
45 
46   PetscFunctionBegin;
47   PetscValidPointer(sf,2);
48   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
49 
50   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
51 
52   b->nroots    = -1;
53   b->nleaves   = -1;
54   b->minleaf   = PETSC_MAX_INT;
55   b->maxleaf   = PETSC_MIN_INT;
56   b->nranks    = -1;
57   b->rankorder = PETSC_TRUE;
58   b->ingroup   = MPI_GROUP_NULL;
59   b->outgroup  = MPI_GROUP_NULL;
60   b->graphset  = PETSC_FALSE;
61 
62   *sf = b;
63   PetscFunctionReturn(0);
64 }
65 
66 /*@
67    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
68 
69    Collective
70 
71    Input Arguments:
72 .  sf - star forest
73 
74    Level: advanced
75 
76 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
77 @*/
78 PetscErrorCode PetscSFReset(PetscSF sf)
79 {
80   PetscErrorCode ierr;
81 
82   PetscFunctionBegin;
83   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
84   if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
85   sf->nroots   = -1;
86   sf->nleaves  = -1;
87   sf->minleaf  = PETSC_MAX_INT;
88   sf->maxleaf  = PETSC_MIN_INT;
89   sf->mine     = NULL;
90   sf->remote   = NULL;
91   sf->graphset = PETSC_FALSE;
92   ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
93   ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
94   sf->nranks = -1;
95   ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
96 #if defined(PETSC_HAVE_CUDA)
97   if (sf->rmine_d) {cudaError_t err = cudaFree(sf->rmine_d);CHKERRCUDA(err);sf->rmine_d=NULL;}
98 #endif
99   sf->degreeknown = PETSC_FALSE;
100   ierr = PetscFree(sf->degree);CHKERRQ(ierr);
101   if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRQ(ierr);}
102   if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRQ(ierr);}
103   ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
104   ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
105   sf->setupcalled = PETSC_FALSE;
106   PetscFunctionReturn(0);
107 }
108 
109 /*@C
110    PetscSFSetType - Set the PetscSF communication implementation
111 
112    Collective on PetscSF
113 
114    Input Parameters:
115 +  sf - the PetscSF context
116 -  type - a known method
117 
118    Options Database Key:
119 .  -sf_type <type> - Sets the method; use -help for a list
120    of available methods (for instance, window, pt2pt, neighbor)
121 
122    Notes:
123    See "include/petscsf.h" for available methods (for instance)
124 +    PETSCSFWINDOW - MPI-2/3 one-sided
125 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
126 
127   Level: intermediate
128 
129 .seealso: PetscSFType, PetscSFCreate()
130 @*/
131 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
132 {
133   PetscErrorCode ierr,(*r)(PetscSF);
134   PetscBool      match;
135 
136   PetscFunctionBegin;
137   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
138   PetscValidCharPointer(type,2);
139 
140   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
141   if (match) PetscFunctionReturn(0);
142 
143   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
144   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
145   /* Destroy the previous PetscSF implementation context */
146   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
147   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
148   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
149   ierr = (*r)(sf);CHKERRQ(ierr);
150   PetscFunctionReturn(0);
151 }
152 
153 /*@C
154   PetscSFGetType - Get the PetscSF communication implementation
155 
156   Not Collective
157 
158   Input Parameter:
159 . sf  - the PetscSF context
160 
161   Output Parameter:
162 . type - the PetscSF type name
163 
164   Level: intermediate
165 
166 .seealso: PetscSFSetType(), PetscSFCreate()
167 @*/
168 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
169 {
170   PetscFunctionBegin;
171   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
172   PetscValidPointer(type,2);
173   *type = ((PetscObject)sf)->type_name;
174   PetscFunctionReturn(0);
175 }
176 
177 /*@
178    PetscSFDestroy - destroy star forest
179 
180    Collective
181 
182    Input Arguments:
183 .  sf - address of star forest
184 
185    Level: intermediate
186 
187 .seealso: PetscSFCreate(), PetscSFReset()
188 @*/
189 PetscErrorCode PetscSFDestroy(PetscSF *sf)
190 {
191   PetscErrorCode ierr;
192 
193   PetscFunctionBegin;
194   if (!*sf) PetscFunctionReturn(0);
195   PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
196   if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
197   ierr = PetscSFReset(*sf);CHKERRQ(ierr);
198   if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
199   ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
200   PetscFunctionReturn(0);
201 }
202 
203 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
204 {
205 #if defined(PETSC_USE_DEBUG)
206   PetscInt           i, nleaves;
207   PetscMPIInt        size;
208   const PetscInt    *ilocal;
209   const PetscSFNode *iremote;
210   PetscErrorCode     ierr;
211 
212   PetscFunctionBegin;
213   if (!sf->graphset) PetscFunctionReturn(0);
214   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
215   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
216   for (i = 0; i < nleaves; i++) {
217     const PetscInt rank = iremote[i].rank;
218     const PetscInt remote = iremote[i].index;
219     const PetscInt leaf = ilocal ? ilocal[i] : i;
220     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
221     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
222     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
223   }
224   PetscFunctionReturn(0);
225 #else
226   PetscFunctionBegin;
227   PetscFunctionReturn(0);
228 #endif
229 }
230 
231 /*@
232    PetscSFSetUp - set up communication structures
233 
234    Collective
235 
236    Input Arguments:
237 .  sf - star forest communication object
238 
239    Level: beginner
240 
241 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
242 @*/
243 PetscErrorCode PetscSFSetUp(PetscSF sf)
244 {
245   PetscErrorCode ierr;
246 
247   PetscFunctionBegin;
248   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
249   PetscSFCheckGraphSet(sf,1);
250   if (sf->setupcalled) PetscFunctionReturn(0);
251   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
252   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);}
253   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
254   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
255   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
256   sf->setupcalled = PETSC_TRUE;
257   PetscFunctionReturn(0);
258 }
259 
260 /*@
261    PetscSFSetFromOptions - set PetscSF options using the options database
262 
263    Logically Collective
264 
265    Input Arguments:
266 .  sf - star forest
267 
268    Options Database Keys:
269 +  -sf_type              - implementation type, see PetscSFSetType()
270 .  -sf_rank_order        - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
271 -  -sf_use_pinned_buffer - use pinned (nonpagable) memory for send/recv buffers on host when communicating GPU data but GPU-aware MPI is not used.
272                            Only available for SF types of basic and neighbor.
273 
274    Level: intermediate
275 
276 .seealso: PetscSFWindowSetSyncType()
277 @*/
278 PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
279 {
280   PetscSFType    deft;
281   char           type[256];
282   PetscErrorCode ierr;
283   PetscBool      flg;
284 
285   PetscFunctionBegin;
286   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
287   ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
288   deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
289   ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
290   ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
291   ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
292   if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
293   ierr = PetscOptionsEnd();CHKERRQ(ierr);
294   PetscFunctionReturn(0);
295 }
296 
297 /*@
298    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
299 
300    Logically Collective
301 
302    Input Arguments:
303 +  sf - star forest
304 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
305 
306    Level: advanced
307 
308 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
309 @*/
310 PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
311 {
312 
313   PetscFunctionBegin;
314   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
315   PetscValidLogicalCollectiveBool(sf,flg,2);
316   if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
317   sf->rankorder = flg;
318   PetscFunctionReturn(0);
319 }
320 
321 /*@
322    PetscSFSetGraph - Set a parallel star forest
323 
324    Collective
325 
326    Input Arguments:
327 +  sf - star forest
328 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
329 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
330 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
331 during setup in debug mode)
332 .  localmode - copy mode for ilocal
333 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
334 during setup in debug mode)
335 -  remotemode - copy mode for iremote
336 
337    Level: intermediate
338 
339    Notes:
340     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
341 
342    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
343    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
344    needed
345 
346    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
347    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
348 
349 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
350 @*/
351 PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
352 {
353   PetscErrorCode ierr;
354 
355   PetscFunctionBegin;
356   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
357   if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
358   if (nleaves > 0) PetscValidPointer(iremote,6);
359   if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
360   if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);
361 
362   ierr = PetscSFReset(sf);CHKERRQ(ierr);
363   ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
364 
365   sf->nroots  = nroots;
366   sf->nleaves = nleaves;
367 
368   if (nleaves && ilocal) {
369     PetscInt i;
370     PetscInt minleaf = PETSC_MAX_INT;
371     PetscInt maxleaf = PETSC_MIN_INT;
372     int      contiguous = 1;
373     for (i=0; i<nleaves; i++) {
374       minleaf = PetscMin(minleaf,ilocal[i]);
375       maxleaf = PetscMax(maxleaf,ilocal[i]);
376       contiguous &= (ilocal[i] == i);
377     }
378     sf->minleaf = minleaf;
379     sf->maxleaf = maxleaf;
380     if (contiguous) {
381       if (localmode == PETSC_OWN_POINTER) {
382         ierr = PetscFree(ilocal);CHKERRQ(ierr);
383       }
384       ilocal = NULL;
385     }
386   } else {
387     sf->minleaf = 0;
388     sf->maxleaf = nleaves - 1;
389   }
390 
391   if (ilocal) {
392     switch (localmode) {
393     case PETSC_COPY_VALUES:
394       ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
395       ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
396       sf->mine = sf->mine_alloc;
397       break;
398     case PETSC_OWN_POINTER:
399       sf->mine_alloc = (PetscInt*)ilocal;
400       sf->mine       = sf->mine_alloc;
401       break;
402     case PETSC_USE_POINTER:
403       sf->mine_alloc = NULL;
404       sf->mine       = (PetscInt*)ilocal;
405       break;
406     default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
407     }
408   }
409 
410   switch (remotemode) {
411   case PETSC_COPY_VALUES:
412     ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
413     ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
414     sf->remote = sf->remote_alloc;
415     break;
416   case PETSC_OWN_POINTER:
417     sf->remote_alloc = (PetscSFNode*)iremote;
418     sf->remote       = sf->remote_alloc;
419     break;
420   case PETSC_USE_POINTER:
421     sf->remote_alloc = NULL;
422     sf->remote       = (PetscSFNode*)iremote;
423     break;
424   default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
425   }
426 
427   ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
428   sf->graphset = PETSC_TRUE;
429   PetscFunctionReturn(0);
430 }
431 
432 /*@
433   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
434 
435   Collective
436 
437   Input Parameters:
438 + sf      - The PetscSF
439 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
440 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
441 
442   Notes:
443   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
444   n and N are local and global sizes of x respectively.
445 
446   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
447   sequential vectors y on all ranks.
448 
449   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
450   sequential vector y on rank 0.
451 
452   In above cases, entries of x are roots and entries of y are leaves.
453 
454   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
455   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
456   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
457   not mean one can not send multiple items. One just needs to create a new MPI datatype for the mulptiple data
458   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
459 
460   In this case, roots and leaves are symmetric.
461 
462   Level: intermediate
463  @*/
464 PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
465 {
466   MPI_Comm       comm;
467   PetscInt       n,N,res[2];
468   PetscMPIInt    rank,size;
469   PetscSFType    type;
470   PetscErrorCode ierr;
471 
472   PetscFunctionBegin;
473   ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
474   if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
475   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
476   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
477 
478   if (pattern == PETSCSF_PATTERN_ALLTOALL) {
479     type = PETSCSFALLTOALL;
480     ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
481     ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
482     ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
483     ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
484   } else {
485     ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
486     ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
487     res[0] = n;
488     res[1] = -n;
489     /* Check if n are same over all ranks so that we can optimize it */
490     ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
491     if (res[0] == -res[1]) { /* same n */
492       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
493     } else {
494       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
495     }
496     ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
497   }
498   ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);
499 
500   sf->pattern = pattern;
501   sf->mine    = NULL; /* Contiguous */
502 
503   /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
504      Also set other easy stuff.
505    */
506   if (pattern == PETSCSF_PATTERN_ALLGATHER) {
507     sf->nleaves      = N;
508     sf->nroots       = n;
509     sf->nranks       = size;
510     sf->minleaf      = 0;
511     sf->maxleaf      = N - 1;
512   } else if (pattern == PETSCSF_PATTERN_GATHER) {
513     sf->nleaves      = rank ? 0 : N;
514     sf->nroots       = n;
515     sf->nranks       = rank ? 0 : size;
516     sf->minleaf      = 0;
517     sf->maxleaf      = rank ? -1 : N - 1;
518   } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
519     sf->nleaves      = size;
520     sf->nroots       = size;
521     sf->nranks       = size;
522     sf->minleaf      = 0;
523     sf->maxleaf      = size - 1;
524   }
525   sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
526   sf->graphset = PETSC_TRUE;
527   PetscFunctionReturn(0);
528 }
529 
530 /*@
531    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
532 
533    Collective
534 
535    Input Arguments:
536 
537 .  sf - star forest to invert
538 
539    Output Arguments:
540 .  isf - inverse of sf
541    Level: advanced
542 
543    Notes:
544    All roots must have degree 1.
545 
546    The local space may be a permutation, but cannot be sparse.
547 
548 .seealso: PetscSFSetGraph()
549 @*/
550 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
551 {
552   PetscErrorCode ierr;
553   PetscMPIInt    rank;
554   PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
555   const PetscInt *ilocal;
556   PetscSFNode    *roots,*leaves;
557 
558   PetscFunctionBegin;
559   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
560   PetscSFCheckGraphSet(sf,1);
561   PetscValidPointer(isf,2);
562 
563   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
564   maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
565 
566   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
567   ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
568   for (i=0; i<maxlocal; i++) {
569     leaves[i].rank  = rank;
570     leaves[i].index = i;
571   }
572   for (i=0; i <nroots; i++) {
573     roots[i].rank  = -1;
574     roots[i].index = -1;
575   }
576   ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
577   ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
578 
579   /* Check whether our leaves are sparse */
580   for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
581   if (count == nroots) newilocal = NULL;
582   else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
583     ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
584     for (i=0,count=0; i<nroots; i++) {
585       if (roots[i].rank >= 0) {
586         newilocal[count]   = i;
587         roots[count].rank  = roots[i].rank;
588         roots[count].index = roots[i].index;
589         count++;
590       }
591     }
592   }
593 
594   ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
595   ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
596   ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
597   PetscFunctionReturn(0);
598 }
599 
600 /*@
601    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
602 
603    Collective
604 
605    Input Arguments:
606 +  sf - communication object to duplicate
607 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
608 
609    Output Arguments:
610 .  newsf - new communication object
611 
612    Level: beginner
613 
614 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
615 @*/
616 PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
617 {
618   PetscSFType    type;
619   PetscErrorCode ierr;
620 
621   PetscFunctionBegin;
622   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
623   PetscValidLogicalCollectiveEnum(sf,opt,2);
624   PetscValidPointer(newsf,3);
625   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
626   ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
627   if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
628   if (opt == PETSCSF_DUPLICATE_GRAPH) {
629     PetscSFCheckGraphSet(sf,1);
630     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
631       PetscInt          nroots,nleaves;
632       const PetscInt    *ilocal;
633       const PetscSFNode *iremote;
634       ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
635       ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
636     } else {
637       ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
638     }
639   }
640   if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
641   PetscFunctionReturn(0);
642 }
643 
644 /*@C
645    PetscSFGetGraph - Get the graph specifying a parallel star forest
646 
647    Not Collective
648 
649    Input Arguments:
650 .  sf - star forest
651 
652    Output Arguments:
653 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
654 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
655 .  ilocal - locations of leaves in leafdata buffers
656 -  iremote - remote locations of root vertices for each leaf on the current process
657 
658    Notes:
659    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
660 
661    When called from Fortran, the returned iremote array is a copy and must deallocated after use. Consequently, if you
662    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
663 
664    Level: intermediate
665 
666 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
667 @*/
668 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
669 {
670   PetscErrorCode ierr;
671 
672   PetscFunctionBegin;
673   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
674   if (sf->ops->GetGraph) {
675     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
676   } else {
677     if (nroots) *nroots = sf->nroots;
678     if (nleaves) *nleaves = sf->nleaves;
679     if (ilocal) *ilocal = sf->mine;
680     if (iremote) *iremote = sf->remote;
681   }
682   PetscFunctionReturn(0);
683 }
684 
685 /*@
686    PetscSFGetLeafRange - Get the active leaf ranges
687 
688    Not Collective
689 
690    Input Arguments:
691 .  sf - star forest
692 
693    Output Arguments:
694 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
695 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
696 
697    Level: developer
698 
699 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
700 @*/
701 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
702 {
703 
704   PetscFunctionBegin;
705   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
706   PetscSFCheckGraphSet(sf,1);
707   if (minleaf) *minleaf = sf->minleaf;
708   if (maxleaf) *maxleaf = sf->maxleaf;
709   PetscFunctionReturn(0);
710 }
711 
712 /*@C
713    PetscSFViewFromOptions - View from Options
714 
715    Collective on PetscSF
716 
717    Input Parameters:
718 +  A - the star forest
719 .  obj - Optional object
720 -  name - command line option
721 
722    Level: intermediate
723 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
724 @*/
725 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
726 {
727   PetscErrorCode ierr;
728 
729   PetscFunctionBegin;
730   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
731   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
732   PetscFunctionReturn(0);
733 }
734 
735 /*@C
736    PetscSFView - view a star forest
737 
738    Collective
739 
740    Input Arguments:
741 +  sf - star forest
742 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
743 
744    Level: beginner
745 
746 .seealso: PetscSFCreate(), PetscSFSetGraph()
747 @*/
748 PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
749 {
750   PetscErrorCode    ierr;
751   PetscBool         iascii;
752   PetscViewerFormat format;
753 
754   PetscFunctionBegin;
755   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
756   if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
757   PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
758   PetscCheckSameComm(sf,1,viewer,2);
759   if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
760   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
761   if (iascii) {
762     PetscMPIInt rank;
763     PetscInt    ii,i,j;
764 
765     ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
766     ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
767     if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
768     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
769       if (!sf->graphset) {
770         ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
771         ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
772         PetscFunctionReturn(0);
773       }
774       ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
775       ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
776       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
777       for (i=0; i<sf->nleaves; i++) {
778         ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
779       }
780       ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
781       ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
782       if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
783         PetscMPIInt *tmpranks,*perm;
784         ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
785         ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
786         for (i=0; i<sf->nranks; i++) perm[i] = i;
787         ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
788         ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
789         for (ii=0; ii<sf->nranks; ii++) {
790           i = perm[ii];
791           ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
792           for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
793             ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
794           }
795         }
796         ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
797       }
798       ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
799       ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
800     }
801     ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
802   }
803   PetscFunctionReturn(0);
804 }
805 
806 /*@C
807    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
808 
809    Not Collective
810 
811    Input Arguments:
812 .  sf - star forest
813 
814    Output Arguments:
815 +  nranks - number of ranks referenced by local part
816 .  ranks - array of ranks
817 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
818 .  rmine - concatenated array holding local indices referencing each remote rank
819 -  rremote - concatenated array holding remote indices referenced for each remote rank
820 
821    Level: developer
822 
823 .seealso: PetscSFGetLeafRanks()
824 @*/
825 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
826 {
827   PetscErrorCode ierr;
828 
829   PetscFunctionBegin;
830   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
831   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
832   if (sf->ops->GetRootRanks) {
833     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
834   } else {
835     /* The generic implementation */
836     if (nranks)  *nranks  = sf->nranks;
837     if (ranks)   *ranks   = sf->ranks;
838     if (roffset) *roffset = sf->roffset;
839     if (rmine)   *rmine   = sf->rmine;
840     if (rremote) *rremote = sf->rremote;
841   }
842   PetscFunctionReturn(0);
843 }
844 
845 /*@C
846    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
847 
848    Not Collective
849 
850    Input Arguments:
851 .  sf - star forest
852 
853    Output Arguments:
854 +  niranks - number of leaf ranks referencing roots on this process
855 .  iranks - array of ranks
856 .  ioffset - offset in irootloc for each rank (length niranks+1)
857 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
858 
859    Level: developer
860 
861 .seealso: PetscSFGetRootRanks()
862 @*/
863 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
864 {
865   PetscErrorCode ierr;
866 
867   PetscFunctionBegin;
868   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
869   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
870   if (sf->ops->GetLeafRanks) {
871     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
872   } else {
873     PetscSFType type;
874     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
875     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
876   }
877   PetscFunctionReturn(0);
878 }
879 
880 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
881   PetscInt i;
882   for (i=0; i<n; i++) {
883     if (needle == list[i]) return PETSC_TRUE;
884   }
885   return PETSC_FALSE;
886 }
887 
888 /*@C
889    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.
890 
891    Collective
892 
893    Input Arguments:
894 +  sf - PetscSF to set up; PetscSFSetGraph() must have been called
895 -  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)
896 
897    Level: developer
898 
899 .seealso: PetscSFGetRootRanks()
900 @*/
901 PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
902 {
903   PetscErrorCode     ierr;
904   PetscTable         table;
905   PetscTablePosition pos;
906   PetscMPIInt        size,groupsize,*groupranks;
907   PetscInt           *rcount,*ranks;
908   PetscInt           i, irank = -1,orank = -1;
909 
910   PetscFunctionBegin;
911   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
912   PetscSFCheckGraphSet(sf,1);
913   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
914   ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
915   for (i=0; i<sf->nleaves; i++) {
916     /* Log 1-based rank */
917     ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
918   }
919   ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
920   ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
921   ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
922   ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
923   for (i=0; i<sf->nranks; i++) {
924     ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
925     ranks[i]--;             /* Convert back to 0-based */
926   }
927   ierr = PetscTableDestroy(&table);CHKERRQ(ierr);
928 
929   /* We expect that dgroup is reliably "small" while nranks could be large */
930   {
931     MPI_Group group = MPI_GROUP_NULL;
932     PetscMPIInt *dgroupranks;
933     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
934     ierr = MPI_Group_size(dgroup,&groupsize);CHKERRQ(ierr);
935     ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
936     ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
937     for (i=0; i<groupsize; i++) dgroupranks[i] = i;
938     if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRQ(ierr);}
939     ierr = MPI_Group_free(&group);CHKERRQ(ierr);
940     ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
941   }
942 
943   /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
944   for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i; ) {
945     for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
946       if (InList(ranks[i],groupsize,groupranks)) break;
947     }
948     for ( ; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
949       if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
950     }
951     if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
952       PetscInt    tmprank,tmpcount;
953 
954       tmprank             = ranks[i];
955       tmpcount            = rcount[i];
956       ranks[i]            = ranks[sf->ndranks];
957       rcount[i]           = rcount[sf->ndranks];
958       ranks[sf->ndranks]  = tmprank;
959       rcount[sf->ndranks] = tmpcount;
960       sf->ndranks++;
961     }
962   }
963   ierr = PetscFree(groupranks);CHKERRQ(ierr);
964   ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
965   ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
966   sf->roffset[0] = 0;
967   for (i=0; i<sf->nranks; i++) {
968     ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
969     sf->roffset[i+1] = sf->roffset[i] + rcount[i];
970     rcount[i]        = 0;
971   }
972   for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
973     /* short circuit */
974     if (orank != sf->remote[i].rank) {
975       /* Search for index of iremote[i].rank in sf->ranks */
976       ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
977       if (irank < 0) {
978         ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
979         if (irank >= 0) irank += sf->ndranks;
980       }
981       orank = sf->remote[i].rank;
982     }
983     if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
984     sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
985     sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
986     rcount[irank]++;
987   }
988   ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
989   PetscFunctionReturn(0);
990 }
991 
992 /*@C
993    PetscSFGetGroups - gets incoming and outgoing process groups
994 
995    Collective
996 
997    Input Argument:
998 .  sf - star forest
999 
1000    Output Arguments:
1001 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1002 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1003 
1004    Level: developer
1005 
1006 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
1007 @*/
1008 PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
1009 {
1010   PetscErrorCode ierr;
1011   MPI_Group      group = MPI_GROUP_NULL;
1012 
1013   PetscFunctionBegin;
1014   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining groups");
1015   if (sf->ingroup == MPI_GROUP_NULL) {
1016     PetscInt       i;
1017     const PetscInt *indegree;
1018     PetscMPIInt    rank,*outranks,*inranks;
1019     PetscSFNode    *remote;
1020     PetscSF        bgcount;
1021 
1022     /* Compute the number of incoming ranks */
1023     ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
1024     for (i=0; i<sf->nranks; i++) {
1025       remote[i].rank  = sf->ranks[i];
1026       remote[i].index = 0;
1027     }
1028     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
1029     ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1030     ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
1031     ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
1032 
1033     /* Enumerate the incoming ranks */
1034     ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
1035     ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
1036     for (i=0; i<sf->nranks; i++) outranks[i] = rank;
1037     ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
1038     ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
1039     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
1040     ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRQ(ierr);
1041     ierr = MPI_Group_free(&group);CHKERRQ(ierr);
1042     ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
1043     ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
1044   }
1045   *incoming = sf->ingroup;
1046 
1047   if (sf->outgroup == MPI_GROUP_NULL) {
1048     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
1049     ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRQ(ierr);
1050     ierr = MPI_Group_free(&group);CHKERRQ(ierr);
1051   }
1052   *outgoing = sf->outgroup;
1053   PetscFunctionReturn(0);
1054 }
1055 
1056 /*@
1057    PetscSFGetMultiSF - gets the inner SF implemeting gathers and scatters
1058 
1059    Collective
1060 
1061    Input Argument:
1062 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1063 
1064    Output Arguments:
1065 .  multi - star forest with split roots, such that each root has degree exactly 1
1066 
1067    Level: developer
1068 
1069    Notes:
1070 
1071    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1072    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1073    edge, it is a candidate for future optimization that might involve its removal.
1074 
1075 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1076 @*/
1077 PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
1078 {
1079   PetscErrorCode ierr;
1080 
1081   PetscFunctionBegin;
1082   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1083   PetscValidPointer(multi,2);
1084   if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
1085     ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1086     *multi = sf->multi;
1087     PetscFunctionReturn(0);
1088   }
1089   if (!sf->multi) {
1090     const PetscInt *indegree;
1091     PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
1092     PetscSFNode    *remote;
1093     maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
1094     ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
1095     ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
1096     ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
1097     inoffset[0] = 0;
1098     for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
1099     for (i=0; i<maxlocal; i++) outones[i] = 1;
1100     ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1101     ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1102     for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
1103 #if 0
1104 #if defined(PETSC_USE_DEBUG)                                 /* Check that the expected number of increments occurred */
1105     for (i=0; i<sf->nroots; i++) {
1106       if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
1107     }
1108 #endif
1109 #endif
1110     ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
1111     for (i=0; i<sf->nleaves; i++) {
1112       remote[i].rank  = sf->remote[i].rank;
1113       remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
1114     }
1115     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1116     ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1117     if (sf->rankorder) {        /* Sort the ranks */
1118       PetscMPIInt rank;
1119       PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
1120       PetscSFNode *newremote;
1121       ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
1122       for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
1123       ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
1124       for (i=0; i<maxlocal; i++) outranks[i] = rank;
1125       ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
1126       ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
1127       /* Sort the incoming ranks at each vertex, build the inverse map */
1128       for (i=0; i<sf->nroots; i++) {
1129         PetscInt j;
1130         for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
1131         ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
1132         for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
1133       }
1134       ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
1135       ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
1136       ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
1137       for (i=0; i<sf->nleaves; i++) {
1138         newremote[i].rank  = sf->remote[i].rank;
1139         newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
1140       }
1141       ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1142       ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
1143     }
1144     ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
1145   }
1146   *multi = sf->multi;
1147   PetscFunctionReturn(0);
1148 }
1149 
1150 /*@C
1151    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1152 
1153    Collective
1154 
1155    Input Arguments:
1156 +  sf - original star forest
1157 .  nselected  - number of selected roots on this process
1158 -  selected   - indices of the selected roots on this process
1159 
1160    Output Arguments:
1161 .  newsf - new star forest
1162 
1163    Level: advanced
1164 
1165    Note:
1166    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1167    be done by calling PetscSFGetGraph().
1168 
1169 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1170 @*/
1171 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1172 {
1173   PetscInt          i,n,*roots,*rootdata,*leafdata,nroots,nleaves,connected_leaves,*new_ilocal;
1174   const PetscSFNode *iremote;
1175   PetscSFNode       *new_iremote;
1176   PetscSF           tmpsf;
1177   MPI_Comm          comm;
1178   PetscErrorCode    ierr;
1179 
1180   PetscFunctionBegin;
1181   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1182   PetscSFCheckGraphSet(sf,1);
1183   if (nselected) PetscValidPointer(selected,3);
1184   PetscValidPointer(newsf,4);
1185 
1186   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1187 
1188   /* Uniq selected[] and put results in roots[] */
1189   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1190   ierr = PetscMalloc1(nselected,&roots);CHKERRQ(ierr);
1191   ierr = PetscArraycpy(roots,selected,nselected);CHKERRQ(ierr);
1192   ierr = PetscSortedRemoveDupsInt(&nselected,roots);CHKERRQ(ierr);
1193   if (nselected && (roots[0] < 0 || roots[nselected-1] >= sf->nroots)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max root indices %D/%D are not in [0,%D)",roots[0],roots[nselected-1],sf->nroots);
1194 
1195   if (sf->ops->CreateEmbeddedSF) {
1196     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,roots,newsf);CHKERRQ(ierr);
1197   } else {
1198     /* A generic version of creating embedded sf. Note that we called PetscSFSetGraph() twice, which is certainly expensive */
1199     /* Find out which leaves (not leaf data items) are still connected to roots in the embedded sf */
1200     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,NULL,&iremote);CHKERRQ(ierr);
1201     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&tmpsf);CHKERRQ(ierr);
1202     ierr = PetscSFSetGraph(tmpsf,nroots,nleaves,NULL/*contiguous*/,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);CHKERRQ(ierr);
1203     ierr = PetscCalloc2(nroots,&rootdata,nleaves,&leafdata);CHKERRQ(ierr);
1204     for (i=0; i<nselected; ++i) rootdata[roots[i]] = 1;
1205     ierr = PetscSFBcastBegin(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1206     ierr = PetscSFBcastEnd(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1207     ierr = PetscSFDestroy(&tmpsf);CHKERRQ(ierr);
1208 
1209     /* Build newsf with leaves that are still connected */
1210     connected_leaves = 0;
1211     for (i=0; i<nleaves; ++i) connected_leaves += leafdata[i];
1212     ierr = PetscMalloc1(connected_leaves,&new_ilocal);CHKERRQ(ierr);
1213     ierr = PetscMalloc1(connected_leaves,&new_iremote);CHKERRQ(ierr);
1214     for (i=0, n=0; i<nleaves; ++i) {
1215       if (leafdata[i]) {
1216         new_ilocal[n]        = sf->mine ? sf->mine[i] : i;
1217         new_iremote[n].rank  = sf->remote[i].rank;
1218         new_iremote[n].index = sf->remote[i].index;
1219         ++n;
1220       }
1221     }
1222 
1223     if (n != connected_leaves) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"There is a size mismatch in the SF embedding, %d != %d",n,connected_leaves);
1224     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1225     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1226     ierr = PetscSFSetGraph(*newsf,nroots,connected_leaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1227     ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
1228   }
1229   ierr = PetscFree(roots);CHKERRQ(ierr);
1230   PetscFunctionReturn(0);
1231 }
1232 
1233 /*@C
1234   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1235 
1236   Collective
1237 
1238   Input Arguments:
1239 + sf - original star forest
1240 . nselected  - number of selected leaves on this process
1241 - selected   - indices of the selected leaves on this process
1242 
1243   Output Arguments:
1244 .  newsf - new star forest
1245 
1246   Level: advanced
1247 
1248 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1249 @*/
1250 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1251 {
1252   const PetscSFNode *iremote;
1253   PetscSFNode       *new_iremote;
1254   const PetscInt    *ilocal;
1255   PetscInt          i,nroots,*leaves,*new_ilocal;
1256   MPI_Comm          comm;
1257   PetscErrorCode    ierr;
1258 
1259   PetscFunctionBegin;
1260   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1261   PetscSFCheckGraphSet(sf,1);
1262   if (nselected) PetscValidPointer(selected,3);
1263   PetscValidPointer(newsf,4);
1264 
1265   /* Uniq selected[] and put results in leaves[] */
1266   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1267   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1268   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1269   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1270   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1271 
1272   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1273   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1274     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1275   } else {
1276     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1277     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1278     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1279     for (i=0; i<nselected; ++i) {
1280       const PetscInt l     = leaves[i];
1281       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1282       new_iremote[i].rank  = iremote[l].rank;
1283       new_iremote[i].index = iremote[l].index;
1284     }
1285     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1286     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1287     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1288   }
1289   ierr = PetscFree(leaves);CHKERRQ(ierr);
1290   PetscFunctionReturn(0);
1291 }
1292 
1293 /*@C
1294    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1295 
1296    Collective on PetscSF
1297 
1298    Input Arguments:
1299 +  sf - star forest on which to communicate
1300 .  unit - data type associated with each node
1301 .  rootdata - buffer to broadcast
1302 -  op - operation to use for reduction
1303 
1304    Output Arguments:
1305 .  leafdata - buffer to be reduced with values from each leaf's respective root
1306 
1307    Level: intermediate
1308 
1309 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd()
1310 @*/
1311 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1312 {
1313   PetscErrorCode ierr;
1314   PetscMemType   rootmtype,leafmtype;
1315 
1316   PetscFunctionBegin;
1317   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1318   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1319   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1320   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1321   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1322 #if defined(PETSC_HAVE_CUDA)
1323   /*  Shall we assume rootdata, leafdata are ready to use (instead of being computed by some asynchronous kernels)?
1324     To be similar to MPI, I'd like to have this assumption, since MPI does not have a concept of stream.
1325     But currently this assumption is not enforecd in Petsc. To be safe, I do synchronization here. Otherwise, if
1326     we do not sync now and call the Pack kernel directly on the default NULL stream (assume petsc objects are also
1327     computed on it), we have to sync the NULL stream before calling MPI routines. So, it looks a cudaDeviceSynchronize
1328     is inevitable. We do it now and put pack/unpack kernels to non-NULL streams.
1329    */
1330   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1331 #endif
1332   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1333   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1334   PetscFunctionReturn(0);
1335 }
1336 
1337 /*@C
1338    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1339 
1340    Collective
1341 
1342    Input Arguments:
1343 +  sf - star forest
1344 .  unit - data type
1345 .  rootdata - buffer to broadcast
1346 -  op - operation to use for reduction
1347 
1348    Output Arguments:
1349 .  leafdata - buffer to be reduced with values from each leaf's respective root
1350 
1351    Level: intermediate
1352 
1353 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1354 @*/
1355 PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1356 {
1357   PetscErrorCode ierr;
1358   PetscMemType   rootmtype,leafmtype;
1359 
1360   PetscFunctionBegin;
1361   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1362   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1363   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1364   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1365   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1366   ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1367   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1368   PetscFunctionReturn(0);
1369 }
1370 
1371 /*@C
1372    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1373 
1374    Collective
1375 
1376    Input Arguments:
1377 +  sf - star forest
1378 .  unit - data type
1379 .  leafdata - values to reduce
1380 -  op - reduction operation
1381 
1382    Output Arguments:
1383 .  rootdata - result of reduction of values from all leaves of each root
1384 
1385    Level: intermediate
1386 
1387 .seealso: PetscSFBcastBegin()
1388 @*/
1389 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1390 {
1391   PetscErrorCode ierr;
1392   PetscMemType   rootmtype,leafmtype;
1393 
1394   PetscFunctionBegin;
1395   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1396   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1397   ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1398   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1399   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1400 #if defined(PETSC_HAVE_CUDA)
1401   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1402 #endif
1403   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1404   ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1405   PetscFunctionReturn(0);
1406 }
1407 
1408 /*@C
1409    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1410 
1411    Collective
1412 
1413    Input Arguments:
1414 +  sf - star forest
1415 .  unit - data type
1416 .  leafdata - values to reduce
1417 -  op - reduction operation
1418 
1419    Output Arguments:
1420 .  rootdata - result of reduction of values from all leaves of each root
1421 
1422    Level: intermediate
1423 
1424 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1425 @*/
1426 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1427 {
1428   PetscErrorCode ierr;
1429   PetscMemType   rootmtype,leafmtype;
1430 
1431   PetscFunctionBegin;
1432   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1433   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1434   ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1435   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1436   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1437   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1438   ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1439   PetscFunctionReturn(0);
1440 }
1441 
1442 /*@C
1443    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1444 
1445    Collective
1446 
1447    Input Arguments:
1448 +  sf - star forest
1449 .  unit - data type
1450 .  leafdata - leaf values to use in reduction
1451 -  op - operation to use for reduction
1452 
1453    Output Arguments:
1454 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1455 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1456 
1457    Level: advanced
1458 
1459    Note:
1460    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1461    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1462    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1463    integers.
1464 
1465 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1466 @*/
1467 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1468 {
1469   PetscErrorCode ierr;
1470   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1471 
1472   PetscFunctionBegin;
1473   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1474   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1475   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1476   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1477   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1478   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1479   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1480 #if defined(PETSC_HAVE_CUDA)
1481   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1482 #endif
1483   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1484   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1485   PetscFunctionReturn(0);
1486 }
1487 
1488 /*@C
1489    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1490 
1491    Collective
1492 
1493    Input Arguments:
1494 +  sf - star forest
1495 .  unit - data type
1496 .  leafdata - leaf values to use in reduction
1497 -  op - operation to use for reduction
1498 
1499    Output Arguments:
1500 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1501 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1502 
1503    Level: advanced
1504 
1505 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1506 @*/
1507 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1508 {
1509   PetscErrorCode ierr;
1510   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1511 
1512   PetscFunctionBegin;
1513   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1514   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1515   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1516   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1517   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1518   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1519   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1520   ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1521   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1522   PetscFunctionReturn(0);
1523 }
1524 
1525 /*@C
1526    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()
1527 
1528    Collective
1529 
1530    Input Arguments:
1531 .  sf - star forest
1532 
1533    Output Arguments:
1534 .  degree - degree of each root vertex
1535 
1536    Level: advanced
1537 
1538    Notes:
1539    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1540 
1541 .seealso: PetscSFGatherBegin()
1542 @*/
1543 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
1544 {
1545   PetscErrorCode ierr;
1546 
1547   PetscFunctionBegin;
1548   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1549   PetscSFCheckGraphSet(sf,1);
1550   PetscValidPointer(degree,2);
1551   if (!sf->degreeknown) {
1552     PetscInt i, nroots = sf->nroots, maxlocal = sf->maxleaf+1;  /* TODO: We should use PetscSFGetLeafRange() */
1553     if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1554     ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
1555     ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1556     for (i=0; i<nroots; i++) sf->degree[i] = 0;
1557     for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
1558     ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
1559   }
1560   *degree = NULL;
1561   PetscFunctionReturn(0);
1562 }
1563 
1564 /*@C
1565    PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()
1566 
1567    Collective
1568 
1569    Input Arguments:
1570 .  sf - star forest
1571 
1572    Output Arguments:
1573 .  degree - degree of each root vertex
1574 
1575    Level: developer
1576 
1577    Notes:
1578    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1579 
1580 .seealso:
1581 @*/
1582 PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
1583 {
1584   PetscErrorCode ierr;
1585 
1586   PetscFunctionBegin;
1587   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1588   PetscSFCheckGraphSet(sf,1);
1589   PetscValidPointer(degree,2);
1590   if (!sf->degreeknown) {
1591     if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
1592     ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
1593     ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
1594     sf->degreeknown = PETSC_TRUE;
1595   }
1596   *degree = sf->degree;
1597   PetscFunctionReturn(0);
1598 }
1599 
1600 
1601 /*@C
1602    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1603    Each multi-root is assigned index of the corresponding original root.
1604 
1605    Collective
1606 
1607    Input Arguments:
1608 +  sf - star forest
1609 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1610 
1611    Output Arguments:
1612 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1613 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1614 
1615    Level: developer
1616 
1617    Notes:
1618    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1619 
1620 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1621 @*/
1622 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1623 {
1624   PetscSF             msf;
1625   PetscInt            i, j, k, nroots, nmroots;
1626   PetscErrorCode      ierr;
1627 
1628   PetscFunctionBegin;
1629   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1630   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1631   if (nroots) PetscValidIntPointer(degree,2);
1632   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1633   PetscValidPointer(multiRootsOrigNumbering,4);
1634   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1635   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1636   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1637   for (i=0,j=0,k=0; i<nroots; i++) {
1638     if (!degree[i]) continue;
1639     for (j=0; j<degree[i]; j++,k++) {
1640       (*multiRootsOrigNumbering)[k] = i;
1641     }
1642   }
1643 #if defined(PETSC_USE_DEBUG)
1644   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1645 #endif
1646   if (nMultiRoots) *nMultiRoots = nmroots;
1647   PetscFunctionReturn(0);
1648 }
1649 
1650 /*@C
1651    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1652 
1653    Collective
1654 
1655    Input Arguments:
1656 +  sf - star forest
1657 .  unit - data type
1658 -  leafdata - leaf data to gather to roots
1659 
1660    Output Argument:
1661 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1662 
1663    Level: intermediate
1664 
1665 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1666 @*/
1667 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1668 {
1669   PetscErrorCode ierr;
1670   PetscSF        multi;
1671 
1672   PetscFunctionBegin;
1673   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1674   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1675   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1676   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1677   PetscFunctionReturn(0);
1678 }
1679 
1680 /*@C
1681    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1682 
1683    Collective
1684 
1685    Input Arguments:
1686 +  sf - star forest
1687 .  unit - data type
1688 -  leafdata - leaf data to gather to roots
1689 
1690    Output Argument:
1691 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1692 
1693    Level: intermediate
1694 
1695 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1696 @*/
1697 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1698 {
1699   PetscErrorCode ierr;
1700   PetscSF        multi;
1701 
1702   PetscFunctionBegin;
1703   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1704   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1705   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1706   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1707   PetscFunctionReturn(0);
1708 }
1709 
1710 /*@C
1711    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1712 
1713    Collective
1714 
1715    Input Arguments:
1716 +  sf - star forest
1717 .  unit - data type
1718 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1719 
1720    Output Argument:
1721 .  leafdata - leaf data to be update with personal data from each respective root
1722 
1723    Level: intermediate
1724 
1725 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1726 @*/
1727 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1728 {
1729   PetscErrorCode ierr;
1730   PetscSF        multi;
1731 
1732   PetscFunctionBegin;
1733   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1734   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1735   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1736   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1737   PetscFunctionReturn(0);
1738 }
1739 
1740 /*@C
1741    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1742 
1743    Collective
1744 
1745    Input Arguments:
1746 +  sf - star forest
1747 .  unit - data type
1748 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1749 
1750    Output Argument:
1751 .  leafdata - leaf data to be update with personal data from each respective root
1752 
1753    Level: intermediate
1754 
1755 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1756 @*/
1757 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1758 {
1759   PetscErrorCode ierr;
1760   PetscSF        multi;
1761 
1762   PetscFunctionBegin;
1763   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1764   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1765   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1766   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1767   PetscFunctionReturn(0);
1768 }
1769 
1770 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1771 {
1772 #if defined(PETSC_USE_DEBUG)
1773   PetscInt        i, n, nleaves;
1774   const PetscInt *ilocal = NULL;
1775   PetscHSetI      seen;
1776   PetscErrorCode  ierr;
1777 
1778   PetscFunctionBegin;
1779   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1780   ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1781   for (i = 0; i < nleaves; i++) {
1782     const PetscInt leaf = ilocal ? ilocal[i] : i;
1783     ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1784   }
1785   ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1786   if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1787   ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1788   PetscFunctionReturn(0);
1789 #else
1790   PetscFunctionBegin;
1791   PetscFunctionReturn(0);
1792 #endif
1793 }
1794 
1795 /*@
1796   PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view
1797 
1798   Input Parameters:
1799 + sfA - The first PetscSF
1800 - sfB - The second PetscSF
1801 
1802   Output Parameters:
1803 . sfBA - The composite SF
1804 
1805   Level: developer
1806 
1807   Notes:
1808   Currently, the two SFs must be defined on congruent communicators and they must be true star
1809   forests, i.e. the same leaf is not connected with different roots.
1810 
1811   sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
1812   a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
1813   nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
1814   Bcast on sfA, then a Bcast on sfB, on connected nodes.
1815 
1816 .seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
1817 @*/
1818 PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
1819 {
1820   PetscErrorCode    ierr;
1821   const PetscSFNode *remotePointsA,*remotePointsB;
1822   PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
1823   const PetscInt    *localPointsA,*localPointsB;
1824   PetscInt          *localPointsBA;
1825   PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
1826   PetscBool         denseB;
1827 
1828   PetscFunctionBegin;
1829   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
1830   PetscSFCheckGraphSet(sfA,1);
1831   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
1832   PetscSFCheckGraphSet(sfB,2);
1833   PetscCheckSameComm(sfA,1,sfB,2);
1834   PetscValidPointer(sfBA,3);
1835   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
1836   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
1837 
1838   ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
1839   ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
1840   if (localPointsA) {
1841     ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
1842     for (i=0; i<numRootsB; i++) {
1843       reorderedRemotePointsA[i].rank = -1;
1844       reorderedRemotePointsA[i].index = -1;
1845     }
1846     for (i=0; i<numLeavesA; i++) {
1847       if (localPointsA[i] >= numRootsB) continue;
1848       reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
1849     }
1850     remotePointsA = reorderedRemotePointsA;
1851   }
1852   ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
1853   ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
1854   ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
1855   ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
1856   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
1857 
1858   denseB = (PetscBool)!localPointsB;
1859   for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
1860     if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
1861     else numLeavesBA++;
1862   }
1863   if (denseB) {
1864     localPointsBA  = NULL;
1865     remotePointsBA = leafdataB;
1866   } else {
1867     ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
1868     ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
1869     for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
1870       const PetscInt l = localPointsB ? localPointsB[i] : i;
1871 
1872       if (leafdataB[l-minleaf].rank == -1) continue;
1873       remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
1874       localPointsBA[numLeavesBA] = l;
1875       numLeavesBA++;
1876     }
1877     ierr = PetscFree(leafdataB);CHKERRQ(ierr);
1878   }
1879   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
1880   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
1881   PetscFunctionReturn(0);
1882 }
1883 
1884 /*@
1885   PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one
1886 
1887   Input Parameters:
1888 + sfA - The first PetscSF
1889 - sfB - The second PetscSF
1890 
1891   Output Parameters:
1892 . sfBA - The composite SF.
1893 
1894   Level: developer
1895 
1896   Notes:
1897   Currently, the two SFs must be defined on congruent communicators and they must be true star
1898   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
1899   second SF must have a degree of 1, i.e., no roots have more than one leaf connected.
1900 
1901   sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
1902   a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
1903   roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
1904   a Reduce on sfB, on connected roots.
1905 
1906 .seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
1907 @*/
1908 PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
1909 {
1910   PetscErrorCode    ierr;
1911   const PetscSFNode *remotePointsA,*remotePointsB;
1912   PetscSFNode       *remotePointsBA;
1913   const PetscInt    *localPointsA,*localPointsB;
1914   PetscSFNode       *reorderedRemotePointsA = NULL;
1915   PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
1916 
1917   PetscFunctionBegin;
1918   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
1919   PetscSFCheckGraphSet(sfA,1);
1920   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
1921   PetscSFCheckGraphSet(sfB,2);
1922   PetscCheckSameComm(sfA,1,sfB,2);
1923   PetscValidPointer(sfBA,3);
1924   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
1925   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
1926   /* TODO: Check roots of sfB have degree of 1 */
1927 
1928   ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
1929   ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);
1930   ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
1931   ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
1932   for (i=0; i<maxleaf - minleaf + 1; i++) {
1933     reorderedRemotePointsA[i].rank = -1;
1934     reorderedRemotePointsA[i].index = -1;
1935   }
1936   if (localPointsA) {
1937     for (i=0; i<numLeavesA; i++) {
1938       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
1939       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
1940     }
1941   } else {
1942     for (i=0; i<numLeavesA; i++) {
1943       if (i > maxleaf || i < minleaf) continue;
1944       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
1945     }
1946   }
1947 
1948   ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
1949   ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
1950   for (i=0; i<numRootsB; i++) {
1951     remotePointsBA[i].rank = -1;
1952     remotePointsBA[i].index = -1;
1953   }
1954 
1955   /* Once we implement the TODO above (check all roots of sfB have degree of 1), we can replace the MPI_MAXLOC
1956      with MPIU_REPLACE. In that case, MPI_MAXLOC and MPIU_REPLACE have the same effect.
1957    */
1958   ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,MPI_MAXLOC);CHKERRQ(ierr);
1959   ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,MPI_MAXLOC);CHKERRQ(ierr);
1960   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
1961   for (i=0,numLeavesBA=0; i<numRootsB; i++) {
1962     if (remotePointsBA[i].rank == -1) continue;
1963     remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
1964     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
1965     localPointsBA[numLeavesBA] = i;
1966     numLeavesBA++;
1967   }
1968   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
1969   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
1970   PetscFunctionReturn(0);
1971 }
1972 
1973 /*
1974   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
1975 
1976   Input Parameters:
1977 . sf - The global PetscSF
1978 
1979   Output Parameters:
1980 . out - The local PetscSF
1981  */
1982 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
1983 {
1984   MPI_Comm           comm;
1985   PetscMPIInt        myrank;
1986   const PetscInt     *ilocal;
1987   const PetscSFNode  *iremote;
1988   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
1989   PetscSFNode        *liremote;
1990   PetscSF            lsf;
1991   PetscErrorCode     ierr;
1992 
1993   PetscFunctionBegin;
1994   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1995   if (sf->ops->CreateLocalSF) {
1996     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
1997   } else {
1998     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
1999     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
2000     ierr = MPI_Comm_rank(comm,&myrank);CHKERRQ(ierr);
2001 
2002     /* Find out local edges and build a local SF */
2003     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
2004     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
2005     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
2006     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
2007 
2008     for (i=j=0; i<nleaves; i++) {
2009       if (iremote[i].rank == (PetscInt)myrank) {
2010         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2011         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
2012         liremote[j].index = iremote[i].index;
2013         j++;
2014       }
2015     }
2016     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
2017     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
2018     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
2019     *out = lsf;
2020   }
2021   PetscFunctionReturn(0);
2022 }
2023 
2024 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2025 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
2026 {
2027   PetscErrorCode     ierr;
2028   PetscMemType       rootmtype,leafmtype;
2029 
2030   PetscFunctionBegin;
2031   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2032   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
2033   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2034   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
2035   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
2036   if (sf->ops->BcastToZero) {
2037     ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
2038   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
2039   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2040   PetscFunctionReturn(0);
2041 }
2042 
2043