xref: /petsc/src/vec/is/sf/interface/sf.c (revision 2a67d2dafadcab4517c1d00b73dab83ea36691b2)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
/* PetscSFCheckGraphSet(sf,arg): debug-only guard used at the top of routines that
   require a graph.  Raises PETSC_ERR_ARG_WRONGSTATE if neither PetscSFSetGraph()
   nor PetscSFSetGraphWithPattern() has been called on argument number 'arg';
   compiles to a no-op in optimized builds. */
#if defined(PETSC_USE_DEBUG)
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif
13 
14 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",0};
15 
16 /*@
17    PetscSFCreate - create a star forest communication context
18 
19    Collective
20 
21    Input Arguments:
22 .  comm - communicator on which the star forest will operate
23 
24    Output Arguments:
25 .  sf - new star forest context
26 
27    Options Database Keys:
28 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
29 .  -sf_type window    -Use MPI-3 one-sided window for communication
30 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
31 
32    Level: intermediate
33 
34    Notes:
   When one knows the communication graph is one of the predefined graphs, such as MPI_Alltoall, MPI_Allgatherv,
36    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
37    SFs are optimized and they have better performance than general SFs.
38 
39 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
40 @*/
41 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
42 {
43   PetscErrorCode ierr;
44   PetscSF        b;
45 
46   PetscFunctionBegin;
47   PetscValidPointer(sf,2);
48   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
49 
50   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
51 
52   b->nroots    = -1;
53   b->nleaves   = -1;
54   b->minleaf   = PETSC_MAX_INT;
55   b->maxleaf   = PETSC_MIN_INT;
56   b->nranks    = -1;
57   b->rankorder = PETSC_TRUE;
58   b->ingroup   = MPI_GROUP_NULL;
59   b->outgroup  = MPI_GROUP_NULL;
60   b->graphset  = PETSC_FALSE;
61 
62   *sf = b;
63   PetscFunctionReturn(0);
64 }
65 
66 /*@
67    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
68 
69    Collective
70 
71    Input Arguments:
72 .  sf - star forest
73 
74    Level: advanced
75 
76 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
77 @*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the implementation release its own communication state first */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Return the graph description to its just-created state (matches PetscSFCreate()) */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  /* mine/remote may alias mine_alloc/remote_alloc (owned) or user memory
     (PETSC_USE_POINTER in PetscSFSetGraph()); only the _alloc copies are freed */
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  /* Device copy of rmine, if one was created for GPU communication */
  if (sf->rmine_d) {cudaError_t err = cudaFree(sf->rmine_d);CHKERRCUDA(err);sf->rmine_d=NULL;}
#endif
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRQ(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRQ(ierr);}
  /* Drop references to derived objects so they can be rebuilt for a new graph */
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
108 
109 /*@C
110    PetscSFSetType - Set the PetscSF communication implementation
111 
112    Collective on PetscSF
113 
114    Input Parameters:
115 +  sf - the PetscSF context
116 -  type - a known method
117 
118    Options Database Key:
119 .  -sf_type <type> - Sets the method; use -help for a list
120    of available methods (for instance, window, basic, neighbor)
121 
122    Notes:
123    See "include/petscsf.h" for available methods (for instance)
124 +    PETSCSFWINDOW - MPI-2/3 one-sided
125 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
126 
127   Level: intermediate
128 
129 .seealso: PetscSFType, PetscSFCreate()
130 @*/
131 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
132 {
133   PetscErrorCode ierr,(*r)(PetscSF);
134   PetscBool      match;
135 
136   PetscFunctionBegin;
137   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
138   PetscValidCharPointer(type,2);
139 
140   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
141   if (match) PetscFunctionReturn(0);
142 
143   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
144   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
145   /* Destroy the previous PetscSF implementation context */
146   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
147   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
148   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
149   ierr = (*r)(sf);CHKERRQ(ierr);
150   PetscFunctionReturn(0);
151 }
152 
153 /*@C
154   PetscSFGetType - Get the PetscSF communication implementation
155 
156   Not Collective
157 
158   Input Parameter:
159 . sf  - the PetscSF context
160 
161   Output Parameter:
162 . type - the PetscSF type name
163 
164   Level: intermediate
165 
166 .seealso: PetscSFSetType(), PetscSFCreate()
167 @*/
168 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
169 {
170   PetscFunctionBegin;
171   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
172   PetscValidPointer(type,2);
173   *type = ((PetscObject)sf)->type_name;
174   PetscFunctionReturn(0);
175 }
176 
177 /*@
178    PetscSFDestroy - destroy star forest
179 
180    Collective
181 
182    Input Arguments:
183 .  sf - address of star forest
184 
185    Level: intermediate
186 
187 .seealso: PetscSFCreate(), PetscSFReset()
188 @*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0); /* destroying a NULL handle is a no-op */
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Only the holder of the last reference actually tears the object down */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  /* Give the implementation a chance to free its private data */
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr); /* frees the base PetscObject header */
  PetscFunctionReturn(0);
}
202 
203 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
204 {
205 #if defined(PETSC_USE_DEBUG)
206   PetscInt           i, nleaves;
207   PetscMPIInt        size;
208   const PetscInt    *ilocal;
209   const PetscSFNode *iremote;
210   PetscErrorCode     ierr;
211 
212   PetscFunctionBegin;
213   if (!sf->graphset) PetscFunctionReturn(0);
214   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
215   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
216   for (i = 0; i < nleaves; i++) {
217     const PetscInt rank = iremote[i].rank;
218     const PetscInt remote = iremote[i].index;
219     const PetscInt leaf = ilocal ? ilocal[i] : i;
220     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
221     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
222     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
223   }
224   PetscFunctionReturn(0);
225 #else
226   PetscFunctionBegin;
227   PetscFunctionReturn(0);
228 #endif
229 }
230 
231 /*@
232    PetscSFSetUp - set up communication structures
233 
234    Collective
235 
236    Input Arguments:
237 .  sf - star forest communication object
238 
239    Level: beginner
240 
241 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
242 @*/
PetscErrorCode PetscSFSetUp(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1); /* a graph must be attached before setup */
  if (sf->setupcalled) PetscFunctionReturn(0); /* setup is performed at most once per graph */
  /* In debug builds, validate the ranks/indices of the graph before building communication state */
  ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
  /* Fall back to the default implementation if the caller never chose one */
  if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);}
  ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
  ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  sf->setupcalled = PETSC_TRUE;
  PetscFunctionReturn(0);
}
259 
260 /*@
261    PetscSFSetFromOptions - set PetscSF options using the options database
262 
263    Logically Collective
264 
265    Input Arguments:
266 .  sf - star forest
267 
268    Options Database Keys:
269 +  -sf_type              - implementation type, see PetscSFSetType()
270 .  -sf_rank_order        - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
271 -  -sf_use_pinned_buffer - use pinned (nonpagable) memory for send/recv buffers on host when communicating GPU data but GPU-aware MPI is not used.
272                            Only available for SF types of basic and neighbor.
273 
274    Level: intermediate
275 
276 .seealso: PetscSFWindowSetSyncType()
277 @*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;      /* default shown for -sf_type: the current type, else PETSCSFBASIC */
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  /* Always call SetType so an SF without a type yet gets the default even when the option is absent */
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
  /* NOTE: PetscOptionsObject is declared by the PetscObjectOptionsBegin() macro above */
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
296 
297 /*@
298    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
299 
300    Logically Collective
301 
302    Input Arguments:
303 +  sf - star forest
304 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
305 
306    Level: advanced
307 
308 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
309 @*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
{

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveBool(sf,flg,2);
  /* The multi-SF is built using the ordering in effect at the time; once it exists the flag can no longer take effect */
  if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(0);
}
320 
321 /*@
322    PetscSFSetGraph - Set a parallel star forest
323 
324    Collective
325 
326    Input Arguments:
327 +  sf - star forest
328 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
329 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
330 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
331 during setup in debug mode)
332 .  localmode - copy mode for ilocal
333 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
334 during setup in debug mode)
335 -  remotemode - copy mode for iremote
336 
337    Level: intermediate
338 
339    Notes:
340     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
341 
342    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
343    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
344    needed
345 
346    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
347    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
348 
349 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
350 @*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    /* One pass over ilocal: record the leaf range and detect the identity permutation */
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* Identity ilocal encodes contiguous storage; discard it (freeing it if ownership was transferred) */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* NULL ilocal (or no leaves) means leaves are contiguous in [0,nleaves) */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  if (ilocal) {
    /* mine_alloc is the memory PetscSFReset() will free; it stays NULL for borrowed arrays */
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same ownership scheme for the remote endpoints */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
434 
435 /*@
436   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
437 
438   Collective
439 
440   Input Parameters:
441 + sf      - The PetscSF
442 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
443 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
444 
445   Notes:
446   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
447   n and N are local and global sizes of x respectively.
448 
449   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
450   sequential vectors y on all ranks.
451 
452   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
453   sequential vector y on rank 0.
454 
455   In above cases, entries of x are roots and entries of y are leaves.
456 
457   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
458   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
459   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one cannot send multiple items. One just needs to create a new MPI datatype for the multiple data
461   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
462 
463   In this case, roots and leaves are symmetric.
464 
465   Level: intermediate
466  @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    type = PETSCSFALLTOALL;
    /* The user's map is ignored for ALLTOALL; build a layout with 'size' roots per rank */
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    /* (max of n and max of -n agree in magnitude exactly when every rank has the same n) */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves (it receives the whole vector) */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
532 
533 /*@
534    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
535 
536    Collective
537 
538    Input Arguments:
539 
540 .  sf - star forest to invert
541 
542    Output Arguments:
.  isf - inverse of sf

   Level: advanced
545 
546    Notes:
547    All roots must have degree 1.
548 
549    The local space may be a permutation, but cannot be sparse.
550 
551 .seealso: PetscSFSetGraph()
552 @*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Each leaf slot records its own (rank,location) ... */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* ... while roots start as (-1,-1) so untouched roots are recognizable afterwards */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  /* Reduce with REPLACE: every root learns where its (single, since degree is 1) leaf lives */
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* every root was hit: inverse leaves are contiguous */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* The inverse swaps the roles: old leaf space becomes the new root space */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
602 
603 /*@
604    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
605 
606    Collective
607 
608    Input Arguments:
609 +  sf - communication object to duplicate
610 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
611 
612    Output Arguments:
613 .  newsf - new communication object
614 
615    Level: beginner
616 
617 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
618 @*/
619 PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
620 {
621   PetscSFType    type;
622   PetscErrorCode ierr;
623 
624   PetscFunctionBegin;
625   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
626   PetscValidLogicalCollectiveEnum(sf,opt,2);
627   PetscValidPointer(newsf,3);
628   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
629   ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
630   if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
631   if (opt == PETSCSF_DUPLICATE_GRAPH) {
632     PetscSFCheckGraphSet(sf,1);
633     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
634       PetscInt          nroots,nleaves;
635       const PetscInt    *ilocal;
636       const PetscSFNode *iremote;
637       ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
638       ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
639     } else {
640       ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
641     }
642   }
643   if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
644   PetscFunctionReturn(0);
645 }
646 
647 /*@C
648    PetscSFGetGraph - Get the graph specifying a parallel star forest
649 
650    Not Collective
651 
652    Input Arguments:
653 .  sf - star forest
654 
655    Output Arguments:
656 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
657 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
658 .  ilocal - locations of leaves in leafdata buffers
659 -  iremote - remote locations of root vertices for each leaf on the current process
660 
661    Notes:
662    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
663 
664    When called from Fortran, the returned iremote array is a copy and must deallocated after use. Consequently, if you
665    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
666 
667    Level: intermediate
668 
669 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
670 @*/
671 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
672 {
673   PetscErrorCode ierr;
674 
675   PetscFunctionBegin;
676   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
677   if (sf->ops->GetGraph) {
678     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
679   } else {
680     if (nroots) *nroots = sf->nroots;
681     if (nleaves) *nleaves = sf->nleaves;
682     if (ilocal) *ilocal = sf->mine;
683     if (iremote) *iremote = sf->remote;
684   }
685   PetscFunctionReturn(0);
686 }
687 
688 /*@
689    PetscSFGetLeafRange - Get the active leaf ranges
690 
691    Not Collective
692 
693    Input Arguments:
694 .  sf - star forest
695 
696    Output Arguments:
697 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
698 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
699 
700    Level: developer
701 
702 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
703 @*/
PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
{

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  /* The range is maintained by PetscSFSetGraph()/PetscSFSetGraphWithPattern(); either output may be NULL */
  if (minleaf) *minleaf = sf->minleaf;
  if (maxleaf) *maxleaf = sf->maxleaf;
  PetscFunctionReturn(0);
}
714 
715 /*@C
716    PetscSFViewFromOptions - View from Options
717 
718    Collective on PetscSF
719 
720    Input Parameters:
721 +  A - the star forest
722 .  obj - Optional object
723 -  name - command line option
724 
725    Level: intermediate
726 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
727 @*/
PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
  /* Thin wrapper: delegate to the generic PetscObject options-driven viewing */
  ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
737 
738 /*@C
739    PetscSFView - view a star forest
740 
741    Collective
742 
743    Input Arguments:
744 +  sf - star forest
745 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
746 
747    Level: beginner
748 
749 .seealso: PetscSFCreate(), PetscSFSetGraph()
750 @*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to stdout on the SF's communicator when no viewer is given */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Make sure rank/offset tables exist before printing them (only possible once a graph is set) */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      /* Synchronized printing: each rank's output appears in rank order */
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        /* NULL sf->mine means contiguous leaves, so leaf i lives at location i */
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Sort a copy of the rank list so output is in rank order; perm maps back to the original slots */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
808 
809 /*@C
810    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
811 
812    Not Collective
813 
814    Input Arguments:
815 .  sf - star forest
816 
817    Output Arguments:
818 +  nranks - number of ranks referenced by local part
819 .  ranks - array of ranks
820 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
821 .  rmine - concatenated array holding local indices referencing each remote rank
822 -  rremote - concatenated array holding remote indices referenced for each remote rank
823 
824    Level: developer
825 
826 .seealso: PetscSFGetLeafRanks()
827 @*/
828 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
829 {
830   PetscErrorCode ierr;
831 
832   PetscFunctionBegin;
833   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
834   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
835   if (sf->ops->GetRootRanks) {
836     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
837   } else {
838     /* The generic implementation */
839     if (nranks)  *nranks  = sf->nranks;
840     if (ranks)   *ranks   = sf->ranks;
841     if (roffset) *roffset = sf->roffset;
842     if (rmine)   *rmine   = sf->rmine;
843     if (rremote) *rremote = sf->rremote;
844   }
845   PetscFunctionReturn(0);
846 }
847 
848 /*@C
849    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
850 
851    Not Collective
852 
853    Input Arguments:
854 .  sf - star forest
855 
856    Output Arguments:
857 +  niranks - number of leaf ranks referencing roots on this process
858 .  iranks - array of ranks
859 .  ioffset - offset in irootloc for each rank (length niranks+1)
860 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
861 
862    Level: developer
863 
864 .seealso: PetscSFGetRootRanks()
865 @*/
866 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
867 {
868   PetscErrorCode ierr;
869 
870   PetscFunctionBegin;
871   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
872   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
873   if (sf->ops->GetLeafRanks) {
874     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
875   } else {
876     PetscSFType type;
877     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
878     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
879   }
880   PetscFunctionReturn(0);
881 }
882 
883 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
884   PetscInt i;
885   for (i=0; i<n; i++) {
886     if (needle == list[i]) return PETSC_TRUE;
887   }
888   return PETSC_FALSE;
889 }
890 
/*@C
   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.

   Collective

   Input Arguments:
+  sf - PetscSF to set up; PetscSFSetGraph() must have been called
-  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)

   Notes:
   Fills sf->nranks/ranks/roffset/rmine/rremote and sf->ndranks: ranks[] is partitioned so the
   distinguished ranks (members of dgroup) come first, each partition sorted by rank, and the
   leaf data is bucketed per rank via the roffset[] prefix sums.

   Level: developer

.seealso: PetscSFGetRootRanks()
@*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
  /* Count how many leaves reference each remote rank, using a hash table keyed by rank */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  /* Flatten the table into parallel arrays: referenced ranks[] and per-rank leaf counts rcount[] */
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate the members of dgroup into rank numbers of sf's communicator */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRQ(ierr);}
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i; ) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for ( ; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition independently so both halves of ranks[] are ordered by rank id */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Prefix-sum the counts into roffset[]; zero rcount[] so it can be reused as a write cursor */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Bucket each leaf by owning rank: rmine[] holds local leaf indices, rremote[] remote root indices */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        /* Not a distinguished rank; search the non-distinguished partition */
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
994 
995 /*@C
996    PetscSFGetGroups - gets incoming and outgoing process groups
997 
998    Collective
999 
1000    Input Argument:
1001 .  sf - star forest
1002 
1003    Output Arguments:
1004 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1005 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1006 
1007    Level: developer
1008 
1009 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
1010 @*/
1011 PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
1012 {
1013   PetscErrorCode ierr;
1014   MPI_Group      group = MPI_GROUP_NULL;
1015 
1016   PetscFunctionBegin;
1017   if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
1018   if (sf->ingroup == MPI_GROUP_NULL) {
1019     PetscInt       i;
1020     const PetscInt *indegree;
1021     PetscMPIInt    rank,*outranks,*inranks;
1022     PetscSFNode    *remote;
1023     PetscSF        bgcount;
1024 
1025     /* Compute the number of incoming ranks */
1026     ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
1027     for (i=0; i<sf->nranks; i++) {
1028       remote[i].rank  = sf->ranks[i];
1029       remote[i].index = 0;
1030     }
1031     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
1032     ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1033     ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
1034     ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
1035 
1036     /* Enumerate the incoming ranks */
1037     ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
1038     ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
1039     for (i=0; i<sf->nranks; i++) outranks[i] = rank;
1040     ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
1041     ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
1042     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
1043     ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRQ(ierr);
1044     ierr = MPI_Group_free(&group);CHKERRQ(ierr);
1045     ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
1046     ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
1047   }
1048   *incoming = sf->ingroup;
1049 
1050   if (sf->outgroup == MPI_GROUP_NULL) {
1051     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
1052     ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRQ(ierr);
1053     ierr = MPI_Group_free(&group);CHKERRQ(ierr);
1054   }
1055   *outgoing = sf->outgroup;
1056   PetscFunctionReturn(0);
1057 }
1058 
/*@
   PetscSFGetMultiSF - gets the inner SF implementing gathers and scatters

   Collective

   Input Argument:
.  sf - star forest that may contain roots with 0 or with more than 1 vertex

   Output Arguments:
.  multi - star forest with split roots, such that each root has degree exactly 1

   Level: developer

   Notes:

   In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
   directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
   edge, it is a candidate for future optimization that might involve its removal.

   The multi-SF is built lazily, cached on sf, and destroyed together with sf; the caller must
   not destroy the returned SF.

.seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
@*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    PetscFunctionReturn(0);
  }
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    /* inoffset[] = prefix sum of root degrees: root i's multi-roots occupy [inoffset[i],inoffset[i+1]) */
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    /* Each leaf atomically fetches its root's running offset and adds 1, so every leaf of a
       shared root ends up with a distinct multi-root index in outoffset[] */
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
#if 0
#if defined(PETSC_USE_DEBUG)                                 /* Check that the expected number of increments occurred */
    for (i=0; i<sf->nroots; i++) {
      if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
    }
#endif
#endif
    /* Rewire each leaf to its private multi-root (same rank, new index) */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      /* Learn the owning rank of each multi-root by reducing every leaf's rank onto it */
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Broadcast each leaf its new (rank-sorted) multi-root index and rebuild the graph */
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1152 
1153 /*@C
1154    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1155 
1156    Collective
1157 
1158    Input Arguments:
1159 +  sf - original star forest
1160 .  nselected  - number of selected roots on this process
1161 -  selected   - indices of the selected roots on this process
1162 
1163    Output Arguments:
1164 .  newsf - new star forest
1165 
1166    Level: advanced
1167 
1168    Note:
1169    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1170    be done by calling PetscSFGetGraph().
1171 
1172 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1173 @*/
1174 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1175 {
1176   PetscInt          i,n,*roots,*rootdata,*leafdata,nroots,nleaves,connected_leaves,*new_ilocal;
1177   const PetscSFNode *iremote;
1178   PetscSFNode       *new_iremote;
1179   PetscSF           tmpsf;
1180   MPI_Comm          comm;
1181   PetscErrorCode    ierr;
1182 
1183   PetscFunctionBegin;
1184   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1185   PetscSFCheckGraphSet(sf,1);
1186   if (nselected) PetscValidPointer(selected,3);
1187   PetscValidPointer(newsf,4);
1188 
1189   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1190   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1191   /* Uniq selected[] and put results in roots[] */
1192   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1193   ierr = PetscMalloc1(nselected,&roots);CHKERRQ(ierr);
1194   ierr = PetscArraycpy(roots,selected,nselected);CHKERRQ(ierr);
1195   ierr = PetscSortedRemoveDupsInt(&nselected,roots);CHKERRQ(ierr);
1196   if (nselected && (roots[0] < 0 || roots[nselected-1] >= sf->nroots)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max root indices %D/%D are not in [0,%D)",roots[0],roots[nselected-1],sf->nroots);
1197 
1198   if (sf->ops->CreateEmbeddedSF) {
1199     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,roots,newsf);CHKERRQ(ierr);
1200   } else {
1201     /* A generic version of creating embedded sf. Note that we called PetscSFSetGraph() twice, which is certainly expensive */
1202     /* Find out which leaves (not leaf data items) are still connected to roots in the embedded sf */
1203     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,NULL,&iremote);CHKERRQ(ierr);
1204     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&tmpsf);CHKERRQ(ierr);
1205     ierr = PetscSFSetGraph(tmpsf,nroots,nleaves,NULL/*contiguous*/,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);CHKERRQ(ierr);
1206     ierr = PetscCalloc2(nroots,&rootdata,nleaves,&leafdata);CHKERRQ(ierr);
1207     for (i=0; i<nselected; ++i) rootdata[roots[i]] = 1;
1208     ierr = PetscSFBcastBegin(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1209     ierr = PetscSFBcastEnd(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1210     ierr = PetscSFDestroy(&tmpsf);CHKERRQ(ierr);
1211 
1212     /* Build newsf with leaves that are still connected */
1213     connected_leaves = 0;
1214     for (i=0; i<nleaves; ++i) connected_leaves += leafdata[i];
1215     ierr = PetscMalloc1(connected_leaves,&new_ilocal);CHKERRQ(ierr);
1216     ierr = PetscMalloc1(connected_leaves,&new_iremote);CHKERRQ(ierr);
1217     for (i=0, n=0; i<nleaves; ++i) {
1218       if (leafdata[i]) {
1219         new_ilocal[n]        = sf->mine ? sf->mine[i] : i;
1220         new_iremote[n].rank  = sf->remote[i].rank;
1221         new_iremote[n].index = sf->remote[i].index;
1222         ++n;
1223       }
1224     }
1225 
1226     if (n != connected_leaves) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"There is a size mismatch in the SF embedding, %d != %d",n,connected_leaves);
1227     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1228     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1229     ierr = PetscSFSetGraph(*newsf,nroots,connected_leaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1230     ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
1231   }
1232   ierr = PetscFree(roots);CHKERRQ(ierr);
1233   ierr = PetscSFSetUp(*newsf);CHKERRQ(ierr);
1234   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1235   PetscFunctionReturn(0);
1236 }
1237 
1238 /*@C
1239   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1240 
1241   Collective
1242 
1243   Input Arguments:
1244 + sf - original star forest
1245 . nselected  - number of selected leaves on this process
1246 - selected   - indices of the selected leaves on this process
1247 
1248   Output Arguments:
1249 .  newsf - new star forest
1250 
1251   Level: advanced
1252 
1253 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1254 @*/
1255 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1256 {
1257   const PetscSFNode *iremote;
1258   PetscSFNode       *new_iremote;
1259   const PetscInt    *ilocal;
1260   PetscInt          i,nroots,*leaves,*new_ilocal;
1261   MPI_Comm          comm;
1262   PetscErrorCode    ierr;
1263 
1264   PetscFunctionBegin;
1265   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1266   PetscSFCheckGraphSet(sf,1);
1267   if (nselected) PetscValidPointer(selected,3);
1268   PetscValidPointer(newsf,4);
1269 
1270   /* Uniq selected[] and put results in leaves[] */
1271   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1272   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1273   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1274   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1275   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1276 
1277   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1278   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1279     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1280   } else {
1281     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1282     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1283     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1284     for (i=0; i<nselected; ++i) {
1285       const PetscInt l     = leaves[i];
1286       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1287       new_iremote[i].rank  = iremote[l].rank;
1288       new_iremote[i].index = iremote[l].index;
1289     }
1290     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1291     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1292     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1293   }
1294   ierr = PetscFree(leaves);CHKERRQ(ierr);
1295   PetscFunctionReturn(0);
1296 }
1297 
1298 /*@C
1299    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1300 
1301    Collective on PetscSF
1302 
1303    Input Arguments:
1304 +  sf - star forest on which to communicate
1305 .  unit - data type associated with each node
1306 .  rootdata - buffer to broadcast
1307 -  op - operation to use for reduction
1308 
1309    Output Arguments:
1310 .  leafdata - buffer to be reduced with values from each leaf's respective root
1311 
1312    Level: intermediate
1313 
1314 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd()
1315 @*/
1316 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1317 {
1318   PetscErrorCode ierr;
1319   PetscMemType   rootmtype,leafmtype;
1320 
1321   PetscFunctionBegin;
1322   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1323   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1324   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1325   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1326   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1327 #if defined(PETSC_HAVE_CUDA)
1328   /*  Shall we assume rootdata, leafdata are ready to use (instead of being computed by some asynchronous kernels)?
1329     To be similar to MPI, I'd like to have this assumption, since MPI does not have a concept of stream.
1330     But currently this assumption is not enforecd in Petsc. To be safe, I do synchronization here. Otherwise, if
1331     we do not sync now and call the Pack kernel directly on the default NULL stream (assume petsc objects are also
1332     computed on it), we have to sync the NULL stream before calling MPI routines. So, it looks a cudaDeviceSynchronize
1333     is inevitable. We do it now and put pack/unpack kernels to non-NULL streams.
1334    */
1335   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1336 #endif
1337   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1338   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1339   PetscFunctionReturn(0);
1340 }
1341 
1342 /*@C
1343    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1344 
1345    Collective
1346 
1347    Input Arguments:
1348 +  sf - star forest
1349 .  unit - data type
1350 .  rootdata - buffer to broadcast
1351 -  op - operation to use for reduction
1352 
1353    Output Arguments:
1354 .  leafdata - buffer to be reduced with values from each leaf's respective root
1355 
1356    Level: intermediate
1357 
1358 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1359 @*/
1360 PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1361 {
1362   PetscErrorCode ierr;
1363   PetscMemType   rootmtype,leafmtype;
1364 
1365   PetscFunctionBegin;
1366   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1367   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1368   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1369   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1370   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1371   ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1372   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1373   PetscFunctionReturn(0);
1374 }
1375 
1376 /*@C
1377    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1378 
1379    Collective
1380 
1381    Input Arguments:
1382 +  sf - star forest
1383 .  unit - data type
1384 .  leafdata - values to reduce
1385 -  op - reduction operation
1386 
1387    Output Arguments:
1388 .  rootdata - result of reduction of values from all leaves of each root
1389 
1390    Level: intermediate
1391 
1392 .seealso: PetscSFBcastBegin()
1393 @*/
1394 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1395 {
1396   PetscErrorCode ierr;
1397   PetscMemType   rootmtype,leafmtype;
1398 
1399   PetscFunctionBegin;
1400   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1401   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1402   ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1403   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1404   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1405 #if defined(PETSC_HAVE_CUDA)
1406   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1407 #endif
1408   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1409   ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1410   PetscFunctionReturn(0);
1411 }
1412 
1413 /*@C
1414    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1415 
1416    Collective
1417 
1418    Input Arguments:
1419 +  sf - star forest
1420 .  unit - data type
1421 .  leafdata - values to reduce
1422 -  op - reduction operation
1423 
1424    Output Arguments:
1425 .  rootdata - result of reduction of values from all leaves of each root
1426 
1427    Level: intermediate
1428 
1429 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1430 @*/
1431 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1432 {
1433   PetscErrorCode ierr;
1434   PetscMemType   rootmtype,leafmtype;
1435 
1436   PetscFunctionBegin;
1437   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1438   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1439   ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1440   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1441   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1442   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1443   ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1444   PetscFunctionReturn(0);
1445 }
1446 
1447 /*@C
1448    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1449 
1450    Collective
1451 
1452    Input Arguments:
1453 +  sf - star forest
1454 .  unit - data type
1455 .  leafdata - leaf values to use in reduction
1456 -  op - operation to use for reduction
1457 
1458    Output Arguments:
1459 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1460 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1461 
1462    Level: advanced
1463 
1464    Note:
1465    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1466    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1467    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1468    integers.
1469 
1470 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1471 @*/
1472 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1473 {
1474   PetscErrorCode ierr;
1475   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1476 
1477   PetscFunctionBegin;
1478   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1479   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1480   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1481   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1482   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1483   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1484   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1485 #if defined(PETSC_HAVE_CUDA)
1486   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1487 #endif
1488   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1489   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1490   PetscFunctionReturn(0);
1491 }
1492 
1493 /*@C
1494    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1495 
1496    Collective
1497 
1498    Input Arguments:
1499 +  sf - star forest
1500 .  unit - data type
1501 .  leafdata - leaf values to use in reduction
1502 -  op - operation to use for reduction
1503 
1504    Output Arguments:
1505 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1506 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1507 
1508    Level: advanced
1509 
1510 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1511 @*/
1512 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1513 {
1514   PetscErrorCode ierr;
1515   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1516 
1517   PetscFunctionBegin;
1518   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1519   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1520   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1521   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1522   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1523   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1524   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1525   ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1526   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1527   PetscFunctionReturn(0);
1528 }
1529 
1530 /*@C
1531    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()
1532 
1533    Collective
1534 
1535    Input Arguments:
1536 .  sf - star forest
1537 
1538    Output Arguments:
1539 .  degree - degree of each root vertex
1540 
1541    Level: advanced
1542 
1543    Notes:
1544    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1545 
1546 .seealso: PetscSFGatherBegin()
1547 @*/
1548 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
1549 {
1550   PetscErrorCode ierr;
1551 
1552   PetscFunctionBegin;
1553   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1554   PetscSFCheckGraphSet(sf,1);
1555   PetscValidPointer(degree,2);
1556   if (!sf->degreeknown) {
1557     PetscInt i, nroots = sf->nroots, maxlocal = sf->maxleaf+1;  /* TODO: We should use PetscSFGetLeafRange() */
1558     if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1559     ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
1560     ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1561     for (i=0; i<nroots; i++) sf->degree[i] = 0;
1562     for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
1563     ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
1564   }
1565   *degree = NULL;
1566   PetscFunctionReturn(0);
1567 }
1568 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: developer

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFComputeDegreeBegin()
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    /* Finish the MPI_SUM reduction of per-leaf ones started in PetscSFComputeDegreeBegin() */
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr); /* scratch leaf buffer no longer needed */
    sf->degreeknown = PETSC_TRUE; /* cache the result; subsequent Begin/End pairs are no-ops */
  }
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1604 
1605 
1606 /*@C
1607    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1608    Each multi-root is assigned index of the corresponding original root.
1609 
1610    Collective
1611 
1612    Input Arguments:
1613 +  sf - star forest
1614 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1615 
1616    Output Arguments:
1617 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1618 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1619 
1620    Level: developer
1621 
1622    Notes:
1623    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1624 
1625 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1626 @*/
1627 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1628 {
1629   PetscSF             msf;
1630   PetscInt            i, j, k, nroots, nmroots;
1631   PetscErrorCode      ierr;
1632 
1633   PetscFunctionBegin;
1634   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1635   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1636   if (nroots) PetscValidIntPointer(degree,2);
1637   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1638   PetscValidPointer(multiRootsOrigNumbering,4);
1639   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1640   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1641   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1642   for (i=0,j=0,k=0; i<nroots; i++) {
1643     if (!degree[i]) continue;
1644     for (j=0; j<degree[i]; j++,k++) {
1645       (*multiRootsOrigNumbering)[k] = i;
1646     }
1647   }
1648 #if defined(PETSC_USE_DEBUG)
1649   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1650 #endif
1651   if (nMultiRoots) *nMultiRoots = nmroots;
1652   PetscFunctionReturn(0);
1653 }
1654 
1655 /*@C
1656    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1657 
1658    Collective
1659 
1660    Input Arguments:
1661 +  sf - star forest
1662 .  unit - data type
1663 -  leafdata - leaf data to gather to roots
1664 
1665    Output Argument:
1666 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1667 
1668    Level: intermediate
1669 
1670 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1671 @*/
1672 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1673 {
1674   PetscErrorCode ierr;
1675   PetscSF        multi;
1676 
1677   PetscFunctionBegin;
1678   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1679   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1680   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1681   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1682   PetscFunctionReturn(0);
1683 }
1684 
1685 /*@C
1686    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1687 
1688    Collective
1689 
1690    Input Arguments:
1691 +  sf - star forest
1692 .  unit - data type
1693 -  leafdata - leaf data to gather to roots
1694 
1695    Output Argument:
1696 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1697 
1698    Level: intermediate
1699 
1700 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1701 @*/
1702 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1703 {
1704   PetscErrorCode ierr;
1705   PetscSF        multi;
1706 
1707   PetscFunctionBegin;
1708   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1709   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1710   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1711   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1712   PetscFunctionReturn(0);
1713 }
1714 
1715 /*@C
1716    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1717 
1718    Collective
1719 
1720    Input Arguments:
1721 +  sf - star forest
1722 .  unit - data type
1723 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1724 
1725    Output Argument:
1726 .  leafdata - leaf data to be update with personal data from each respective root
1727 
1728    Level: intermediate
1729 
1730 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1731 @*/
1732 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1733 {
1734   PetscErrorCode ierr;
1735   PetscSF        multi;
1736 
1737   PetscFunctionBegin;
1738   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1739   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1740   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1741   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1742   PetscFunctionReturn(0);
1743 }
1744 
1745 /*@C
1746    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1747 
1748    Collective
1749 
1750    Input Arguments:
1751 +  sf - star forest
1752 .  unit - data type
1753 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1754 
1755    Output Argument:
1756 .  leafdata - leaf data to be update with personal data from each respective root
1757 
1758    Level: intermediate
1759 
1760 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1761 @*/
1762 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1763 {
1764   PetscErrorCode ierr;
1765   PetscSF        multi;
1766 
1767   PetscFunctionBegin;
1768   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1769   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1770   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1771   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1772   PetscFunctionReturn(0);
1773 }
1774 
1775 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1776 {
1777 #if defined(PETSC_USE_DEBUG)
1778   PetscInt        i, n, nleaves;
1779   const PetscInt *ilocal = NULL;
1780   PetscHSetI      seen;
1781   PetscErrorCode  ierr;
1782 
1783   PetscFunctionBegin;
1784   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1785   ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1786   for (i = 0; i < nleaves; i++) {
1787     const PetscInt leaf = ilocal ? ilocal[i] : i;
1788     ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1789   }
1790   ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1791   if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1792   ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1793   PetscFunctionReturn(0);
1794 #else
1795   PetscFunctionBegin;
1796   PetscFunctionReturn(0);
1797 #endif
1798 }
1799 
/*@
  PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots.

  sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
  nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
  Bcast on sfA, then a Bcast on sfB, on connected nodes.

.seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  if (localPointsA) {
    /* Re-index sfA's remote roots by sfA leaf number (== sfB root number); sfB roots not
       reached by any sfA leaf keep rank/index -1 so they can be filtered out below */
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue; /* sfA leaf outside sfB's root space: no path through it */
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA;
  }
  /* NOTE(review): when localPointsA is NULL, remotePointsA (length numLeavesA) is used directly as
     root data of sfB (numRootsB roots) — presumably numLeavesA >= numRootsB then; verify with callers */
  /* Broadcast each sfA root location down sfB so every sfB leaf learns its ultimate root in sfA */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr); /* leafdataB-minleaf: shift so leaf l maps to leafdataB[l-minleaf] */
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* denseB: every sfB leaf is connected (rank != -1) and leaves are contiguous, so leafdataB
     can be handed to the new SF as-is with ilocal == NULL */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    localPointsBA  = NULL;
    remotePointsBA = leafdataB; /* ownership of leafdataB passes to the new SF below */
  } else {
    /* Compact: keep only connected leaves, recording their local indices explicitly */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1888 
/*@
  PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF.

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
  second SF must have a degree of 1, i.e., no roots have more than one leaf connected.

  sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
  roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
  a Reduce on sfB, on connected roots.

.seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
@*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
  /* TODO: Check roots of sfB have degree of 1 */

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  /* Build an array over sfB's leaf range holding sfA's remote root for each shared leaf;
     leaves of sfB not covered by sfA stay at rank/index -1 and are filtered out later */
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue; /* sfA leaf outside sfB's leaf range */
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* localPointsA == NULL means sfA's leaves are contiguous 0..numLeavesA-1 */
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  /* Initialize to -1 so sfB roots that receive no contribution are recognizable below */
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Once we implement the TODO above (check all roots of sfB have degree of 1), we can replace the MPI_MAXLOC
     with MPIU_REPLACE. In that case, MPI_MAXLOC and MPIU_REPLACE have the same effect.
   */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,MPI_MAXLOC);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,MPI_MAXLOC);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place: keep only sfB roots that were reached (rank != -1), recording their indices */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1977 
1978 /*
1979   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
1980 
1981   Input Parameters:
1982 . sf - The global PetscSF
1983 
1984   Output Parameters:
1985 . out - The local PetscSF
1986  */
1987 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
1988 {
1989   MPI_Comm           comm;
1990   PetscMPIInt        myrank;
1991   const PetscInt     *ilocal;
1992   const PetscSFNode  *iremote;
1993   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
1994   PetscSFNode        *liremote;
1995   PetscSF            lsf;
1996   PetscErrorCode     ierr;
1997 
1998   PetscFunctionBegin;
1999   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2000   if (sf->ops->CreateLocalSF) {
2001     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
2002   } else {
2003     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2004     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
2005     ierr = MPI_Comm_rank(comm,&myrank);CHKERRQ(ierr);
2006 
2007     /* Find out local edges and build a local SF */
2008     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
2009     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
2010     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
2011     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
2012 
2013     for (i=j=0; i<nleaves; i++) {
2014       if (iremote[i].rank == (PetscInt)myrank) {
2015         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2016         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
2017         liremote[j].index = iremote[i].index;
2018         j++;
2019       }
2020     }
2021     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
2022     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
2023     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
2024     *out = lsf;
2025   }
2026   PetscFunctionReturn(0);
2027 }
2028 
2029 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2030 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
2031 {
2032   PetscErrorCode     ierr;
2033   PetscMemType       rootmtype,leafmtype;
2034 
2035   PetscFunctionBegin;
2036   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2037   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
2038   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2039   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
2040   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
2041   if (sf->ops->BcastToZero) {
2042     ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
2043   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
2044   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2045   PetscFunctionReturn(0);
2046 }
2047 
2048