xref: /petsc/src/vec/is/sf/interface/sf.c (revision ef0bb6c736604ce380bf8bea4ebd4a7bda431d97)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
5 #if defined(PETSC_USE_DEBUG)
6 #  define PetscSFCheckGraphSet(sf,arg) do {                          \
7     if (PetscUnlikely(!(sf)->graphset))                              \
8       SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
9   } while (0)
10 #else
11 #  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
12 #endif
13 
14 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",0};
15 
16 /*@
17    PetscSFCreate - create a star forest communication context
18 
19    Collective
20 
21    Input Arguments:
22 .  comm - communicator on which the star forest will operate
23 
24    Output Arguments:
25 .  sf - new star forest context
26 
27    Options Database Keys:
28 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
29 .  -sf_type window    -Use MPI-3 one-sided window for communication
30 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
31 
32    Level: intermediate
33 
34    Notes:
35    When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
36    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
37    SFs are optimized and they have better performance than general SFs.
38 
39 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
40 @*/
41 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
42 {
43   PetscErrorCode ierr;
44   PetscSF        b;
45 
46   PetscFunctionBegin;
47   PetscValidPointer(sf,2);
48   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
49 
50   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
51 
52   b->nroots    = -1;
53   b->nleaves   = -1;
54   b->minleaf   = PETSC_MAX_INT;
55   b->maxleaf   = PETSC_MIN_INT;
56   b->nranks    = -1;
57   b->rankorder = PETSC_TRUE;
58   b->ingroup   = MPI_GROUP_NULL;
59   b->outgroup  = MPI_GROUP_NULL;
60   b->graphset  = PETSC_FALSE;
61 
62   *sf = b;
63   PetscFunctionReturn(0);
64 }
65 
66 /*@
67    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
68 
69    Collective
70 
71    Input Arguments:
72 .  sf - star forest
73 
74    Level: advanced
75 
76 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
77 @*/
78 PetscErrorCode PetscSFReset(PetscSF sf)
79 {
80   PetscErrorCode ierr;
81 
82   PetscFunctionBegin;
83   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
84   if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
85   sf->nroots   = -1;
86   sf->nleaves  = -1;
87   sf->minleaf  = PETSC_MAX_INT;
88   sf->maxleaf  = PETSC_MIN_INT;
89   sf->mine     = NULL;
90   sf->remote   = NULL;
91   sf->graphset = PETSC_FALSE;
92   ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
93   ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
94   sf->nranks = -1;
95   ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
96 #if defined(PETSC_HAVE_CUDA)
97   if (sf->rmine_d) {cudaError_t err = cudaFree(sf->rmine_d);CHKERRCUDA(err);sf->rmine_d=NULL;}
98 #endif
99   sf->degreeknown = PETSC_FALSE;
100   ierr = PetscFree(sf->degree);CHKERRQ(ierr);
101   if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRQ(ierr);}
102   if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRQ(ierr);}
103   ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
104   ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
105   sf->setupcalled = PETSC_FALSE;
106   PetscFunctionReturn(0);
107 }
108 
109 /*@C
110    PetscSFSetType - Set the PetscSF communication implementation
111 
112    Collective on PetscSF
113 
114    Input Parameters:
115 +  sf - the PetscSF context
116 -  type - a known method
117 
118    Options Database Key:
119 .  -sf_type <type> - Sets the method; use -help for a list
120    of available methods (for instance, window, pt2pt, neighbor)
121 
122    Notes:
123    See "include/petscsf.h" for available methods (for instance)
124 +    PETSCSFWINDOW - MPI-2/3 one-sided
125 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
126 
127   Level: intermediate
128 
129 .seealso: PetscSFType, PetscSFCreate()
130 @*/
131 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
132 {
133   PetscErrorCode ierr,(*r)(PetscSF);
134   PetscBool      match;
135 
136   PetscFunctionBegin;
137   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
138   PetscValidCharPointer(type,2);
139 
140   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
141   if (match) PetscFunctionReturn(0);
142 
143   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
144   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
145   /* Destroy the previous PetscSF implementation context */
146   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
147   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
148   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
149   ierr = (*r)(sf);CHKERRQ(ierr);
150   PetscFunctionReturn(0);
151 }
152 
153 /*@C
154   PetscSFGetType - Get the PetscSF communication implementation
155 
156   Not Collective
157 
158   Input Parameter:
159 . sf  - the PetscSF context
160 
161   Output Parameter:
162 . type - the PetscSF type name
163 
164   Level: intermediate
165 
166 .seealso: PetscSFSetType(), PetscSFCreate()
167 @*/
168 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
169 {
170   PetscFunctionBegin;
171   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
172   PetscValidPointer(type,2);
173   *type = ((PetscObject)sf)->type_name;
174   PetscFunctionReturn(0);
175 }
176 
177 /*@
178    PetscSFDestroy - destroy star forest
179 
180    Collective
181 
182    Input Arguments:
183 .  sf - address of star forest
184 
185    Level: intermediate
186 
187 .seealso: PetscSFCreate(), PetscSFReset()
188 @*/
189 PetscErrorCode PetscSFDestroy(PetscSF *sf)
190 {
191   PetscErrorCode ierr;
192 
193   PetscFunctionBegin;
194   if (!*sf) PetscFunctionReturn(0);
195   PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
196   if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
197   ierr = PetscSFReset(*sf);CHKERRQ(ierr);
198   if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
199   ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
200   PetscFunctionReturn(0);
201 }
202 
203 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
204 {
205 #if defined(PETSC_USE_DEBUG)
206   PetscInt           i, nleaves;
207   PetscMPIInt        size;
208   const PetscInt    *ilocal;
209   const PetscSFNode *iremote;
210   PetscErrorCode     ierr;
211 
212   PetscFunctionBegin;
213   if (!sf->graphset) PetscFunctionReturn(0);
214   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
215   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
216   for (i = 0; i < nleaves; i++) {
217     const PetscInt rank = iremote[i].rank;
218     const PetscInt remote = iremote[i].index;
219     const PetscInt leaf = ilocal ? ilocal[i] : i;
220     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
221     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
222     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
223   }
224   PetscFunctionReturn(0);
225 #else
226   PetscFunctionBegin;
227   PetscFunctionReturn(0);
228 #endif
229 }
230 
231 /*@
232    PetscSFSetUp - set up communication structures
233 
234    Collective
235 
236    Input Arguments:
237 .  sf - star forest communication object
238 
239    Level: beginner
240 
241 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
242 @*/
243 PetscErrorCode PetscSFSetUp(PetscSF sf)
244 {
245   PetscErrorCode ierr;
246 
247   PetscFunctionBegin;
248   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
249   PetscSFCheckGraphSet(sf,1);
250   if (sf->setupcalled) PetscFunctionReturn(0);
251   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
252   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);}
253   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
254   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
255   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
256   sf->setupcalled = PETSC_TRUE;
257   PetscFunctionReturn(0);
258 }
259 
260 /*@
261    PetscSFSetFromOptions - set PetscSF options using the options database
262 
263    Logically Collective
264 
265    Input Arguments:
266 .  sf - star forest
267 
268    Options Database Keys:
269 +  -sf_type              - implementation type, see PetscSFSetType()
270 .  -sf_rank_order        - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
271 -  -sf_use_pinned_buffer - use pinned (nonpagable) memory for send/recv buffers on host when communicating GPU data but GPU-aware MPI is not used.
272                            Only available for SF types of basic and neighbor.
273 
274    Level: intermediate
275 
276 .seealso: PetscSFWindowSetSyncType()
277 @*/
278 PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
279 {
280   PetscSFType    deft;
281   char           type[256];
282   PetscErrorCode ierr;
283   PetscBool      flg;
284 
285   PetscFunctionBegin;
286   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
287   ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
288   deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
289   ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
290   ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
291   ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
292   if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
293   ierr = PetscOptionsEnd();CHKERRQ(ierr);
294   PetscFunctionReturn(0);
295 }
296 
297 /*@
298    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
299 
300    Logically Collective
301 
302    Input Arguments:
303 +  sf - star forest
304 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
305 
306    Level: advanced
307 
308 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
309 @*/
310 PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
311 {
312 
313   PetscFunctionBegin;
314   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
315   PetscValidLogicalCollectiveBool(sf,flg,2);
316   if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
317   sf->rankorder = flg;
318   PetscFunctionReturn(0);
319 }
320 
321 /*@
322    PetscSFSetGraph - Set a parallel star forest
323 
324    Collective
325 
326    Input Arguments:
327 +  sf - star forest
328 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
329 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
330 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
331 during setup in debug mode)
332 .  localmode - copy mode for ilocal
333 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
334 during setup in debug mode)
335 -  remotemode - copy mode for iremote
336 
337    Level: intermediate
338 
339    Notes:
340     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
341 
342    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
343    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
344    needed
345 
346    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
347    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
348 
349 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
350 @*/
351 PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
352 {
353   PetscErrorCode ierr;
354 
355   PetscFunctionBegin;
356   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
357   if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
358   if (nleaves > 0) PetscValidPointer(iremote,6);
359   if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
360   if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);
361 
362   ierr = PetscSFReset(sf);CHKERRQ(ierr);
363   ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
364 
365   sf->nroots  = nroots;
366   sf->nleaves = nleaves;
367 
368   if (nleaves && ilocal) {
369     PetscInt i;
370     PetscInt minleaf = PETSC_MAX_INT;
371     PetscInt maxleaf = PETSC_MIN_INT;
372     int      contiguous = 1;
373     for (i=0; i<nleaves; i++) {
374       minleaf = PetscMin(minleaf,ilocal[i]);
375       maxleaf = PetscMax(maxleaf,ilocal[i]);
376       contiguous &= (ilocal[i] == i);
377     }
378     sf->minleaf = minleaf;
379     sf->maxleaf = maxleaf;
380     if (contiguous) {
381       if (localmode == PETSC_OWN_POINTER) {
382         ierr = PetscFree(ilocal);CHKERRQ(ierr);
383       }
384       ilocal = NULL;
385     }
386   } else {
387     sf->minleaf = 0;
388     sf->maxleaf = nleaves - 1;
389   }
390 
391   if (ilocal) {
392     switch (localmode) {
393     case PETSC_COPY_VALUES:
394       ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
395       ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
396       sf->mine = sf->mine_alloc;
397       break;
398     case PETSC_OWN_POINTER:
399       sf->mine_alloc = (PetscInt*)ilocal;
400       sf->mine       = sf->mine_alloc;
401       break;
402     case PETSC_USE_POINTER:
403       sf->mine_alloc = NULL;
404       sf->mine       = (PetscInt*)ilocal;
405       break;
406     default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
407     }
408   }
409 
410   switch (remotemode) {
411   case PETSC_COPY_VALUES:
412     ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
413     ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
414     sf->remote = sf->remote_alloc;
415     break;
416   case PETSC_OWN_POINTER:
417     sf->remote_alloc = (PetscSFNode*)iremote;
418     sf->remote       = sf->remote_alloc;
419     break;
420   case PETSC_USE_POINTER:
421     sf->remote_alloc = NULL;
422     sf->remote       = (PetscSFNode*)iremote;
423     break;
424   default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
425   }
426 
427   ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
428   sf->graphset = PETSC_TRUE;
429   PetscFunctionReturn(0);
430 }
431 
432 /*@
433   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
434 
435   Collective
436 
437   Input Parameters:
438 + sf      - The PetscSF
439 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
440 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
441 
442   Notes:
443   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
444   n and N are local and global sizes of x respectively.
445 
446   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
447   sequential vectors y on all ranks.
448 
449   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
450   sequential vector y on rank 0.
451 
452   In above cases, entries of x are roots and entries of y are leaves.
453 
454   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
455   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
456   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
457   not mean one can not send multiple items. One just needs to create a new MPI datatype for the mulptiple data
458   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
459 
460   In this case, roots and leaves are symmetric.
461 
462   Level: intermediate
463  @*/
464 PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
465 {
466   MPI_Comm       comm;
467   PetscInt       n,N,res[2];
468   PetscMPIInt    rank,size;
469   PetscSFType    type;
470   PetscErrorCode ierr;
471 
472   PetscFunctionBegin;
473   ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
474   if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
475   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
476   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
477 
478   if (pattern == PETSCSF_PATTERN_ALLTOALL) {
479     type = PETSCSFALLTOALL;
480     ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
481     ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
482     ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
483     ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
484   } else {
485     ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
486     ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
487     res[0] = n;
488     res[1] = -n;
489     /* Check if n are same over all ranks so that we can optimize it */
490     ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
491     if (res[0] == -res[1]) { /* same n */
492       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
493     } else {
494       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
495     }
496     ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
497   }
498   ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);
499 
500   sf->pattern = pattern;
501   sf->mine    = NULL; /* Contiguous */
502 
503   /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
504      Also set other easy stuff.
505    */
506   if (pattern == PETSCSF_PATTERN_ALLGATHER) {
507     sf->nleaves      = N;
508     sf->nroots       = n;
509     sf->nranks       = size;
510     sf->minleaf      = 0;
511     sf->maxleaf      = N - 1;
512   } else if (pattern == PETSCSF_PATTERN_GATHER) {
513     sf->nleaves      = rank ? 0 : N;
514     sf->nroots       = n;
515     sf->nranks       = rank ? 0 : size;
516     sf->minleaf      = 0;
517     sf->maxleaf      = rank ? -1 : N - 1;
518   } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
519     sf->nleaves      = size;
520     sf->nroots       = size;
521     sf->nranks       = size;
522     sf->minleaf      = 0;
523     sf->maxleaf      = size - 1;
524   }
525   sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
526   sf->graphset = PETSC_TRUE;
527   PetscFunctionReturn(0);
528 }
529 
530 /*@
531    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
532 
533    Collective
534 
535    Input Arguments:
536 
537 .  sf - star forest to invert
538 
539    Output Arguments:
540 .  isf - inverse of sf
541    Level: advanced
542 
543    Notes:
544    All roots must have degree 1.
545 
546    The local space may be a permutation, but cannot be sparse.
547 
548 .seealso: PetscSFSetGraph()
549 @*/
550 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
551 {
552   PetscErrorCode ierr;
553   PetscMPIInt    rank;
554   PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
555   const PetscInt *ilocal;
556   PetscSFNode    *roots,*leaves;
557 
558   PetscFunctionBegin;
559   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
560   PetscSFCheckGraphSet(sf,1);
561   PetscValidPointer(isf,2);
562 
563   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
564   maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
565 
566   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
567   ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
568   for (i=0; i<maxlocal; i++) {
569     leaves[i].rank  = rank;
570     leaves[i].index = i;
571   }
572   for (i=0; i <nroots; i++) {
573     roots[i].rank  = -1;
574     roots[i].index = -1;
575   }
576   ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
577   ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
578 
579   /* Check whether our leaves are sparse */
580   for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
581   if (count == nroots) newilocal = NULL;
582   else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
583     ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
584     for (i=0,count=0; i<nroots; i++) {
585       if (roots[i].rank >= 0) {
586         newilocal[count]   = i;
587         roots[count].rank  = roots[i].rank;
588         roots[count].index = roots[i].index;
589         count++;
590       }
591     }
592   }
593 
594   ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
595   ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
596   ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
597   PetscFunctionReturn(0);
598 }
599 
600 /*@
601    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
602 
603    Collective
604 
605    Input Arguments:
606 +  sf - communication object to duplicate
607 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
608 
609    Output Arguments:
610 .  newsf - new communication object
611 
612    Level: beginner
613 
614 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
615 @*/
616 PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
617 {
618   PetscSFType    type;
619   PetscErrorCode ierr;
620 
621   PetscFunctionBegin;
622   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
623   PetscValidLogicalCollectiveEnum(sf,opt,2);
624   PetscValidPointer(newsf,3);
625   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
626   ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
627   if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
628   if (opt == PETSCSF_DUPLICATE_GRAPH) {
629     PetscSFCheckGraphSet(sf,1);
630     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
631       PetscInt          nroots,nleaves;
632       const PetscInt    *ilocal;
633       const PetscSFNode *iremote;
634       ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
635       ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
636     } else {
637       ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
638     }
639   }
640   if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
641   PetscFunctionReturn(0);
642 }
643 
644 /*@C
645    PetscSFGetGraph - Get the graph specifying a parallel star forest
646 
647    Not Collective
648 
649    Input Arguments:
650 .  sf - star forest
651 
652    Output Arguments:
653 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
654 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
655 .  ilocal - locations of leaves in leafdata buffers
656 -  iremote - remote locations of root vertices for each leaf on the current process
657 
658    Notes:
659    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
660 
661    When called from Fortran, the returned iremote array is a copy and must deallocated after use. Consequently, if you
662    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
663 
664    Level: intermediate
665 
666 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
667 @*/
668 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
669 {
670   PetscErrorCode ierr;
671 
672   PetscFunctionBegin;
673   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
674   if (sf->ops->GetGraph) {
675     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
676   } else {
677     if (nroots) *nroots = sf->nroots;
678     if (nleaves) *nleaves = sf->nleaves;
679     if (ilocal) *ilocal = sf->mine;
680     if (iremote) *iremote = sf->remote;
681   }
682   PetscFunctionReturn(0);
683 }
684 
685 /*@
686    PetscSFGetLeafRange - Get the active leaf ranges
687 
688    Not Collective
689 
690    Input Arguments:
691 .  sf - star forest
692 
693    Output Arguments:
694 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
695 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
696 
697    Level: developer
698 
699 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
700 @*/
701 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
702 {
703 
704   PetscFunctionBegin;
705   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
706   PetscSFCheckGraphSet(sf,1);
707   if (minleaf) *minleaf = sf->minleaf;
708   if (maxleaf) *maxleaf = sf->maxleaf;
709   PetscFunctionReturn(0);
710 }
711 
712 /*@C
713    PetscSFViewFromOptions - View from Options
714 
715    Collective on PetscSF
716 
717    Input Parameters:
718 +  A - the star forest
719 .  obj - Optional object
720 -  name - command line option
721 
722    Level: intermediate
723 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
724 @*/
725 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
726 {
727   PetscErrorCode ierr;
728 
729   PetscFunctionBegin;
730   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
731   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
732   PetscFunctionReturn(0);
733 }
734 
735 /*@C
736    PetscSFView - view a star forest
737 
738    Collective
739 
740    Input Arguments:
741 +  sf - star forest
742 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
743 
744    Level: beginner
745 
746 .seealso: PetscSFCreate(), PetscSFSetGraph()
747 @*/
748 PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
749 {
750   PetscErrorCode    ierr;
751   PetscBool         iascii;
752   PetscViewerFormat format;
753 
754   PetscFunctionBegin;
755   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
756   if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
757   PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
758   PetscCheckSameComm(sf,1,viewer,2);
759   if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
760   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
761   if (iascii) {
762     PetscMPIInt rank;
763     PetscInt    ii,i,j;
764 
765     ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
766     ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
767     if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
768     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
769       if (!sf->graphset) {
770         ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
771         ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
772         PetscFunctionReturn(0);
773       }
774       ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
775       ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
776       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
777       for (i=0; i<sf->nleaves; i++) {
778         ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
779       }
780       ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
781       ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
782       if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
783         PetscMPIInt *tmpranks,*perm;
784         ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
785         ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
786         for (i=0; i<sf->nranks; i++) perm[i] = i;
787         ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
788         ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
789         for (ii=0; ii<sf->nranks; ii++) {
790           i = perm[ii];
791           ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
792           for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
793             ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
794           }
795         }
796         ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
797       }
798       ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
799       ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
800     }
801     ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
802   }
803   PetscFunctionReturn(0);
804 }
805 
806 /*@C
807    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
808 
809    Not Collective
810 
811    Input Arguments:
812 .  sf - star forest
813 
814    Output Arguments:
815 +  nranks - number of ranks referenced by local part
816 .  ranks - array of ranks
817 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
818 .  rmine - concatenated array holding local indices referencing each remote rank
819 -  rremote - concatenated array holding remote indices referenced for each remote rank
820 
821    Level: developer
822 
823 .seealso: PetscSFGetLeafRanks()
824 @*/
825 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
826 {
827   PetscErrorCode ierr;
828 
829   PetscFunctionBegin;
830   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
831   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
832   if (sf->ops->GetRootRanks) {
833     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
834   } else {
835     /* The generic implementation */
836     if (nranks)  *nranks  = sf->nranks;
837     if (ranks)   *ranks   = sf->ranks;
838     if (roffset) *roffset = sf->roffset;
839     if (rmine)   *rmine   = sf->rmine;
840     if (rremote) *rremote = sf->rremote;
841   }
842   PetscFunctionReturn(0);
843 }
844 
845 /*@C
846    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
847 
848    Not Collective
849 
850    Input Arguments:
851 .  sf - star forest
852 
853    Output Arguments:
854 +  niranks - number of leaf ranks referencing roots on this process
855 .  iranks - array of ranks
856 .  ioffset - offset in irootloc for each rank (length niranks+1)
857 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
858 
859    Level: developer
860 
861 .seealso: PetscSFGetRootRanks()
862 @*/
863 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
864 {
865   PetscErrorCode ierr;
866 
867   PetscFunctionBegin;
868   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
869   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
870   if (sf->ops->GetLeafRanks) {
871     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
872   } else {
873     PetscSFType type;
874     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
875     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
876   }
877   PetscFunctionReturn(0);
878 }
879 
880 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
881   PetscInt i;
882   for (i=0; i<n; i++) {
883     if (needle == list[i]) return PETSC_TRUE;
884   }
885   return PETSC_FALSE;
886 }
887 
888 /*@C
889    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.
890 
891    Collective
892 
893    Input Arguments:
894 +  sf - PetscSF to set up; PetscSFSetGraph() must have been called
895 -  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)
896 
897    Level: developer
898 
899 .seealso: PetscSFGetRootRanks()
900 @*/
901 PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
902 {
903   PetscErrorCode     ierr;
904   PetscTable         table;
905   PetscTablePosition pos;
906   PetscMPIInt        size,groupsize,*groupranks;
907   PetscInt           *rcount,*ranks;
908   PetscInt           i, irank = -1,orank = -1;
909 
910   PetscFunctionBegin;
911   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
912   PetscSFCheckGraphSet(sf,1);
913   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
914   ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
915   for (i=0; i<sf->nleaves; i++) {
916     /* Log 1-based rank */
917     ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
918   }
919   ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
920   ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
921   ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
922   ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
923   for (i=0; i<sf->nranks; i++) {
924     ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
925     ranks[i]--;             /* Convert back to 0-based */
926   }
927   ierr = PetscTableDestroy(&table);CHKERRQ(ierr);
928 
929   /* We expect that dgroup is reliably "small" while nranks could be large */
930   {
931     MPI_Group group = MPI_GROUP_NULL;
932     PetscMPIInt *dgroupranks;
933     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
934     ierr = MPI_Group_size(dgroup,&groupsize);CHKERRQ(ierr);
935     ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
936     ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
937     for (i=0; i<groupsize; i++) dgroupranks[i] = i;
938     if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRQ(ierr);}
939     ierr = MPI_Group_free(&group);CHKERRQ(ierr);
940     ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
941   }
942 
943   /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
944   for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i; ) {
945     for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
946       if (InList(ranks[i],groupsize,groupranks)) break;
947     }
948     for ( ; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
949       if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
950     }
951     if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
952       PetscInt    tmprank,tmpcount;
953 
954       tmprank             = ranks[i];
955       tmpcount            = rcount[i];
956       ranks[i]            = ranks[sf->ndranks];
957       rcount[i]           = rcount[sf->ndranks];
958       ranks[sf->ndranks]  = tmprank;
959       rcount[sf->ndranks] = tmpcount;
960       sf->ndranks++;
961     }
962   }
963   ierr = PetscFree(groupranks);CHKERRQ(ierr);
964   ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
965   ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
966   sf->roffset[0] = 0;
967   for (i=0; i<sf->nranks; i++) {
968     ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
969     sf->roffset[i+1] = sf->roffset[i] + rcount[i];
970     rcount[i]        = 0;
971   }
972   for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
973     /* short circuit */
974     if (orank != sf->remote[i].rank) {
975       /* Search for index of iremote[i].rank in sf->ranks */
976       ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
977       if (irank < 0) {
978         ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
979         if (irank >= 0) irank += sf->ndranks;
980       }
981       orank = sf->remote[i].rank;
982     }
983     if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
984     sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
985     sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
986     rcount[irank]++;
987   }
988   ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
989   PetscFunctionReturn(0);
990 }
991 
992 /*@C
993    PetscSFGetGroups - gets incoming and outgoing process groups
994 
995    Collective
996 
997    Input Argument:
998 .  sf - star forest
999 
1000    Output Arguments:
1001 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1002 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1003 
1004    Level: developer
1005 
1006 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
1007 @*/
1008 PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
1009 {
1010   PetscErrorCode ierr;
1011   MPI_Group      group = MPI_GROUP_NULL;
1012 
1013   PetscFunctionBegin;
1014   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining groups");
1015   if (sf->ingroup == MPI_GROUP_NULL) {
1016     PetscInt       i;
1017     const PetscInt *indegree;
1018     PetscMPIInt    rank,*outranks,*inranks;
1019     PetscSFNode    *remote;
1020     PetscSF        bgcount;
1021 
1022     /* Compute the number of incoming ranks */
1023     ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
1024     for (i=0; i<sf->nranks; i++) {
1025       remote[i].rank  = sf->ranks[i];
1026       remote[i].index = 0;
1027     }
1028     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
1029     ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1030     ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
1031     ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
1032 
1033     /* Enumerate the incoming ranks */
1034     ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
1035     ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
1036     for (i=0; i<sf->nranks; i++) outranks[i] = rank;
1037     ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
1038     ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
1039     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
1040     ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRQ(ierr);
1041     ierr = MPI_Group_free(&group);CHKERRQ(ierr);
1042     ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
1043     ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
1044   }
1045   *incoming = sf->ingroup;
1046 
1047   if (sf->outgroup == MPI_GROUP_NULL) {
1048     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
1049     ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRQ(ierr);
1050     ierr = MPI_Group_free(&group);CHKERRQ(ierr);
1051   }
1052   *outgoing = sf->outgroup;
1053   PetscFunctionReturn(0);
1054 }
1055 
1056 /*@
1057    PetscSFGetMultiSF - gets the inner SF implemeting gathers and scatters
1058 
1059    Collective
1060 
1061    Input Argument:
1062 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1063 
1064    Output Arguments:
1065 .  multi - star forest with split roots, such that each root has degree exactly 1
1066 
1067    Level: developer
1068 
1069    Notes:
1070 
1071    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1072    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1073    edge, it is a candidate for future optimization that might involve its removal.
1074 
1075 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1076 @*/
1077 PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
1078 {
1079   PetscErrorCode ierr;
1080 
1081   PetscFunctionBegin;
1082   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1083   PetscValidPointer(multi,2);
1084   if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
1085     ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1086     *multi = sf->multi;
1087     PetscFunctionReturn(0);
1088   }
1089   if (!sf->multi) {
1090     const PetscInt *indegree;
1091     PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
1092     PetscSFNode    *remote;
1093     maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
1094     ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
1095     ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
1096     ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
1097     inoffset[0] = 0;
1098     for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
1099     for (i=0; i<maxlocal; i++) outones[i] = 1;
1100     ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1101     ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1102     for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
1103 #if 0
1104 #if defined(PETSC_USE_DEBUG)                                 /* Check that the expected number of increments occurred */
1105     for (i=0; i<sf->nroots; i++) {
1106       if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
1107     }
1108 #endif
1109 #endif
1110     ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
1111     for (i=0; i<sf->nleaves; i++) {
1112       remote[i].rank  = sf->remote[i].rank;
1113       remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
1114     }
1115     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1116     ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1117     if (sf->rankorder) {        /* Sort the ranks */
1118       PetscMPIInt rank;
1119       PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
1120       PetscSFNode *newremote;
1121       ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
1122       for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
1123       ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
1124       for (i=0; i<maxlocal; i++) outranks[i] = rank;
1125       ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
1126       ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
1127       /* Sort the incoming ranks at each vertex, build the inverse map */
1128       for (i=0; i<sf->nroots; i++) {
1129         PetscInt j;
1130         for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
1131         ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
1132         for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
1133       }
1134       ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
1135       ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
1136       ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
1137       for (i=0; i<sf->nleaves; i++) {
1138         newremote[i].rank  = sf->remote[i].rank;
1139         newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
1140       }
1141       ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1142       ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
1143     }
1144     ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
1145   }
1146   *multi = sf->multi;
1147   PetscFunctionReturn(0);
1148 }
1149 
1150 /*@C
1151    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1152 
1153    Collective
1154 
1155    Input Arguments:
1156 +  sf - original star forest
1157 .  nselected  - number of selected roots on this process
1158 -  selected   - indices of the selected roots on this process
1159 
1160    Output Arguments:
1161 .  newsf - new star forest
1162 
1163    Level: advanced
1164 
1165    Note:
1166    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1167    be done by calling PetscSFGetGraph().
1168 
1169 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1170 @*/
1171 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1172 {
1173   PetscInt          i,n,*roots,*rootdata,*leafdata,nroots,nleaves,connected_leaves,*new_ilocal;
1174   const PetscSFNode *iremote;
1175   PetscSFNode       *new_iremote;
1176   PetscSF           tmpsf;
1177   MPI_Comm          comm;
1178   PetscErrorCode    ierr;
1179 
1180   PetscFunctionBegin;
1181   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1182   PetscSFCheckGraphSet(sf,1);
1183   if (nselected) PetscValidPointer(selected,3);
1184   PetscValidPointer(newsf,4);
1185 
1186   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1187   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1188   /* Uniq selected[] and put results in roots[] */
1189   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1190   ierr = PetscMalloc1(nselected,&roots);CHKERRQ(ierr);
1191   ierr = PetscArraycpy(roots,selected,nselected);CHKERRQ(ierr);
1192   ierr = PetscSortedRemoveDupsInt(&nselected,roots);CHKERRQ(ierr);
1193   if (nselected && (roots[0] < 0 || roots[nselected-1] >= sf->nroots)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max root indices %D/%D are not in [0,%D)",roots[0],roots[nselected-1],sf->nroots);
1194 
1195   if (sf->ops->CreateEmbeddedSF) {
1196     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,roots,newsf);CHKERRQ(ierr);
1197   } else {
1198     /* A generic version of creating embedded sf. Note that we called PetscSFSetGraph() twice, which is certainly expensive */
1199     /* Find out which leaves (not leaf data items) are still connected to roots in the embedded sf */
1200     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,NULL,&iremote);CHKERRQ(ierr);
1201     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&tmpsf);CHKERRQ(ierr);
1202     ierr = PetscSFSetGraph(tmpsf,nroots,nleaves,NULL/*contiguous*/,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);CHKERRQ(ierr);
1203     ierr = PetscCalloc2(nroots,&rootdata,nleaves,&leafdata);CHKERRQ(ierr);
1204     for (i=0; i<nselected; ++i) rootdata[roots[i]] = 1;
1205     ierr = PetscSFBcastBegin(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1206     ierr = PetscSFBcastEnd(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1207     ierr = PetscSFDestroy(&tmpsf);CHKERRQ(ierr);
1208 
1209     /* Build newsf with leaves that are still connected */
1210     connected_leaves = 0;
1211     for (i=0; i<nleaves; ++i) connected_leaves += leafdata[i];
1212     ierr = PetscMalloc1(connected_leaves,&new_ilocal);CHKERRQ(ierr);
1213     ierr = PetscMalloc1(connected_leaves,&new_iremote);CHKERRQ(ierr);
1214     for (i=0, n=0; i<nleaves; ++i) {
1215       if (leafdata[i]) {
1216         new_ilocal[n]        = sf->mine ? sf->mine[i] : i;
1217         new_iremote[n].rank  = sf->remote[i].rank;
1218         new_iremote[n].index = sf->remote[i].index;
1219         ++n;
1220       }
1221     }
1222 
1223     if (n != connected_leaves) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"There is a size mismatch in the SF embedding, %d != %d",n,connected_leaves);
1224     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1225     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1226     ierr = PetscSFSetGraph(*newsf,nroots,connected_leaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1227     ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
1228   }
1229   ierr = PetscFree(roots);CHKERRQ(ierr);
1230   ierr = PetscSFSetUp(*newsf);CHKERRQ(ierr);
1231   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1232   PetscFunctionReturn(0);
1233 }
1234 
1235 /*@C
1236   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1237 
1238   Collective
1239 
1240   Input Arguments:
1241 + sf - original star forest
1242 . nselected  - number of selected leaves on this process
1243 - selected   - indices of the selected leaves on this process
1244 
1245   Output Arguments:
1246 .  newsf - new star forest
1247 
1248   Level: advanced
1249 
1250 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1251 @*/
1252 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1253 {
1254   const PetscSFNode *iremote;
1255   PetscSFNode       *new_iremote;
1256   const PetscInt    *ilocal;
1257   PetscInt          i,nroots,*leaves,*new_ilocal;
1258   MPI_Comm          comm;
1259   PetscErrorCode    ierr;
1260 
1261   PetscFunctionBegin;
1262   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1263   PetscSFCheckGraphSet(sf,1);
1264   if (nselected) PetscValidPointer(selected,3);
1265   PetscValidPointer(newsf,4);
1266 
1267   /* Uniq selected[] and put results in leaves[] */
1268   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1269   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1270   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1271   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1272   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1273 
1274   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1275   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1276     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1277   } else {
1278     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1279     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1280     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1281     for (i=0; i<nselected; ++i) {
1282       const PetscInt l     = leaves[i];
1283       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1284       new_iremote[i].rank  = iremote[l].rank;
1285       new_iremote[i].index = iremote[l].index;
1286     }
1287     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1288     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1289     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1290   }
1291   ierr = PetscFree(leaves);CHKERRQ(ierr);
1292   PetscFunctionReturn(0);
1293 }
1294 
1295 /*@C
1296    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1297 
1298    Collective on PetscSF
1299 
1300    Input Arguments:
1301 +  sf - star forest on which to communicate
1302 .  unit - data type associated with each node
1303 .  rootdata - buffer to broadcast
1304 -  op - operation to use for reduction
1305 
1306    Output Arguments:
1307 .  leafdata - buffer to be reduced with values from each leaf's respective root
1308 
1309    Level: intermediate
1310 
1311 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd()
1312 @*/
1313 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1314 {
1315   PetscErrorCode ierr;
1316   PetscMemType   rootmtype,leafmtype;
1317 
1318   PetscFunctionBegin;
1319   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1320   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1321   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1322   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1323   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1324 #if defined(PETSC_HAVE_CUDA)
1325   /*  Shall we assume rootdata, leafdata are ready to use (instead of being computed by some asynchronous kernels)?
1326     To be similar to MPI, I'd like to have this assumption, since MPI does not have a concept of stream.
1327     But currently this assumption is not enforecd in Petsc. To be safe, I do synchronization here. Otherwise, if
1328     we do not sync now and call the Pack kernel directly on the default NULL stream (assume petsc objects are also
1329     computed on it), we have to sync the NULL stream before calling MPI routines. So, it looks a cudaDeviceSynchronize
1330     is inevitable. We do it now and put pack/unpack kernels to non-NULL streams.
1331    */
1332   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1333 #endif
1334   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1335   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1336   PetscFunctionReturn(0);
1337 }
1338 
1339 /*@C
1340    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1341 
1342    Collective
1343 
1344    Input Arguments:
1345 +  sf - star forest
1346 .  unit - data type
1347 .  rootdata - buffer to broadcast
1348 -  op - operation to use for reduction
1349 
1350    Output Arguments:
1351 .  leafdata - buffer to be reduced with values from each leaf's respective root
1352 
1353    Level: intermediate
1354 
1355 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1356 @*/
1357 PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1358 {
1359   PetscErrorCode ierr;
1360   PetscMemType   rootmtype,leafmtype;
1361 
1362   PetscFunctionBegin;
1363   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1364   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1365   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1366   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1367   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1368   ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1369   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1370   PetscFunctionReturn(0);
1371 }
1372 
1373 /*@C
1374    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1375 
1376    Collective
1377 
1378    Input Arguments:
1379 +  sf - star forest
1380 .  unit - data type
1381 .  leafdata - values to reduce
1382 -  op - reduction operation
1383 
1384    Output Arguments:
1385 .  rootdata - result of reduction of values from all leaves of each root
1386 
1387    Level: intermediate
1388 
1389 .seealso: PetscSFBcastBegin()
1390 @*/
1391 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1392 {
1393   PetscErrorCode ierr;
1394   PetscMemType   rootmtype,leafmtype;
1395 
1396   PetscFunctionBegin;
1397   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1398   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1399   ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1400   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1401   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1402 #if defined(PETSC_HAVE_CUDA)
1403   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1404 #endif
1405   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1406   ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1407   PetscFunctionReturn(0);
1408 }
1409 
1410 /*@C
1411    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1412 
1413    Collective
1414 
1415    Input Arguments:
1416 +  sf - star forest
1417 .  unit - data type
1418 .  leafdata - values to reduce
1419 -  op - reduction operation
1420 
1421    Output Arguments:
1422 .  rootdata - result of reduction of values from all leaves of each root
1423 
1424    Level: intermediate
1425 
1426 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1427 @*/
1428 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1429 {
1430   PetscErrorCode ierr;
1431   PetscMemType   rootmtype,leafmtype;
1432 
1433   PetscFunctionBegin;
1434   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1435   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1436   ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1437   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1438   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1439   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1440   ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1441   PetscFunctionReturn(0);
1442 }
1443 
1444 /*@C
1445    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1446 
1447    Collective
1448 
1449    Input Arguments:
1450 +  sf - star forest
1451 .  unit - data type
1452 .  leafdata - leaf values to use in reduction
1453 -  op - operation to use for reduction
1454 
1455    Output Arguments:
1456 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1457 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1458 
1459    Level: advanced
1460 
1461    Note:
1462    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1463    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1464    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1465    integers.
1466 
1467 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1468 @*/
1469 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1470 {
1471   PetscErrorCode ierr;
1472   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1473 
1474   PetscFunctionBegin;
1475   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1476   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1477   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1478   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1479   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1480   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1481   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1482 #if defined(PETSC_HAVE_CUDA)
1483   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1484 #endif
1485   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1486   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1487   PetscFunctionReturn(0);
1488 }
1489 
1490 /*@C
1491    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1492 
1493    Collective
1494 
1495    Input Arguments:
1496 +  sf - star forest
1497 .  unit - data type
1498 .  leafdata - leaf values to use in reduction
1499 -  op - operation to use for reduction
1500 
1501    Output Arguments:
1502 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1503 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1504 
1505    Level: advanced
1506 
1507 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1508 @*/
1509 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1510 {
1511   PetscErrorCode ierr;
1512   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1513 
1514   PetscFunctionBegin;
1515   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1516   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1517   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1518   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1519   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1520   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1521   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1522   ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1523   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1524   PetscFunctionReturn(0);
1525 }
1526 
1527 /*@C
1528    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()
1529 
1530    Collective
1531 
1532    Input Arguments:
1533 .  sf - star forest
1534 
1535    Output Arguments:
1536 .  degree - degree of each root vertex
1537 
1538    Level: advanced
1539 
1540    Notes:
1541    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1542 
1543 .seealso: PetscSFGatherBegin()
1544 @*/
1545 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
1546 {
1547   PetscErrorCode ierr;
1548 
1549   PetscFunctionBegin;
1550   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1551   PetscSFCheckGraphSet(sf,1);
1552   PetscValidPointer(degree,2);
1553   if (!sf->degreeknown) {
1554     PetscInt i, nroots = sf->nroots, maxlocal = sf->maxleaf+1;  /* TODO: We should use PetscSFGetLeafRange() */
1555     if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1556     ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
1557     ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1558     for (i=0; i<nroots; i++) sf->degree[i] = 0;
1559     for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
1560     ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
1561   }
1562   *degree = NULL;
1563   PetscFunctionReturn(0);
1564 }
1565 
1566 /*@C
1567    PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()
1568 
1569    Collective
1570 
1571    Input Arguments:
1572 .  sf - star forest
1573 
1574    Output Arguments:
1575 .  degree - degree of each root vertex
1576 
1577    Level: developer
1578 
1579    Notes:
1580    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1581 
1582 .seealso:
1583 @*/
1584 PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
1585 {
1586   PetscErrorCode ierr;
1587 
1588   PetscFunctionBegin;
1589   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1590   PetscSFCheckGraphSet(sf,1);
1591   PetscValidPointer(degree,2);
1592   if (!sf->degreeknown) {
1593     if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
1594     ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
1595     ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
1596     sf->degreeknown = PETSC_TRUE;
1597   }
1598   *degree = sf->degree;
1599   PetscFunctionReturn(0);
1600 }
1601 
1602 
1603 /*@C
1604    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1605    Each multi-root is assigned index of the corresponding original root.
1606 
1607    Collective
1608 
1609    Input Arguments:
1610 +  sf - star forest
1611 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1612 
1613    Output Arguments:
1614 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1615 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1616 
1617    Level: developer
1618 
1619    Notes:
1620    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1621 
1622 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1623 @*/
1624 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1625 {
1626   PetscSF             msf;
1627   PetscInt            i, j, k, nroots, nmroots;
1628   PetscErrorCode      ierr;
1629 
1630   PetscFunctionBegin;
1631   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1632   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1633   if (nroots) PetscValidIntPointer(degree,2);
1634   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1635   PetscValidPointer(multiRootsOrigNumbering,4);
1636   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1637   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1638   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1639   for (i=0,j=0,k=0; i<nroots; i++) {
1640     if (!degree[i]) continue;
1641     for (j=0; j<degree[i]; j++,k++) {
1642       (*multiRootsOrigNumbering)[k] = i;
1643     }
1644   }
1645 #if defined(PETSC_USE_DEBUG)
1646   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1647 #endif
1648   if (nMultiRoots) *nMultiRoots = nmroots;
1649   PetscFunctionReturn(0);
1650 }
1651 
1652 /*@C
1653    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1654 
1655    Collective
1656 
1657    Input Arguments:
1658 +  sf - star forest
1659 .  unit - data type
1660 -  leafdata - leaf data to gather to roots
1661 
1662    Output Argument:
1663 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1664 
1665    Level: intermediate
1666 
1667 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1668 @*/
1669 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1670 {
1671   PetscErrorCode ierr;
1672   PetscSF        multi;
1673 
1674   PetscFunctionBegin;
1675   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1676   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1677   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1678   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1679   PetscFunctionReturn(0);
1680 }
1681 
1682 /*@C
1683    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1684 
1685    Collective
1686 
1687    Input Arguments:
1688 +  sf - star forest
1689 .  unit - data type
1690 -  leafdata - leaf data to gather to roots
1691 
1692    Output Argument:
1693 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1694 
1695    Level: intermediate
1696 
1697 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1698 @*/
1699 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1700 {
1701   PetscErrorCode ierr;
1702   PetscSF        multi;
1703 
1704   PetscFunctionBegin;
1705   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1706   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1707   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1708   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1709   PetscFunctionReturn(0);
1710 }
1711 
1712 /*@C
1713    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1714 
1715    Collective
1716 
1717    Input Arguments:
1718 +  sf - star forest
1719 .  unit - data type
1720 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1721 
1722    Output Argument:
1723 .  leafdata - leaf data to be update with personal data from each respective root
1724 
1725    Level: intermediate
1726 
1727 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1728 @*/
1729 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1730 {
1731   PetscErrorCode ierr;
1732   PetscSF        multi;
1733 
1734   PetscFunctionBegin;
1735   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1736   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1737   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1738   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1739   PetscFunctionReturn(0);
1740 }
1741 
1742 /*@C
1743    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1744 
1745    Collective
1746 
1747    Input Arguments:
1748 +  sf - star forest
1749 .  unit - data type
1750 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1751 
1752    Output Argument:
1753 .  leafdata - leaf data to be update with personal data from each respective root
1754 
1755    Level: intermediate
1756 
1757 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1758 @*/
1759 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1760 {
1761   PetscErrorCode ierr;
1762   PetscSF        multi;
1763 
1764   PetscFunctionBegin;
1765   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1766   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1767   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1768   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1769   PetscFunctionReturn(0);
1770 }
1771 
1772 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1773 {
1774 #if defined(PETSC_USE_DEBUG)
1775   PetscInt        i, n, nleaves;
1776   const PetscInt *ilocal = NULL;
1777   PetscHSetI      seen;
1778   PetscErrorCode  ierr;
1779 
1780   PetscFunctionBegin;
1781   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1782   ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1783   for (i = 0; i < nleaves; i++) {
1784     const PetscInt leaf = ilocal ? ilocal[i] : i;
1785     ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1786   }
1787   ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1788   if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1789   ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1790   PetscFunctionReturn(0);
1791 #else
1792   PetscFunctionBegin;
1793   PetscFunctionReturn(0);
1794 #endif
1795 }
1796 
1797 /*@
1798   PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view
1799 
1800   Input Parameters:
1801 + sfA - The first PetscSF
1802 - sfB - The second PetscSF
1803 
1804   Output Parameters:
1805 . sfBA - The composite SF
1806 
1807   Level: developer
1808 
1809   Notes:
1810   Currently, the two SFs must be defined on congruent communicators and they must be true star
1811   forests, i.e. the same leaf is not connected with different roots.
1812 
1813   sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
1814   a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
1815   nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
1816   Bcast on sfA, then a Bcast on sfB, on connected nodes.
1817 
1818 .seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
1819 @*/
1820 PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
1821 {
1822   PetscErrorCode    ierr;
1823   const PetscSFNode *remotePointsA,*remotePointsB;
1824   PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
1825   const PetscInt    *localPointsA,*localPointsB;
1826   PetscInt          *localPointsBA;
1827   PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
1828   PetscBool         denseB;
1829 
1830   PetscFunctionBegin;
1831   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
1832   PetscSFCheckGraphSet(sfA,1);
1833   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
1834   PetscSFCheckGraphSet(sfB,2);
1835   PetscCheckSameComm(sfA,1,sfB,2);
1836   PetscValidPointer(sfBA,3);
1837   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
1838   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
1839 
1840   ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
1841   ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
1842   if (localPointsA) {
1843     ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
1844     for (i=0; i<numRootsB; i++) {
1845       reorderedRemotePointsA[i].rank = -1;
1846       reorderedRemotePointsA[i].index = -1;
1847     }
1848     for (i=0; i<numLeavesA; i++) {
1849       if (localPointsA[i] >= numRootsB) continue;
1850       reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
1851     }
1852     remotePointsA = reorderedRemotePointsA;
1853   }
1854   ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
1855   ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
1856   ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
1857   ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
1858   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
1859 
1860   denseB = (PetscBool)!localPointsB;
1861   for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
1862     if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
1863     else numLeavesBA++;
1864   }
1865   if (denseB) {
1866     localPointsBA  = NULL;
1867     remotePointsBA = leafdataB;
1868   } else {
1869     ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
1870     ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
1871     for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
1872       const PetscInt l = localPointsB ? localPointsB[i] : i;
1873 
1874       if (leafdataB[l-minleaf].rank == -1) continue;
1875       remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
1876       localPointsBA[numLeavesBA] = l;
1877       numLeavesBA++;
1878     }
1879     ierr = PetscFree(leafdataB);CHKERRQ(ierr);
1880   }
1881   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
1882   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
1883   PetscFunctionReturn(0);
1884 }
1885 
1886 /*@
1887   PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one
1888 
1889   Input Parameters:
1890 + sfA - The first PetscSF
1891 - sfB - The second PetscSF
1892 
1893   Output Parameters:
1894 . sfBA - The composite SF.
1895 
1896   Level: developer
1897 
1898   Notes:
1899   Currently, the two SFs must be defined on congruent communicators and they must be true star
1900   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
1901   second SF must have a degree of 1, i.e., no roots have more than one leaf connected.
1902 
1903   sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
1904   a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
1905   roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
1906   a Reduce on sfB, on connected roots.
1907 
1908 .seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
1909 @*/
1910 PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
1911 {
1912   PetscErrorCode    ierr;
1913   const PetscSFNode *remotePointsA,*remotePointsB;
1914   PetscSFNode       *remotePointsBA;
1915   const PetscInt    *localPointsA,*localPointsB;
1916   PetscSFNode       *reorderedRemotePointsA = NULL;
1917   PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
1918 
1919   PetscFunctionBegin;
1920   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
1921   PetscSFCheckGraphSet(sfA,1);
1922   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
1923   PetscSFCheckGraphSet(sfB,2);
1924   PetscCheckSameComm(sfA,1,sfB,2);
1925   PetscValidPointer(sfBA,3);
1926   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
1927   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
1928   /* TODO: Check roots of sfB have degree of 1 */
1929 
1930   ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
1931   ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);
1932   ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
1933   ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
1934   for (i=0; i<maxleaf - minleaf + 1; i++) {
1935     reorderedRemotePointsA[i].rank = -1;
1936     reorderedRemotePointsA[i].index = -1;
1937   }
1938   if (localPointsA) {
1939     for (i=0; i<numLeavesA; i++) {
1940       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
1941       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
1942     }
1943   } else {
1944     for (i=0; i<numLeavesA; i++) {
1945       if (i > maxleaf || i < minleaf) continue;
1946       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
1947     }
1948   }
1949 
1950   ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
1951   ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
1952   for (i=0; i<numRootsB; i++) {
1953     remotePointsBA[i].rank = -1;
1954     remotePointsBA[i].index = -1;
1955   }
1956 
1957   /* Once we implement the TODO above (check all roots of sfB have degree of 1), we can replace the MPI_MAXLOC
1958      with MPIU_REPLACE. In that case, MPI_MAXLOC and MPIU_REPLACE have the same effect.
1959    */
1960   ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,MPI_MAXLOC);CHKERRQ(ierr);
1961   ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,MPI_MAXLOC);CHKERRQ(ierr);
1962   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
1963   for (i=0,numLeavesBA=0; i<numRootsB; i++) {
1964     if (remotePointsBA[i].rank == -1) continue;
1965     remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
1966     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
1967     localPointsBA[numLeavesBA] = i;
1968     numLeavesBA++;
1969   }
1970   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
1971   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
1972   PetscFunctionReturn(0);
1973 }
1974 
1975 /*
1976   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
1977 
1978   Input Parameters:
1979 . sf - The global PetscSF
1980 
1981   Output Parameters:
1982 . out - The local PetscSF
1983  */
1984 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
1985 {
1986   MPI_Comm           comm;
1987   PetscMPIInt        myrank;
1988   const PetscInt     *ilocal;
1989   const PetscSFNode  *iremote;
1990   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
1991   PetscSFNode        *liremote;
1992   PetscSF            lsf;
1993   PetscErrorCode     ierr;
1994 
1995   PetscFunctionBegin;
1996   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1997   if (sf->ops->CreateLocalSF) {
1998     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
1999   } else {
2000     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2001     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
2002     ierr = MPI_Comm_rank(comm,&myrank);CHKERRQ(ierr);
2003 
2004     /* Find out local edges and build a local SF */
2005     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
2006     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
2007     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
2008     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
2009 
2010     for (i=j=0; i<nleaves; i++) {
2011       if (iremote[i].rank == (PetscInt)myrank) {
2012         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2013         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
2014         liremote[j].index = iremote[i].index;
2015         j++;
2016       }
2017     }
2018     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
2019     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
2020     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
2021     *out = lsf;
2022   }
2023   PetscFunctionReturn(0);
2024 }
2025 
2026 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2027 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
2028 {
2029   PetscErrorCode     ierr;
2030   PetscMemType       rootmtype,leafmtype;
2031 
2032   PetscFunctionBegin;
2033   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2034   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
2035   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2036   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
2037   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
2038   if (sf->ops->BcastToZero) {
2039     ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
2040   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
2041   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2042   PetscFunctionReturn(0);
2043 }
2044 
2045