xref: /petsc/src/vec/is/sf/interface/sf.c (revision c4762a1b19cd2af06abeed90e8f9d34fb975dd94)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
/* PetscSFCheckGraphSet(sf,arg): debug-only guard that raises an error when the graph of
   sf has not been set via PetscSFSetGraph()/PetscSFSetGraphWithPattern(); arg is the
   argument number reported in the error message.  Compiles to a no-op in optimized builds. */
#if defined(PETSC_USE_DEBUG)
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif
13 
14 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",0};
15 
16 /*@
17    PetscSFCreate - create a star forest communication context
18 
19    Collective
20 
21    Input Arguments:
22 .  comm - communicator on which the star forest will operate
23 
24    Output Arguments:
25 .  sf - new star forest context
26 
27    Options Database Keys:
28 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
29 .  -sf_type window    -Use MPI-3 one-sided window for communication
30 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
31 
32    Level: intermediate
33 
34    Notes:
   When one knows the communication graph is one of the predefined graphs, such as MPI_Alltoall, MPI_Allgatherv,
36    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
37    SFs are optimized and they have better performance than general SFs.
38 
39 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
40 @*/
41 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
42 {
43   PetscErrorCode ierr;
44   PetscSF        b;
45 
46   PetscFunctionBegin;
47   PetscValidPointer(sf,2);
48   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
49 
50   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
51 
52   b->nroots    = -1;
53   b->nleaves   = -1;
54   b->minleaf   = PETSC_MAX_INT;
55   b->maxleaf   = PETSC_MIN_INT;
56   b->nranks    = -1;
57   b->rankorder = PETSC_TRUE;
58   b->ingroup   = MPI_GROUP_NULL;
59   b->outgroup  = MPI_GROUP_NULL;
60   b->graphset  = PETSC_FALSE;
61 
62   *sf = b;
63   PetscFunctionReturn(0);
64 }
65 
66 /*@
67    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
68 
69    Collective
70 
71    Input Arguments:
72 .  sf - star forest
73 
74    Level: advanced
75 
76 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
77 @*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the type-specific implementation release its data first, while the graph is still valid */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Return the graph description to its "unset" state (matches PetscSFCreate()) */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  /* mine/remote may alias mine_alloc/remote_alloc; only the *_alloc copies are owned by sf */
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  /* Device-side buffer paired with rmine; freed only if a CUDA backend allocated it */
  if (sf->rmine_d) {cudaError_t err = cudaFree(sf->rmine_d);CHKERRCUDA(err);sf->rmine_d=NULL;}
#endif
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRQ(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRQ(ierr);}
  /* The multi-SF and root layout are derived objects; they are rebuilt as needed after a new graph is set */
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
108 
109 /*@C
110    PetscSFSetType - Set the PetscSF communication implementation
111 
112    Collective on PetscSF
113 
114    Input Parameters:
115 +  sf - the PetscSF context
116 -  type - a known method
117 
118    Options Database Key:
119 .  -sf_type <type> - Sets the method; use -help for a list
120    of available methods (for instance, window, basic, neighbor)
121 
122    Notes:
123    See "include/petscsf.h" for available methods (for instance)
124 +    PETSCSFWINDOW - MPI-2/3 one-sided
125 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
126 
127   Level: intermediate
128 
129 .seealso: PetscSFType, PetscSFCreate()
130 @*/
131 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
132 {
133   PetscErrorCode ierr,(*r)(PetscSF);
134   PetscBool      match;
135 
136   PetscFunctionBegin;
137   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
138   PetscValidCharPointer(type,2);
139 
140   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
141   if (match) PetscFunctionReturn(0);
142 
143   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
144   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
145   /* Destroy the previous PetscSF implementation context */
146   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
147   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
148   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
149   ierr = (*r)(sf);CHKERRQ(ierr);
150   PetscFunctionReturn(0);
151 }
152 
153 /*@C
154   PetscSFGetType - Get the PetscSF communication implementation
155 
156   Not Collective
157 
158   Input Parameter:
159 . sf  - the PetscSF context
160 
161   Output Parameter:
162 . type - the PetscSF type name
163 
164   Level: intermediate
165 
166 .seealso: PetscSFSetType(), PetscSFCreate()
167 @*/
PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
  PetscValidPointer(type,2);
  /* Borrowed reference to the type name; NULL if no type has been set yet */
  *type = ((PetscObject)sf)->type_name;
  PetscFunctionReturn(0);
}
176 
177 /*@
178    PetscSFDestroy - destroy star forest
179 
180    Collective
181 
182    Input Arguments:
183 .  sf - address of star forest
184 
185    Level: intermediate
186 
187 .seealso: PetscSFCreate(), PetscSFReset()
188 @*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0);  /* destroying a NULL SF is a no-op */
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Only the last reference actually destroys the object; otherwise just NULL the caller's pointer */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  /* Reset() already invoked ops->Reset; ops->Destroy frees what remains of the implementation */
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
202 
203 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
204 {
205 #if defined(PETSC_USE_DEBUG)
206   PetscInt           i, nleaves;
207   PetscMPIInt        size;
208   const PetscInt    *ilocal;
209   const PetscSFNode *iremote;
210   PetscErrorCode     ierr;
211 
212   PetscFunctionBegin;
213   if (!sf->graphset) PetscFunctionReturn(0);
214   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
215   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
216   for (i = 0; i < nleaves; i++) {
217     const PetscInt rank = iremote[i].rank;
218     const PetscInt remote = iremote[i].index;
219     const PetscInt leaf = ilocal ? ilocal[i] : i;
220     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
221     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
222     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
223   }
224   PetscFunctionReturn(0);
225 #else
226   PetscFunctionBegin;
227   PetscFunctionReturn(0);
228 #endif
229 }
230 
231 /*@
232    PetscSFSetUp - set up communication structures
233 
234    Collective
235 
236    Input Arguments:
237 .  sf - star forest communication object
238 
239    Level: beginner
240 
241 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
242 @*/
PetscErrorCode PetscSFSetUp(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  if (sf->setupcalled) PetscFunctionReturn(0);  /* setup is performed at most once per graph */
  /* Validate ranks/indices of the graph (active in debug builds only) */
  ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
  /* Fall back to the default implementation if the user never selected one */
  if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);}
  ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
  ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  sf->setupcalled = PETSC_TRUE;
  PetscFunctionReturn(0);
}
259 
260 /*@
261    PetscSFSetFromOptions - set PetscSF options using the options database
262 
263    Logically Collective
264 
265    Input Arguments:
266 .  sf - star forest
267 
268    Options Database Keys:
269 +  -sf_type              - implementation type, see PetscSFSetType()
270 .  -sf_rank_order        - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
-  -sf_use_pinned_buffer - use pinned (non-pageable) memory for send/recv buffers on host when communicating GPU data but GPU-aware MPI is not used.
272                            Only available for SF types of basic and neighbor.
273 
274    Level: intermediate
275 @*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Default to the current type when one has been set, otherwise to the basic implementation */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
  /* Let the implementation process its own options; PetscOptionsObject is declared by PetscObjectOptionsBegin */
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
294 
295 /*@
296    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
297 
298    Logically Collective
299 
300    Input Arguments:
301 +  sf - star forest
302 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
303 
304    Level: advanced
305 
306 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
307 @*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveBool(sf,flg,2);
  /* The multi-SF bakes in the ordering; once it exists the ordering can no longer change */
  if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(0);
}
317 
318 /*@
319    PetscSFSetGraph - Set a parallel star forest
320 
321    Collective
322 
323    Input Arguments:
324 +  sf - star forest
325 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
326 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
327 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
328 during setup in debug mode)
329 .  localmode - copy mode for ilocal
330 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
331 during setup in debug mode)
332 -  remotemode - copy mode for iremote
333 
334    Level: intermediate
335 
336    Notes:
337     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
338 
339    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
340    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
341    needed
342 
343    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
344    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
345 
346 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
347 @*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    /* One pass to compute the leaf range and detect the identity permutation */
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* ilocal is the identity on [0,nleaves): drop it and use implicit contiguous storage
         (see Developers Note in the manual page above) */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* No explicit leaf locations: leaves occupy [0,nleaves) */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  /* Take over ilocal according to localmode; sf->mine_alloc is the owned copy (if any),
     sf->mine is what the rest of the code reads */
  if (ilocal) {
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      /* Caller retains ownership; sf only borrows the array */
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same ownership handling for iremote */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
431 
432 /*@
433   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
434 
435   Collective
436 
437   Input Parameters:
438 + sf      - The PetscSF
439 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
440 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
441 
442   Notes:
443   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
444   n and N are local and global sizes of x respectively.
445 
446   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
447   sequential vectors y on all ranks.
448 
449   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
450   sequential vector y on rank 0.
451 
452   In above cases, entries of x are roots and entries of y are leaves.
453 
454   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
455   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
456   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one cannot send multiple items. One just needs to create a new MPI datatype for the multiple data
458   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
459 
460   In this case, roots and leaves are symmetric.
461 
462   Level: intermediate
463  @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* map is ignored for ALLTOALL; build an internal layout with size roots per rank */
    type = PETSCSFALLTOALL;
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    /* res holds (max n, max -n): after MPI_MAX, res[0] == -res[1] iff n is identical everywhere */
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    /* Every rank receives all N roots as leaves */
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* size leaves and size roots per rank; on rank i, leaf j connects to root i of rank j */
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
529 
530 /*@
531    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
532 
533    Collective
534 
535    Input Arguments:
536 
537 .  sf - star forest to invert
538 
539    Output Arguments:
540 .  isf - inverse of sf
541    Level: advanced
542 
543    Notes:
544    All roots must have degree 1.
545 
546    The local space may be a permutation, but cannot be sparse.
547 
548 .seealso: PetscSFSetGraph()
549 @*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Each leaf slot records its own (rank,index) so the reduction below tells every root which leaf points at it */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* Roots start at (-1,-1); entries still negative after the reduction were not referenced by any leaf */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  /* Push leaf identities to the roots; roots have degree at most 1 (see manual page), so MPIU_REPLACE is well defined */
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* dense: the inverse can use implicit contiguous leaf storage */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* Build the inverse: old leaf space (maxlocal entries) becomes the root space */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
599 
600 /*@
601    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
602 
603    Collective
604 
605    Input Arguments:
606 +  sf - communication object to duplicate
607 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
608 
609    Output Arguments:
610 .  newsf - new communication object
611 
612    Level: beginner
613 
614 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
615 @*/
PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
{
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,opt,2);
  PetscValidPointer(newsf,3);
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
  /* Preserve the implementation type, if one has been set */
  ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
  if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf,1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graphs are deep-copied */
      PetscInt          nroots,nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
      ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
    } else {
      /* Patterned graphs are reconstructed from the layout and the pattern */
      ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
    }
  }
  /* Let the implementation copy its own data, honoring opt */
  if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
643 
644 /*@C
645    PetscSFGetGraph - Get the graph specifying a parallel star forest
646 
647    Not Collective
648 
649    Input Arguments:
650 .  sf - star forest
651 
652    Output Arguments:
653 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
654 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
655 .  ilocal - locations of leaves in leafdata buffers
656 -  iremote - remote locations of root vertices for each leaf on the current process
657 
658    Notes:
659    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
660 
661    When called from Fortran, the returned iremote array is a copy and must be deallocated after use. Consequently, if you
662    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
663 
664    Level: intermediate
665 
666 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
667 @*/
668 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
669 {
670   PetscErrorCode ierr;
671 
672   PetscFunctionBegin;
673   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
674   if (sf->ops->GetGraph) {
675     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
676   } else {
677     if (nroots) *nroots = sf->nroots;
678     if (nleaves) *nleaves = sf->nleaves;
679     if (ilocal) *ilocal = sf->mine;
680     if (iremote) *iremote = sf->remote;
681   }
682   PetscFunctionReturn(0);
683 }
684 
685 /*@
686    PetscSFGetLeafRange - Get the active leaf ranges
687 
688    Not Collective
689 
690    Input Arguments:
691 .  sf - star forest
692 
693    Output Arguments:
694 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
695 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
696 
697    Level: developer
698 
699 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
700 @*/
PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
{

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  /* Values are cached when the graph is set; both outputs are optional */
  if (minleaf) *minleaf = sf->minleaf;
  if (maxleaf) *maxleaf = sf->maxleaf;
  PetscFunctionReturn(0);
}
711 
712 /*@C
713    PetscSFViewFromOptions - View from Options
714 
715    Collective on PetscSF
716 
717    Input Parameters:
718 +  A - the star forest
719 .  obj - Optional object
720 -  name - command line option
721 
722    Level: intermediate
723 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
724 @*/
PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
  /* Thin wrapper: generic PetscObject machinery parses the option and drives the viewer */
  ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
734 
735 /*@C
736    PetscSFView - view a star forest
737 
738    Collective
739 
740    Input Arguments:
741 +  sf - star forest
742 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
743 
744    Level: beginner
745 
746 .seealso: PetscSFCreate(), PetscSFSetGraph()
747 @*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to stdout on the SF's communicator */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Ensure derived data (ranks, offsets) exists before printing it */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      /* One line per leaf: local location <- (remote rank, remote index) */
      for (i=0; i<sf->nleaves; i++) {
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Sort a copy of the rank list so per-rank edge lists print in rank order */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
805 
806 /*@C
807    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
808 
809    Not Collective
810 
811    Input Arguments:
812 .  sf - star forest
813 
814    Output Arguments:
815 +  nranks - number of ranks referenced by local part
816 .  ranks - array of ranks
817 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
818 .  rmine - concatenated array holding local indices referencing each remote rank
819 -  rremote - concatenated array holding remote indices referenced for each remote rank
820 
821    Level: developer
822 
823 .seealso: PetscSFGetLeafRanks()
824 @*/
825 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
826 {
827   PetscErrorCode ierr;
828 
829   PetscFunctionBegin;
830   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
831   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
832   if (sf->ops->GetRootRanks) {
833     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
834   } else {
835     /* The generic implementation */
836     if (nranks)  *nranks  = sf->nranks;
837     if (ranks)   *ranks   = sf->ranks;
838     if (roffset) *roffset = sf->roffset;
839     if (rmine)   *rmine   = sf->rmine;
840     if (rremote) *rremote = sf->rremote;
841   }
842   PetscFunctionReturn(0);
843 }
844 
845 /*@C
846    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
847 
848    Not Collective
849 
850    Input Arguments:
851 .  sf - star forest
852 
853    Output Arguments:
854 +  niranks - number of leaf ranks referencing roots on this process
855 .  iranks - array of ranks
856 .  ioffset - offset in irootloc for each rank (length niranks+1)
857 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
858 
859    Level: developer
860 
861 .seealso: PetscSFGetRootRanks()
862 @*/
863 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
864 {
865   PetscErrorCode ierr;
866 
867   PetscFunctionBegin;
868   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
869   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
870   if (sf->ops->GetLeafRanks) {
871     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
872   } else {
873     PetscSFType type;
874     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
875     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
876   }
877   PetscFunctionReturn(0);
878 }
879 
880 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
881   PetscInt i;
882   for (i=0; i<n; i++) {
883     if (needle == list[i]) return PETSC_TRUE;
884   }
885   return PETSC_FALSE;
886 }
887 
888 /*@C
889    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.
890 
891    Collective
892 
893    Input Arguments:
894 +  sf - PetscSF to set up; PetscSFSetGraph() must have been called
895 -  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)
896 
897    Level: developer
898 
899 .seealso: PetscSFGetRootRanks()
900 @*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
  /* Count leaves per remote rank: table maps (rank+1) -> number of leaves referencing that rank */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank (PetscTable cannot store key 0) */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  /* Flatten the table into parallel arrays: ranks[] (remote rank) and rcount[] (leaves per rank) */
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* Translate dgroup's members into ranks of the SF communicator (groupranks[]).
     We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRQ(ierr);}
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished.
     Two-pointer sweep: i scans backward for a distinguished rank, sf->ndranks scans forward for a
     non-distinguished one; when both are found out of place, swap them (keeping rcount[] paired). */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i; ) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for ( ; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition separately so PetscFindMPIInt() below can binary-search within a partition */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Build CSR-style offsets into rmine/rremote; rcount[] is reset to 0 for reuse as a fill cursor */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Bucket every leaf into its rank's segment of rmine/rremote */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit: reuse irank when consecutive leaves reference the same remote rank */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks: first the distinguished partition, then the rest */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
991 
992 /*@C
993    PetscSFGetGroups - gets incoming and outgoing process groups
994 
995    Collective
996 
997    Input Argument:
998 .  sf - star forest
999 
1000    Output Arguments:
1001 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1002 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1003 
1004    Level: developer
1005 
1006 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
1007 @*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  /* Groups are computed lazily and cached on the SF (freed by PetscSFDestroy) */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks: build a bootstrap SF with one leaf per referenced
       rank, all pointing at root 0 of that rank; its root degree is then the incoming-rank count */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
    /* Enumerate the incoming ranks: every process sends its own rank; the gather collects
       one entry per incoming edge into inranks[] */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  /* The outgoing group is simply the ranks this process references, already known from setup */
  if (sf->outgroup == MPI_GROUP_NULL) {
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1054 
1055 /*@
1056    PetscSFGetMultiSF - gets the inner SF implemeting gathers and scatters
1057 
1058    Collective
1059 
1060    Input Argument:
1061 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1062 
1063    Output Arguments:
1064 .  multi - star forest with split roots, such that each root has degree exactly 1
1065 
1066    Level: developer
1067 
1068    Notes:
1069 
1070    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1071    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1072    edge, it is a candidate for future optimization that might involve its removal.
1073 
1074 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1075 @*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    PetscFunctionReturn(0);
  }
  /* The multi-SF is computed lazily and cached; each original root of degree d becomes d
     multi-roots, so every multi-root has exactly one incoming edge */
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    /* inoffset[] is the prefix sum of indegree[]: the first multi-root slot owned by each root */
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    /* Fetch-and-add of 1 hands each leaf a distinct slot index (outoffset) at its root,
       while advancing the root-side counters in inoffset[] */
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
#if defined(PETSC_USE_DEBUG)                                 /* Check that the expected number of increments occurred */
    for (i=0; i<sf->nroots; i++) {
      if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
    }
#endif
    /* Each leaf now points at its private multi-root slot on the same remote rank */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      /* Reduce each leaf's owning rank into its multi-root slot so roots learn which rank fills each slot */
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Broadcast the rank-sorted slot numbers back to the leaves and rebuild the graph with them */
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1146 
1147 /*@C
1148    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1149 
1150    Collective
1151 
1152    Input Arguments:
1153 +  sf - original star forest
1154 .  nselected  - number of selected roots on this process
1155 -  selected   - indices of the selected roots on this process
1156 
1157    Output Arguments:
1158 .  newsf - new star forest
1159 
1160    Level: advanced
1161 
1162    Note:
1163    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1164    be done by calling PetscSFGetGraph().
1165 
1166 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1167 @*/
1168 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1169 {
1170   PetscInt          i,n,*roots,*rootdata,*leafdata,nroots,nleaves,connected_leaves,*new_ilocal;
1171   const PetscSFNode *iremote;
1172   PetscSFNode       *new_iremote;
1173   PetscSF           tmpsf;
1174   MPI_Comm          comm;
1175   PetscErrorCode    ierr;
1176 
1177   PetscFunctionBegin;
1178   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1179   PetscSFCheckGraphSet(sf,1);
1180   if (nselected) PetscValidPointer(selected,3);
1181   PetscValidPointer(newsf,4);
1182 
1183   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1184   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1185   /* Uniq selected[] and put results in roots[] */
1186   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1187   ierr = PetscMalloc1(nselected,&roots);CHKERRQ(ierr);
1188   ierr = PetscArraycpy(roots,selected,nselected);CHKERRQ(ierr);
1189   ierr = PetscSortedRemoveDupsInt(&nselected,roots);CHKERRQ(ierr);
1190   if (nselected && (roots[0] < 0 || roots[nselected-1] >= sf->nroots)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max root indices %D/%D are not in [0,%D)",roots[0],roots[nselected-1],sf->nroots);
1191 
1192   if (sf->ops->CreateEmbeddedSF) {
1193     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,roots,newsf);CHKERRQ(ierr);
1194   } else {
1195     /* A generic version of creating embedded sf. Note that we called PetscSFSetGraph() twice, which is certainly expensive */
1196     /* Find out which leaves (not leaf data items) are still connected to roots in the embedded sf */
1197     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,NULL,&iremote);CHKERRQ(ierr);
1198     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&tmpsf);CHKERRQ(ierr);
1199     ierr = PetscSFSetGraph(tmpsf,nroots,nleaves,NULL/*contiguous*/,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);CHKERRQ(ierr);
1200     ierr = PetscCalloc2(nroots,&rootdata,nleaves,&leafdata);CHKERRQ(ierr);
1201     for (i=0; i<nselected; ++i) rootdata[roots[i]] = 1;
1202     ierr = PetscSFBcastBegin(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1203     ierr = PetscSFBcastEnd(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1204     ierr = PetscSFDestroy(&tmpsf);CHKERRQ(ierr);
1205 
1206     /* Build newsf with leaves that are still connected */
1207     connected_leaves = 0;
1208     for (i=0; i<nleaves; ++i) connected_leaves += leafdata[i];
1209     ierr = PetscMalloc1(connected_leaves,&new_ilocal);CHKERRQ(ierr);
1210     ierr = PetscMalloc1(connected_leaves,&new_iremote);CHKERRQ(ierr);
1211     for (i=0, n=0; i<nleaves; ++i) {
1212       if (leafdata[i]) {
1213         new_ilocal[n]        = sf->mine ? sf->mine[i] : i;
1214         new_iremote[n].rank  = sf->remote[i].rank;
1215         new_iremote[n].index = sf->remote[i].index;
1216         ++n;
1217       }
1218     }
1219 
1220     if (n != connected_leaves) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"There is a size mismatch in the SF embedding, %d != %d",n,connected_leaves);
1221     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1222     ierr = PetscSFSetGraph(*newsf,nroots,connected_leaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1223     ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
1224   }
1225   ierr = PetscFree(roots);CHKERRQ(ierr);
1226   ierr = PetscSFSetUp(*newsf);CHKERRQ(ierr);
1227   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1228   PetscFunctionReturn(0);
1229 }
1230 
1231 /*@C
1232   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1233 
1234   Collective
1235 
1236   Input Arguments:
1237 + sf - original star forest
1238 . nselected  - number of selected leaves on this process
1239 - selected   - indices of the selected leaves on this process
1240 
1241   Output Arguments:
1242 .  newsf - new star forest
1243 
1244   Level: advanced
1245 
1246 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1247 @*/
1248 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1249 {
1250   const PetscSFNode *iremote;
1251   PetscSFNode       *new_iremote;
1252   const PetscInt    *ilocal;
1253   PetscInt          i,nroots,*leaves,*new_ilocal;
1254   MPI_Comm          comm;
1255   PetscErrorCode    ierr;
1256 
1257   PetscFunctionBegin;
1258   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1259   PetscSFCheckGraphSet(sf,1);
1260   if (nselected) PetscValidPointer(selected,3);
1261   PetscValidPointer(newsf,4);
1262 
1263   /* Uniq selected[] and put results in leaves[] */
1264   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1265   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1266   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1267   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1268   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1269 
1270   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1271   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1272     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1273   } else {
1274     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1275     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1276     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1277     for (i=0; i<nselected; ++i) {
1278       const PetscInt l     = leaves[i];
1279       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1280       new_iremote[i].rank  = iremote[l].rank;
1281       new_iremote[i].index = iremote[l].index;
1282     }
1283     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1284     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1285   }
1286   ierr = PetscFree(leaves);CHKERRQ(ierr);
1287   PetscFunctionReturn(0);
1288 }
1289 
1290 /*@C
1291    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1292 
1293    Collective on PetscSF
1294 
1295    Input Arguments:
1296 +  sf - star forest on which to communicate
1297 .  unit - data type associated with each node
1298 .  rootdata - buffer to broadcast
1299 -  op - operation to use for reduction
1300 
1301    Output Arguments:
1302 .  leafdata - buffer to be reduced with values from each leaf's respective root
1303 
1304    Level: intermediate
1305 
1306 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd()
1307 @*/
1308 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1309 {
1310   PetscErrorCode ierr;
1311   PetscMemType   rootmtype,leafmtype;
1312 
1313   PetscFunctionBegin;
1314   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1315   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1316   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1317   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1318   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1319 #if defined(PETSC_HAVE_CUDA)
1320   /*  Shall we assume rootdata, leafdata are ready to use (instead of being computed by some asynchronous kernels)?
1321     To be similar to MPI, I'd like to have this assumption, since MPI does not have a concept of stream.
1322     But currently this assumption is not enforecd in Petsc. To be safe, I do synchronization here. Otherwise, if
1323     we do not sync now and call the Pack kernel directly on the default NULL stream (assume petsc objects are also
1324     computed on it), we have to sync the NULL stream before calling MPI routines. So, it looks a cudaDeviceSynchronize
1325     is inevitable. We do it now and put pack/unpack kernels to non-NULL streams.
1326    */
1327   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1328 #endif
1329   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1330   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1331   PetscFunctionReturn(0);
1332 }
1333 
1334 /*@C
1335    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1336 
1337    Collective
1338 
1339    Input Arguments:
1340 +  sf - star forest
1341 .  unit - data type
1342 .  rootdata - buffer to broadcast
1343 -  op - operation to use for reduction
1344 
1345    Output Arguments:
1346 .  leafdata - buffer to be reduced with values from each leaf's respective root
1347 
1348    Level: intermediate
1349 
1350 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1351 @*/
1352 PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1353 {
1354   PetscErrorCode ierr;
1355   PetscMemType   rootmtype,leafmtype;
1356 
1357   PetscFunctionBegin;
1358   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1359   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1360   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1361   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1362   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1363   ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1364   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1365   PetscFunctionReturn(0);
1366 }
1367 
1368 /*@C
1369    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1370 
1371    Collective
1372 
1373    Input Arguments:
1374 +  sf - star forest
1375 .  unit - data type
1376 .  leafdata - values to reduce
1377 -  op - reduction operation
1378 
1379    Output Arguments:
1380 .  rootdata - result of reduction of values from all leaves of each root
1381 
1382    Level: intermediate
1383 
1384 .seealso: PetscSFBcastBegin()
1385 @*/
1386 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1387 {
1388   PetscErrorCode ierr;
1389   PetscMemType   rootmtype,leafmtype;
1390 
1391   PetscFunctionBegin;
1392   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1393   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1394   ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1395   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1396   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1397 #if defined(PETSC_HAVE_CUDA)
1398   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1399 #endif
1400   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1401   ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1402   PetscFunctionReturn(0);
1403 }
1404 
1405 /*@C
1406    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1407 
1408    Collective
1409 
1410    Input Arguments:
1411 +  sf - star forest
1412 .  unit - data type
1413 .  leafdata - values to reduce
1414 -  op - reduction operation
1415 
1416    Output Arguments:
1417 .  rootdata - result of reduction of values from all leaves of each root
1418 
1419    Level: intermediate
1420 
1421 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1422 @*/
1423 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1424 {
1425   PetscErrorCode ierr;
1426   PetscMemType   rootmtype,leafmtype;
1427 
1428   PetscFunctionBegin;
1429   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1430   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1431   ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1432   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1433   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1434   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1435   ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1436   PetscFunctionReturn(0);
1437 }
1438 
1439 /*@C
1440    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1441 
1442    Collective
1443 
1444    Input Arguments:
1445 +  sf - star forest
1446 .  unit - data type
1447 .  leafdata - leaf values to use in reduction
1448 -  op - operation to use for reduction
1449 
1450    Output Arguments:
1451 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1452 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1453 
1454    Level: advanced
1455 
1456    Note:
1457    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1458    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1459    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1460    integers.
1461 
1462 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1463 @*/
1464 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1465 {
1466   PetscErrorCode ierr;
1467   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1468 
1469   PetscFunctionBegin;
1470   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1471   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1472   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1473   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1474   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1475   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1476   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1477 #if defined(PETSC_HAVE_CUDA)
1478   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1479 #endif
1480   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1481   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1482   PetscFunctionReturn(0);
1483 }
1484 
/*@C
   PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Arguments:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

.seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscMemType   rootmtype,leafmtype,leafupdatemtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  /* Re-derive the memory types (same checks as in PetscSFFetchAndOpBegin) so the
     End implementation receives consistent placement information */
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
  if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
  ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1521 
/*@C
   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex; only valid after the matching PetscSFComputeDegreeEnd()

   Level: advanced

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFGatherBegin(), PetscSFComputeDegreeEnd()
@*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  /* The degree is computed once and cached on the SF; later calls reuse sf->degree */
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal;
    /* sf->degree non-NULL while !degreeknown means a Begin without a matching End */
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    maxlocal = sf->maxleaf-sf->minleaf+1;
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    /* Reduce a leafdata array of all ones into the roots: each root receives its leaf count.
       degreetmp is shifted by -minleaf so it can be indexed by absolute leaf numbers. */
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  /* The result is not available until PetscSFComputeDegreeEnd(); return NULL for now */
  *degree = NULL;
  PetscFunctionReturn(0);
}
1561 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: advanced

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFComputeDegreeBegin(), PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    /* degreetmp is allocated by PetscSFComputeDegreeBegin(); its absence means Begin was never called */
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    /* Same shifted pointer as in Begin so the reduce pair matches */
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    sf->degreeknown = PETSC_TRUE;
  }
  /* Hand back the cached array; ownership stays with the SF */
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1597 
1598 
/*@C
   PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
   Each multi-root is assigned index of the corresponding original root.

   Collective

   Input Arguments:
+  sf - star forest
-  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()

   Output Arguments:
+  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
-  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots

   Level: developer

   Notes:
   The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.

.seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
@*/
PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
{
  PetscSF             msf;
  PetscInt            i, j, k, nroots, nmroots;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
  if (nroots) PetscValidIntPointer(degree,2);
  if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
  PetscValidPointer(multiRootsOrigNumbering,4);
  ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
  ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
  /* Each original root i contributes degree[i] consecutive multi-roots; record i for each of them.
     k counts multi-roots emitted so far. */
  for (i=0,j=0,k=0; i<nroots; i++) {
    if (!degree[i]) continue; /* roots with no leaves have no multi-root */
    for (j=0; j<degree[i]; j++,k++) {
      (*multiRootsOrigNumbering)[k] = i;
    }
  }
#if defined(PETSC_USE_DEBUG)
  /* The sum of degrees must equal the number of multi-SF roots */
  if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
#endif
  if (nMultiRoots) *nMultiRoots = nmroots;
  PetscFunctionReturn(0);
}
1647 
1648 /*@C
1649    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1650 
1651    Collective
1652 
1653    Input Arguments:
1654 +  sf - star forest
1655 .  unit - data type
1656 -  leafdata - leaf data to gather to roots
1657 
1658    Output Argument:
1659 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1660 
1661    Level: intermediate
1662 
1663 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1664 @*/
1665 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1666 {
1667   PetscErrorCode ierr;
1668   PetscSF        multi;
1669 
1670   PetscFunctionBegin;
1671   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1672   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1673   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1674   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1675   PetscFunctionReturn(0);
1676 }
1677 
1678 /*@C
1679    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1680 
1681    Collective
1682 
1683    Input Arguments:
1684 +  sf - star forest
1685 .  unit - data type
1686 -  leafdata - leaf data to gather to roots
1687 
1688    Output Argument:
1689 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1690 
1691    Level: intermediate
1692 
1693 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1694 @*/
1695 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1696 {
1697   PetscErrorCode ierr;
1698   PetscSF        multi;
1699 
1700   PetscFunctionBegin;
1701   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1702   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1703   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1704   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1705   PetscFunctionReturn(0);
1706 }
1707 
1708 /*@C
1709    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1710 
1711    Collective
1712 
1713    Input Arguments:
1714 +  sf - star forest
1715 .  unit - data type
1716 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1717 
1718    Output Argument:
1719 .  leafdata - leaf data to be update with personal data from each respective root
1720 
1721    Level: intermediate
1722 
1723 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1724 @*/
1725 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1726 {
1727   PetscErrorCode ierr;
1728   PetscSF        multi;
1729 
1730   PetscFunctionBegin;
1731   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1732   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1733   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1734   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1735   PetscFunctionReturn(0);
1736 }
1737 
1738 /*@C
1739    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1740 
1741    Collective
1742 
1743    Input Arguments:
1744 +  sf - star forest
1745 .  unit - data type
1746 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1747 
1748    Output Argument:
1749 .  leafdata - leaf data to be update with personal data from each respective root
1750 
1751    Level: intermediate
1752 
1753 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1754 @*/
1755 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1756 {
1757   PetscErrorCode ierr;
1758   PetscSF        multi;
1759 
1760   PetscFunctionBegin;
1761   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1762   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1763   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1764   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1765   PetscFunctionReturn(0);
1766 }
1767 
1768 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1769 {
1770 #if defined(PETSC_USE_DEBUG)
1771   PetscInt        i, n, nleaves;
1772   const PetscInt *ilocal = NULL;
1773   PetscHSetI      seen;
1774   PetscErrorCode  ierr;
1775 
1776   PetscFunctionBegin;
1777   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1778   ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1779   for (i = 0; i < nleaves; i++) {
1780     const PetscInt leaf = ilocal ? ilocal[i] : i;
1781     ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1782   }
1783   ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1784   if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1785   ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1786   PetscFunctionReturn(0);
1787 #else
1788   PetscFunctionBegin;
1789   PetscFunctionReturn(0);
1790 #endif
1791 }
1792 
/*@
  PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots.

  sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
  nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
  Bcast on sfA, then a Bcast on sfB, on connected nodes.

.seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  /* Build an array indexed by sfB root number that carries sfA's remote (root) info for that
     point; entries with rank=-1 mark sfB roots that are not leaves of sfA (no path). When
     sfA's leaves are contiguous (localPointsA==NULL), remotePointsA is already in that layout. */
  if (localPointsA) {
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue; /* sfA leaf outside sfB's root space: no path */
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA;
  }
  /* Push sfA's root info down through sfB: afterwards, leafdataB[l-minleaf] holds, for each sfB
     leaf l, the sfA root it is (transitively) connected to, or rank=-1 if unconnected */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* denseB: sfB's leaves are contiguous and all of them are connected, so leafdataB can be
     used directly as the composite remote points without compaction */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    localPointsBA  = NULL;
    remotePointsBA = leafdataB; /* ownership of leafdataB transfers to the new SF below */
  } else {
    /* Compact: keep only connected leaves, recording their local indices */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1881 
/*@
  PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF.

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
  second SF must have a degree of 1, i.e., no roots have more than one leaf connected.

  sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
  roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
  a Reduce on sfB, on connected roots.

.seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
@*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
  MPI_Op            op;
#if defined(PETSC_USE_64BIT_INDICES)
  PetscBool         iswin;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);

  /* TODO: Check roots of sfB have degree of 1 */
  /* Once we implement it, we can replace the MPI_MAXLOC
     with MPIU_REPLACE. In that case, MPI_MAXLOC and MPIU_REPLACE have the same effect.
     We use MPI_MAXLOC only to have a deterministic output from this routine if
     the root condition is not meet.
   */
  op = MPI_MAXLOC;
#if defined(PETSC_USE_64BIT_INDICES)
  /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
  ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
  if (iswin) op = MPIU_REPLACE;
#endif

  /* Lay out sfA's remote (root) info by sfB leaf number, offset by minleaf; rank=-1 marks
     sfB leaves that are not leaves of sfA (no path through the shared leaf space) */
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue; /* outside sfB's leaf range */
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* localPointsA==NULL means sfA leaves are the contiguous range [0,numLeavesA) */
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  /* Initialize with the rank=-1 sentinel so unconnected sfB roots can be filtered below */
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Reduce sfA's root info up through sfB (i.e. along the inverse of sfB) so each sfB root
     learns which sfA root it connects to; the shifted pointer indexes by absolute leaf number */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place: keep only connected sfB roots as leaves of the composite SF */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1984 
1985 /*
1986   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
1987 
1988   Input Parameters:
1989 . sf - The global PetscSF
1990 
1991   Output Parameters:
1992 . out - The local PetscSF
1993  */
1994 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
1995 {
1996   MPI_Comm           comm;
1997   PetscMPIInt        myrank;
1998   const PetscInt     *ilocal;
1999   const PetscSFNode  *iremote;
2000   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
2001   PetscSFNode        *liremote;
2002   PetscSF            lsf;
2003   PetscErrorCode     ierr;
2004 
2005   PetscFunctionBegin;
2006   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2007   if (sf->ops->CreateLocalSF) {
2008     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
2009   } else {
2010     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2011     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
2012     ierr = MPI_Comm_rank(comm,&myrank);CHKERRQ(ierr);
2013 
2014     /* Find out local edges and build a local SF */
2015     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
2016     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
2017     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
2018     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
2019 
2020     for (i=j=0; i<nleaves; i++) {
2021       if (iremote[i].rank == (PetscInt)myrank) {
2022         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2023         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
2024         liremote[j].index = iremote[i].index;
2025         j++;
2026       }
2027     }
2028     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
2029     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
2030     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
2031     *out = lsf;
2032   }
2033   PetscFunctionReturn(0);
2034 }
2035 
/* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
{
  PetscErrorCode     ierr;
  PetscMemType       rootmtype,leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* NOTE(review): this routine logs under PETSCSF_BcastAndOpBegin for both begin and end,
     since it has no dedicated log event — presumably intentional; confirm if profiling
     accuracy matters here */
  ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  /* No generic fallback: only SF types implementing BcastToZero support this */
  if (sf->ops->BcastToZero) {
    ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
  ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2054 
2055