xref: /petsc/src/vec/is/sf/interface/sf.c (revision 60dd46ca9875962950ab5c8a8f4d94fdb4c0f129)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
5 #if defined(PETSC_USE_DEBUG)
6 #  define PetscSFCheckGraphSet(sf,arg) do {                          \
7     if (PetscUnlikely(!(sf)->graphset))                              \
8       SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
9   } while (0)
10 #else
11 #  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
12 #endif
13 
14 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",0};
15 
16 /*@
17    PetscSFCreate - create a star forest communication context
18 
19    Collective
20 
21    Input Arguments:
22 .  comm - communicator on which the star forest will operate
23 
24    Output Arguments:
25 .  sf - new star forest context
26 
27    Options Database Keys:
28 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
29 .  -sf_type window    -Use MPI-3 one-sided window for communication
30 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
31 
32    Level: intermediate
33 
34    Notes:
35    When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
36    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
37    SFs are optimized and they have better performance than general SFs.
38 
39 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
40 @*/
41 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
42 {
43   PetscErrorCode ierr;
44   PetscSF        b;
45 
46   PetscFunctionBegin;
47   PetscValidPointer(sf,2);
48   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
49 
50   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
51 
52   b->nroots    = -1;
53   b->nleaves   = -1;
54   b->minleaf   = PETSC_MAX_INT;
55   b->maxleaf   = PETSC_MIN_INT;
56   b->nranks    = -1;
57   b->rankorder = PETSC_TRUE;
58   b->ingroup   = MPI_GROUP_NULL;
59   b->outgroup  = MPI_GROUP_NULL;
60   b->graphset  = PETSC_FALSE;
61 
62   *sf = b;
63   PetscFunctionReturn(0);
64 }
65 
66 /*@
67    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
68 
69    Collective
70 
71    Input Arguments:
72 .  sf - star forest
73 
74    Level: advanced
75 
76 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
77 @*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the implementation (basic/window/neighbor/...) release its own state first */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Return the graph description to the same sentinel state as after PetscSFCreate() */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  /* mine/remote may alias user memory (PETSC_USE_POINTER mode); only the
     _alloc copies are owned by the SF and freed here (PetscFree(NULL) is a no-op) */
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  /* ranks/roffset/rmine/rremote were allocated together by one PetscMalloc4 */
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  /* Device-side mirror of rmine, if an implementation allocated one */
  if (sf->rmine_d) {cudaError_t err = cudaFree(sf->rmine_d);CHKERRCUDA(err);sf->rmine_d=NULL;}
#endif
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRQ(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRQ(ierr);}
  /* The multi-SF and the pattern layout are derived from the graph; drop them too */
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
  sf->setupcalled = PETSC_FALSE; /* a new graph will require a fresh PetscSFSetUp() */
  PetscFunctionReturn(0);
}
108 
109 /*@C
110    PetscSFSetType - Set the PetscSF communication implementation
111 
112    Collective on PetscSF
113 
114    Input Parameters:
115 +  sf - the PetscSF context
116 -  type - a known method
117 
118    Options Database Key:
119 .  -sf_type <type> - Sets the method; use -help for a list
   of available methods (for instance, basic, window, neighbor)
121 
122    Notes:
123    See "include/petscsf.h" for available methods (for instance)
124 +    PETSCSFWINDOW - MPI-2/3 one-sided
125 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
126 
127   Level: intermediate
128 
129 .seealso: PetscSFType, PetscSFCreate()
130 @*/
131 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
132 {
133   PetscErrorCode ierr,(*r)(PetscSF);
134   PetscBool      match;
135 
136   PetscFunctionBegin;
137   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
138   PetscValidCharPointer(type,2);
139 
140   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
141   if (match) PetscFunctionReturn(0);
142 
143   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
144   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
145   /* Destroy the previous PetscSF implementation context */
146   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
147   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
148   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
149   ierr = (*r)(sf);CHKERRQ(ierr);
150   PetscFunctionReturn(0);
151 }
152 
153 /*@C
154   PetscSFGetType - Get the PetscSF communication implementation
155 
156   Not Collective
157 
158   Input Parameter:
159 . sf  - the PetscSF context
160 
161   Output Parameter:
162 . type - the PetscSF type name
163 
164   Level: intermediate
165 
166 .seealso: PetscSFSetType(), PetscSFCreate()
167 @*/
168 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
169 {
170   PetscFunctionBegin;
171   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
172   PetscValidPointer(type,2);
173   *type = ((PetscObject)sf)->type_name;
174   PetscFunctionReturn(0);
175 }
176 
177 /*@
178    PetscSFDestroy - destroy star forest
179 
180    Collective
181 
182    Input Arguments:
183 .  sf - address of star forest
184 
185    Level: intermediate
186 
187 .seealso: PetscSFCreate(), PetscSFReset()
188 @*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0); /* destroying a NULL SF is a no-op */
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Drop one reference; only the last reference actually destroys the object */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  /* Release graph data first, then the implementation context, then the header itself
     (PetscHeaderDestroy also sets *sf to NULL) */
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
202 
203 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
204 {
205 #if defined(PETSC_USE_DEBUG)
206   PetscInt           i, nleaves;
207   PetscMPIInt        size;
208   const PetscInt    *ilocal;
209   const PetscSFNode *iremote;
210   PetscErrorCode     ierr;
211 
212   PetscFunctionBegin;
213   if (!sf->graphset) PetscFunctionReturn(0);
214   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
215   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
216   for (i = 0; i < nleaves; i++) {
217     const PetscInt rank = iremote[i].rank;
218     const PetscInt remote = iremote[i].index;
219     const PetscInt leaf = ilocal ? ilocal[i] : i;
220     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
221     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
222     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
223   }
224   PetscFunctionReturn(0);
225 #else
226   PetscFunctionBegin;
227   PetscFunctionReturn(0);
228 #endif
229 }
230 
231 /*@
232    PetscSFSetUp - set up communication structures
233 
234    Collective
235 
236    Input Arguments:
237 .  sf - star forest communication object
238 
239    Level: beginner
240 
241 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
242 @*/
PetscErrorCode PetscSFSetUp(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);               /* graph must exist before setup */
  if (sf->setupcalled) PetscFunctionReturn(0); /* idempotent: only the first call does work */
  /* Validate ranks/indices of the graph (active only in debug builds) */
  ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
  /* Default to the basic implementation if the user never chose a type */
  if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);}
  ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
  ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  sf->setupcalled = PETSC_TRUE;
  PetscFunctionReturn(0);
}
259 
260 /*@
261    PetscSFSetFromOptions - set PetscSF options using the options database
262 
263    Logically Collective
264 
265    Input Arguments:
266 .  sf - star forest
267 
268    Options Database Keys:
269 +  -sf_type - implementation type, see PetscSFSetType()
270 -  -sf_rank_order - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
271 
272    Level: intermediate
273 
274 .seealso: PetscSFWindowSetSyncType()
275 @*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;      /* default type shown when -sf_type is absent */
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* PetscObjectOptionsBegin() opens the options block and declares the
     PetscOptionsObject variable used further below */
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  /* Always (re)set the type so implementation-specific options below are available */
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
  /* Give the chosen implementation a chance to process its own options */
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
294 
295 /*@
296    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
297 
298    Logically Collective
299 
300    Input Arguments:
301 +  sf - star forest
302 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
303 
304    Level: advanced
305 
306 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
307 @*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
{

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveBool(sf,flg,2);
  /* The multi-SF is created lazily by gather/scatter and bakes in the ordering,
     so the flag cannot change once it exists */
  if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(0);
}
318 
319 /*@
320    PetscSFSetGraph - Set a parallel star forest
321 
322    Collective
323 
324    Input Arguments:
325 +  sf - star forest
326 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
327 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
328 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
329 during setup in debug mode)
330 .  localmode - copy mode for ilocal
331 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
332 during setup in debug mode)
333 -  remotemode - copy mode for iremote
334 
335    Level: intermediate
336 
337    Notes:
338     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
339 
340    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
341    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
342    needed
343 
344    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
345    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
346 
347 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
348 @*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  /* Discard any previously set graph (and all data derived from it) */
  ierr = PetscSFReset(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    /* One pass: compute the leaf range and detect the identity permutation [0,nleaves) */
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* An identity ilocal encodes contiguous storage; drop the array
         (freeing it if ownership was transferred to us) */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* Contiguous leaves: range is [0,nleaves-1] (empty range [0,-1] when nleaves==0) */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  /* Ownership convention: mine_alloc is non-NULL only when the SF owns the memory
     (COPY/OWN modes); mine always points at the live array */
  if (ilocal) {
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      /* Borrowed pointer: the caller keeps ownership; PetscSFReset() will not free it */
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same ownership rules for the remote root descriptors */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
429 
430 /*@
431   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
432 
433   Collective
434 
435   Input Parameters:
436 + sf      - The PetscSF
437 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
438 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
439 
440   Notes:
441   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
442   n and N are local and global sizes of x respectively.
443 
444   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
445   sequential vectors y on all ranks.
446 
447   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
448   sequential vector y on rank 0.
449 
450   In above cases, entries of x are roots and entries of y are leaves.
451 
452   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
453   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
454   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one can not send multiple items. One just needs to create a new MPI datatype for the multiple data
456   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
457 
458   In this case, roots and leaves are symmetric.
459 
460   Level: intermediate
461  @*/
462 PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
463 {
464   MPI_Comm       comm;
465   PetscInt       n,N,res[2];
466   PetscMPIInt    rank,size;
467   PetscSFType    type;
468   PetscErrorCode ierr;
469 
470   PetscFunctionBegin;
471   ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
472   if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
473   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
474   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
475 
476   if (pattern == PETSCSF_PATTERN_ALLTOALL) {
477     type = PETSCSFALLTOALL;
478     ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
479     ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
480     ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
481     ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
482   } else {
483     ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
484     ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
485     res[0] = n;
486     res[1] = -n;
487     /* Check if n are same over all ranks so that we can optimize it */
488     ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
489     if (res[0] == -res[1]) { /* same n */
490       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
491     } else {
492       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
493     }
494     ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
495   }
496   ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);
497 
498   sf->pattern = pattern;
499   sf->mine    = NULL; /* Contiguous */
500 
501   /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
502      Also set other easy stuff.
503    */
504   if (pattern == PETSCSF_PATTERN_ALLGATHER) {
505     sf->nleaves      = N;
506     sf->nroots       = n;
507     sf->nranks       = size;
508     sf->minleaf      = 0;
509     sf->maxleaf      = N - 1;
510   } else if (pattern == PETSCSF_PATTERN_GATHER) {
511     sf->nleaves      = rank ? 0 : N;
512     sf->nroots       = n;
513     sf->nranks       = rank ? 0 : size;
514     sf->minleaf      = 0;
515     sf->maxleaf      = rank ? -1 : N - 1;
516   } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
517     sf->nleaves      = size;
518     sf->nroots       = size;
519     sf->nranks       = size;
520     sf->minleaf      = 0;
521     sf->maxleaf      = size - 1;
522   }
523   sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
524   sf->graphset = PETSC_TRUE;
525   PetscFunctionReturn(0);
526 }
527 
528 /*@
529    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
530 
531    Collective
532 
533    Input Arguments:
534 
535 .  sf - star forest to invert
536 
537    Output Arguments:
538 .  isf - inverse of sf
539    Level: advanced
540 
541    Notes:
542    All roots must have degree 1.
543 
544    The local space may be a permutation, but cannot be sparse.
545 
546 .seealso: PetscSFSetGraph()
547 @*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Every candidate leaf slot names itself (rank,index) so roots learn who points at them */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* Roots start as (-1,-1) so unreferenced roots can be recognized afterwards */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  /* Push each leaf's identity to its root; with all root degrees == 1 (the
     documented precondition) REPLACE leaves exactly one (rank,index) per root */
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* dense: contiguous leaves in the inverse */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* newilocal ownership transfers to the new SF (OWN_POINTER);
     roots is copied (COPY_VALUES) so it can be freed below together with leaves */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
597 
598 /*@
599    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
600 
601    Collective
602 
603    Input Arguments:
604 +  sf - communication object to duplicate
605 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
606 
607    Output Arguments:
608 .  newsf - new communication object
609 
610    Level: beginner
611 
612 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
613 @*/
PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
{
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,opt,2);
  PetscValidPointer(newsf,3);
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
  /* Carry over the type if one was set (type may be NULL before PetscSFSetType) */
  ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
  if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf,1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graphs are deep-copied (COPY_VALUES for both arrays) */
      PetscInt          nroots,nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
      ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
    } else {
      /* Patterned graphs are reconstructed from the stored layout and pattern */
      ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
    }
  }
  /* Let the implementation copy any type-specific configuration */
  if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
641 
642 /*@C
643    PetscSFGetGraph - Get the graph specifying a parallel star forest
644 
645    Not Collective
646 
647    Input Arguments:
648 .  sf - star forest
649 
650    Output Arguments:
651 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
652 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
653 .  ilocal - locations of leaves in leafdata buffers
654 -  iremote - remote locations of root vertices for each leaf on the current process
655 
656    Notes:
657    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
658 
659    Level: intermediate
660 
661 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
662 @*/
663 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
664 {
665   PetscErrorCode ierr;
666 
667   PetscFunctionBegin;
668   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
669   if (sf->ops->GetGraph) {
670     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
671   } else {
672     if (nroots) *nroots = sf->nroots;
673     if (nleaves) *nleaves = sf->nleaves;
674     if (ilocal) *ilocal = sf->mine;
675     if (iremote) *iremote = sf->remote;
676   }
677   PetscFunctionReturn(0);
678 }
679 
680 /*@
681    PetscSFGetLeafRange - Get the active leaf ranges
682 
683    Not Collective
684 
685    Input Arguments:
686 .  sf - star forest
687 
688    Output Arguments:
689 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
690 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
691 
692    Level: developer
693 
694 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
695 @*/
696 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
697 {
698 
699   PetscFunctionBegin;
700   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
701   PetscSFCheckGraphSet(sf,1);
702   if (minleaf) *minleaf = sf->minleaf;
703   if (maxleaf) *maxleaf = sf->maxleaf;
704   PetscFunctionReturn(0);
705 }
706 
707 /*@C
708    PetscSFViewFromOptions - View from Options
709 
710    Collective on PetscSF
711 
712    Input Parameters:
+  A - the star forest
.  obj - Optional object
-  name - command line option
716 
717    Level: intermediate
718 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
719 @*/
720 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
721 {
722   PetscErrorCode ierr;
723 
724   PetscFunctionBegin;
725   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
726   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
727   PetscFunctionReturn(0);
728 }
729 
730 /*@C
731    PetscSFView - view a star forest
732 
733    Collective
734 
735    Input Arguments:
736 +  sf - star forest
737 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
738 
739    Level: beginner
740 
741 .seealso: PetscSFCreate(), PetscSFSetGraph()
742 @*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to stdout on the SF's communicator when no viewer is supplied */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Ensure rank/offset data exists before printing it, but only if a graph was set */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    /* Implementation-specific view first (e.g. window/neighbor details) */
    if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      /* Synchronized printing: each rank's output appears in rank order */
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      /* One line per leaf: local location <- (remote rank, remote index);
         NULL sf->mine means contiguous leaves, so the location is just i */
      for (i=0; i<sf->nleaves; i++) {
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Print per-rank edge lists sorted by remote rank; sort a copy so the
           SF's own rank array is left untouched */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          /* roffset[i]..roffset[i+1] delimits the edges going to rank i */
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
800 
801 /*@C
802    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
803 
804    Not Collective
805 
806    Input Arguments:
807 .  sf - star forest
808 
809    Output Arguments:
810 +  nranks - number of ranks referenced by local part
811 .  ranks - array of ranks
812 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
813 .  rmine - concatenated array holding local indices referencing each remote rank
814 -  rremote - concatenated array holding remote indices referenced for each remote rank
815 
816    Level: developer
817 
818 .seealso: PetscSFGetLeafRanks()
819 @*/
820 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
821 {
822   PetscErrorCode ierr;
823 
824   PetscFunctionBegin;
825   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
826   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
827   if (sf->ops->GetRootRanks) {
828     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
829   } else {
830     /* The generic implementation */
831     if (nranks)  *nranks  = sf->nranks;
832     if (ranks)   *ranks   = sf->ranks;
833     if (roffset) *roffset = sf->roffset;
834     if (rmine)   *rmine   = sf->rmine;
835     if (rremote) *rremote = sf->rremote;
836   }
837   PetscFunctionReturn(0);
838 }
839 
840 /*@C
841    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
842 
843    Not Collective
844 
845    Input Arguments:
846 .  sf - star forest
847 
848    Output Arguments:
849 +  niranks - number of leaf ranks referencing roots on this process
850 .  iranks - array of ranks
851 .  ioffset - offset in irootloc for each rank (length niranks+1)
852 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
853 
854    Level: developer
855 
856 .seealso: PetscSFGetRootRanks()
857 @*/
858 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
859 {
860   PetscErrorCode ierr;
861 
862   PetscFunctionBegin;
863   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
864   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
865   if (sf->ops->GetLeafRanks) {
866     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
867   } else {
868     PetscSFType type;
869     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
870     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
871   }
872   PetscFunctionReturn(0);
873 }
874 
875 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
876   PetscInt i;
877   for (i=0; i<n; i++) {
878     if (needle == list[i]) return PETSC_TRUE;
879   }
880   return PETSC_FALSE;
881 }
882 
/*@C
   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.

   Collective

   Input Arguments:
+  sf - PetscSF to set up; PetscSFSetGraph() must have been called
-  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)

   Level: developer

.seealso: PetscSFGetRootRanks()
@*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
  /* Count, per remote rank, how many of our leaves reference a root on that rank */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank (PetscTable cannot store key 0) */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  /* Flatten the table into parallel arrays: ranks[] (remote rank) and rcount[] (leaf count) */
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate every member of dgroup into its rank in sf's communicator, stored in groupranks[] */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRQ(ierr);}
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished,
     using a two-pointer swap scheme; rcount[] is swapped in lockstep */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i; ) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for ( ; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt    tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition by rank; rcount[] follows the same permutation */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* roffset[] is the prefix sum of rcount[]; rcount[] is then zeroed for reuse as fill cursors below */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Bucket every leaf into the segment of rmine[]/rremote[] owned by its root's rank */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit: consecutive leaves often reference the same rank, so reuse the last lookup */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks: distinguished ranks first, then the rest */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
986 
/*@C
   PetscSFGetGroups - gets incoming and outgoing process groups

   Collective

   Input Argument:
.  sf - star forest

   Output Arguments:
+  incoming - group of origin processes for incoming edges (leaves that reference my roots)
-  outgoing - group of destination processes for outgoing edges (roots that I reference)

   Level: developer

   Notes:
   The groups are computed on the first call and cached on the PetscSF; subsequent calls return the cached groups.

.seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
@*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining groups");
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks: build an auxiliary SF (bgcount) with a single local root
       and one leaf per referenced rank, so the root's degree equals the number of processes whose
       leaves reference roots here */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);

    /* Enumerate the incoming ranks: every referencing process gathers its own rank into our root's slots */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    /* Cache the incoming group on the SF */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) {
    /* The outgoing group is just the ranks this process references, already available in sf->ranks */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1050 
1051 /*@
1052    PetscSFGetMultiSF - gets the inner SF implemeting gathers and scatters
1053 
1054    Collective
1055 
1056    Input Argument:
1057 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1058 
1059    Output Arguments:
1060 .  multi - star forest with split roots, such that each root has degree exactly 1
1061 
1062    Level: developer
1063 
1064    Notes:
1065 
1066    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1067    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1068    edge, it is a candidate for future optimization that might involve its removal.
1069 
1070 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1071 @*/
1072 PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
1073 {
1074   PetscErrorCode ierr;
1075 
1076   PetscFunctionBegin;
1077   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1078   PetscValidPointer(multi,2);
1079   if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
1080     ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1081     *multi = sf->multi;
1082     PetscFunctionReturn(0);
1083   }
1084   if (!sf->multi) {
1085     const PetscInt *indegree;
1086     PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
1087     PetscSFNode    *remote;
1088     maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
1089     ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
1090     ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
1091     ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
1092     inoffset[0] = 0;
1093     for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
1094     for (i=0; i<maxlocal; i++) outones[i] = 1;
1095     ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1096     ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1097     for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
1098 #if 0
1099 #if defined(PETSC_USE_DEBUG)                                 /* Check that the expected number of increments occurred */
1100     for (i=0; i<sf->nroots; i++) {
1101       if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
1102     }
1103 #endif
1104 #endif
1105     ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
1106     for (i=0; i<sf->nleaves; i++) {
1107       remote[i].rank  = sf->remote[i].rank;
1108       remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
1109     }
1110     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1111     ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1112     if (sf->rankorder) {        /* Sort the ranks */
1113       PetscMPIInt rank;
1114       PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
1115       PetscSFNode *newremote;
1116       ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
1117       for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
1118       ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
1119       for (i=0; i<maxlocal; i++) outranks[i] = rank;
1120       ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
1121       ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
1122       /* Sort the incoming ranks at each vertex, build the inverse map */
1123       for (i=0; i<sf->nroots; i++) {
1124         PetscInt j;
1125         for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
1126         ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
1127         for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
1128       }
1129       ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
1130       ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
1131       ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
1132       for (i=0; i<sf->nleaves; i++) {
1133         newremote[i].rank  = sf->remote[i].rank;
1134         newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
1135       }
1136       ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1137       ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
1138     }
1139     ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
1140   }
1141   *multi = sf->multi;
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 /*@C
1146    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1147 
1148    Collective
1149 
1150    Input Arguments:
1151 +  sf - original star forest
1152 .  nselected  - number of selected roots on this process
1153 -  selected   - indices of the selected roots on this process
1154 
1155    Output Arguments:
1156 .  newsf - new star forest
1157 
1158    Level: advanced
1159 
1160    Note:
1161    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1162    be done by calling PetscSFGetGraph().
1163 
1164 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1165 @*/
1166 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1167 {
1168   PetscInt          i,n,*roots,*rootdata,*leafdata,nroots,nleaves,connected_leaves,*new_ilocal;
1169   const PetscSFNode *iremote;
1170   PetscSFNode       *new_iremote;
1171   PetscSF           tmpsf;
1172   MPI_Comm          comm;
1173   PetscErrorCode    ierr;
1174 
1175   PetscFunctionBegin;
1176   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1177   PetscSFCheckGraphSet(sf,1);
1178   if (nselected) PetscValidPointer(selected,3);
1179   PetscValidPointer(newsf,4);
1180 
1181   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1182 
1183   /* Uniq selected[] and put results in roots[] */
1184   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1185   ierr = PetscMalloc1(nselected,&roots);CHKERRQ(ierr);
1186   ierr = PetscArraycpy(roots,selected,nselected);CHKERRQ(ierr);
1187   ierr = PetscSortedRemoveDupsInt(&nselected,roots);CHKERRQ(ierr);
1188   if (nselected && (roots[0] < 0 || roots[nselected-1] >= sf->nroots)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max root indices %D/%D are not in [0,%D)",roots[0],roots[nselected-1],sf->nroots);
1189 
1190   if (sf->ops->CreateEmbeddedSF) {
1191     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,roots,newsf);CHKERRQ(ierr);
1192   } else {
1193     /* A generic version of creating embedded sf. Note that we called PetscSFSetGraph() twice, which is certainly expensive */
1194     /* Find out which leaves (not leaf data items) are still connected to roots in the embedded sf */
1195     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,NULL,&iremote);CHKERRQ(ierr);
1196     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&tmpsf);CHKERRQ(ierr);
1197     ierr = PetscSFSetGraph(tmpsf,nroots,nleaves,NULL/*contiguous*/,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);CHKERRQ(ierr);
1198     ierr = PetscCalloc2(nroots,&rootdata,nleaves,&leafdata);CHKERRQ(ierr);
1199     for (i=0; i<nselected; ++i) rootdata[roots[i]] = 1;
1200     ierr = PetscSFBcastBegin(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1201     ierr = PetscSFBcastEnd(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1202     ierr = PetscSFDestroy(&tmpsf);CHKERRQ(ierr);
1203 
1204     /* Build newsf with leaves that are still connected */
1205     connected_leaves = 0;
1206     for (i=0; i<nleaves; ++i) connected_leaves += leafdata[i];
1207     ierr = PetscMalloc1(connected_leaves,&new_ilocal);CHKERRQ(ierr);
1208     ierr = PetscMalloc1(connected_leaves,&new_iremote);CHKERRQ(ierr);
1209     for (i=0, n=0; i<nleaves; ++i) {
1210       if (leafdata[i]) {
1211         new_ilocal[n]        = sf->mine ? sf->mine[i] : i;
1212         new_iremote[n].rank  = sf->remote[i].rank;
1213         new_iremote[n].index = sf->remote[i].index;
1214         ++n;
1215       }
1216     }
1217 
1218     if (n != connected_leaves) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"There is a size mismatch in the SF embedding, %d != %d",n,connected_leaves);
1219     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1220     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1221     ierr = PetscSFSetGraph(*newsf,nroots,connected_leaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1222     ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
1223   }
1224   ierr = PetscFree(roots);CHKERRQ(ierr);
1225   PetscFunctionReturn(0);
1226 }
1227 
1228 /*@C
1229   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1230 
1231   Collective
1232 
1233   Input Arguments:
1234 + sf - original star forest
1235 . nselected  - number of selected leaves on this process
1236 - selected   - indices of the selected leaves on this process
1237 
1238   Output Arguments:
1239 .  newsf - new star forest
1240 
1241   Level: advanced
1242 
1243 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1244 @*/
1245 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1246 {
1247   const PetscSFNode *iremote;
1248   PetscSFNode       *new_iremote;
1249   const PetscInt    *ilocal;
1250   PetscInt          i,nroots,*leaves,*new_ilocal;
1251   MPI_Comm          comm;
1252   PetscErrorCode    ierr;
1253 
1254   PetscFunctionBegin;
1255   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1256   PetscSFCheckGraphSet(sf,1);
1257   if (nselected) PetscValidPointer(selected,3);
1258   PetscValidPointer(newsf,4);
1259 
1260   /* Uniq selected[] and put results in leaves[] */
1261   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1262   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1263   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1264   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1265   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1266 
1267   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1268   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1269     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1270   } else {
1271     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1272     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1273     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1274     for (i=0; i<nselected; ++i) {
1275       const PetscInt l     = leaves[i];
1276       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1277       new_iremote[i].rank  = iremote[l].rank;
1278       new_iremote[i].index = iremote[l].index;
1279     }
1280     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1281     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1282     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1283   }
1284   ierr = PetscFree(leaves);CHKERRQ(ierr);
1285   PetscFunctionReturn(0);
1286 }
1287 
1288 /*@C
1289    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1290 
1291    Collective on PetscSF
1292 
1293    Input Arguments:
1294 +  sf - star forest on which to communicate
1295 .  unit - data type associated with each node
1296 .  rootdata - buffer to broadcast
1297 -  op - operation to use for reduction
1298 
1299    Output Arguments:
1300 .  leafdata - buffer to be reduced with values from each leaf's respective root
1301 
1302    Level: intermediate
1303 
1304 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd()
1305 @*/
1306 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1307 {
1308   PetscErrorCode ierr;
1309   PetscMemType   rootmtype,leafmtype;
1310 
1311   PetscFunctionBegin;
1312   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1313   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1314   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1315   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1316   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1317 #if defined(PETSC_HAVE_CUDA)
1318   /*  Shall we assume rootdata, leafdata are ready to use (instead of being computed by some asynchronous kernels)?
1319     To be similar to MPI, I'd like to have this assumption, since MPI does not have a concept of stream.
1320     But currently this assumption is not enforecd in Petsc. To be safe, I do synchronization here. Otherwise, if
1321     we do not sync now and call the Pack kernel directly on the default NULL stream (assume petsc objects are also
1322     computed on it), we have to sync the NULL stream before calling MPI routines. So, it looks a cudaDeviceSynchronize
1323     is inevitable. We do it now and put pack/unpack kernels to non-NULL streams.
1324    */
1325   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1326 #endif
1327   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1328   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1329   PetscFunctionReturn(0);
1330 }
1331 
1332 /*@C
1333    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1334 
1335    Collective
1336 
1337    Input Arguments:
1338 +  sf - star forest
1339 .  unit - data type
1340 .  rootdata - buffer to broadcast
1341 -  op - operation to use for reduction
1342 
1343    Output Arguments:
1344 .  leafdata - buffer to be reduced with values from each leaf's respective root
1345 
1346    Level: intermediate
1347 
1348 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1349 @*/
1350 PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1351 {
1352   PetscErrorCode ierr;
1353   PetscMemType   rootmtype,leafmtype;
1354 
1355   PetscFunctionBegin;
1356   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1357   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1358   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1359   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1360   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1361   ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1362   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1363   PetscFunctionReturn(0);
1364 }
1365 
1366 /*@C
1367    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1368 
1369    Collective
1370 
1371    Input Arguments:
1372 +  sf - star forest
1373 .  unit - data type
1374 .  leafdata - values to reduce
1375 -  op - reduction operation
1376 
1377    Output Arguments:
1378 .  rootdata - result of reduction of values from all leaves of each root
1379 
1380    Level: intermediate
1381 
1382 .seealso: PetscSFBcastBegin()
1383 @*/
1384 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1385 {
1386   PetscErrorCode ierr;
1387   PetscMemType   rootmtype,leafmtype;
1388 
1389   PetscFunctionBegin;
1390   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1391   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1392   ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1393   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1394   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1395 #if defined(PETSC_HAVE_CUDA)
1396   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1397 #endif
1398   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1399   ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1400   PetscFunctionReturn(0);
1401 }
1402 
1403 /*@C
1404    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1405 
1406    Collective
1407 
1408    Input Arguments:
1409 +  sf - star forest
1410 .  unit - data type
1411 .  leafdata - values to reduce
1412 -  op - reduction operation
1413 
1414    Output Arguments:
1415 .  rootdata - result of reduction of values from all leaves of each root
1416 
1417    Level: intermediate
1418 
1419 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1420 @*/
1421 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1422 {
1423   PetscErrorCode ierr;
1424   PetscMemType   rootmtype,leafmtype;
1425 
1426   PetscFunctionBegin;
1427   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1428   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1429   ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1430   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1431   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1432   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1433   ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1434   PetscFunctionReturn(0);
1435 }
1436 
1437 /*@C
1438    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1439 
1440    Collective
1441 
1442    Input Arguments:
1443 +  sf - star forest
1444 .  unit - data type
1445 .  leafdata - leaf values to use in reduction
1446 -  op - operation to use for reduction
1447 
1448    Output Arguments:
1449 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1450 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1451 
1452    Level: advanced
1453 
1454    Note:
1455    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1456    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1457    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1458    integers.
1459 
1460 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1461 @*/
1462 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1463 {
1464   PetscErrorCode ierr;
1465   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1466 
1467   PetscFunctionBegin;
1468   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1469   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1470   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1471   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1472   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1473   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1474   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1475 #if defined(PETSC_HAVE_CUDA)
1476   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1477 #endif
1478   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1479   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1480   PetscFunctionReturn(0);
1481 }
1482 
/*@C
   PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Arguments:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

.seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscMemType   rootmtype,leafmtype,leafupdatemtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  /* Determine whether each buffer lives on the host or the device so the backend can pick matching code paths */
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
  /* No backend implements mixed host/device leafdata vs leafupdate; same check is made in PetscSFFetchAndOpBegin() */
  if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
  ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1519 
/*@C
   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: advanced

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
   The array is set to NULL here and only becomes valid after the matching PetscSFComputeDegreeEnd().

.seealso: PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal = sf->maxleaf+1;  /* TODO: We should use PetscSFGetLeafRange() */
    /* sf->degree doubles as the "a Begin is in flight" flag; it is reset only by PetscSFDestroy() once known */
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    /* Sum a 1 from every leaf into its root: after the End, sf->degree[r] is the number of leaves attached to root r */
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  /* Degrees are not valid until PetscSFComputeDegreeEnd(); return NULL so accidental early use fails fast */
  *degree = NULL;
  PetscFunctionReturn(0);
}
1558 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: developer

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFComputeDegreeBegin()
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    /* degreetmp is allocated by PetscSFComputeDegreeBegin(); its absence means Begin was never called */
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    /* Cache the result so later Begin/End pairs return the stored array without recomputing */
    sf->degreeknown = PETSC_TRUE;
  }
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1594 
1595 
1596 /*@C
1597    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1598    Each multi-root is assigned index of the corresponding original root.
1599 
1600    Collective
1601 
1602    Input Arguments:
1603 +  sf - star forest
1604 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1605 
1606    Output Arguments:
1607 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1608 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1609 
1610    Level: developer
1611 
1612    Notes:
1613    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1614 
1615 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1616 @*/
1617 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1618 {
1619   PetscSF             msf;
1620   PetscInt            i, j, k, nroots, nmroots;
1621   PetscErrorCode      ierr;
1622 
1623   PetscFunctionBegin;
1624   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1625   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1626   if (nroots) PetscValidIntPointer(degree,2);
1627   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1628   PetscValidPointer(multiRootsOrigNumbering,4);
1629   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1630   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1631   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1632   for (i=0,j=0,k=0; i<nroots; i++) {
1633     if (!degree[i]) continue;
1634     for (j=0; j<degree[i]; j++,k++) {
1635       (*multiRootsOrigNumbering)[k] = i;
1636     }
1637   }
1638 #if defined(PETSC_USE_DEBUG)
1639   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1640 #endif
1641   if (nMultiRoots) *nMultiRoots = nmroots;
1642   PetscFunctionReturn(0);
1643 }
1644 
1645 /*@C
1646    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1647 
1648    Collective
1649 
1650    Input Arguments:
1651 +  sf - star forest
1652 .  unit - data type
1653 -  leafdata - leaf data to gather to roots
1654 
1655    Output Argument:
1656 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1657 
1658    Level: intermediate
1659 
1660 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1661 @*/
1662 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1663 {
1664   PetscErrorCode ierr;
1665   PetscSF        multi;
1666 
1667   PetscFunctionBegin;
1668   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1669   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1670   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1671   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1672   PetscFunctionReturn(0);
1673 }
1674 
1675 /*@C
1676    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1677 
1678    Collective
1679 
1680    Input Arguments:
1681 +  sf - star forest
1682 .  unit - data type
1683 -  leafdata - leaf data to gather to roots
1684 
1685    Output Argument:
1686 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1687 
1688    Level: intermediate
1689 
1690 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1691 @*/
1692 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1693 {
1694   PetscErrorCode ierr;
1695   PetscSF        multi;
1696 
1697   PetscFunctionBegin;
1698   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1699   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1700   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1701   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1702   PetscFunctionReturn(0);
1703 }
1704 
1705 /*@C
1706    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1707 
1708    Collective
1709 
1710    Input Arguments:
1711 +  sf - star forest
1712 .  unit - data type
1713 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1714 
1715    Output Argument:
1716 .  leafdata - leaf data to be update with personal data from each respective root
1717 
1718    Level: intermediate
1719 
1720 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1721 @*/
1722 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1723 {
1724   PetscErrorCode ierr;
1725   PetscSF        multi;
1726 
1727   PetscFunctionBegin;
1728   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1729   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1730   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1731   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1732   PetscFunctionReturn(0);
1733 }
1734 
1735 /*@C
1736    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1737 
1738    Collective
1739 
1740    Input Arguments:
1741 +  sf - star forest
1742 .  unit - data type
1743 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1744 
1745    Output Argument:
1746 .  leafdata - leaf data to be update with personal data from each respective root
1747 
1748    Level: intermediate
1749 
1750 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1751 @*/
1752 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1753 {
1754   PetscErrorCode ierr;
1755   PetscSF        multi;
1756 
1757   PetscFunctionBegin;
1758   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1759   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1760   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1761   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1762   PetscFunctionReturn(0);
1763 }
1764 
1765 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1766 {
1767 #if defined(PETSC_USE_DEBUG)
1768   PetscInt        i, n, nleaves;
1769   const PetscInt *ilocal = NULL;
1770   PetscHSetI      seen;
1771   PetscErrorCode  ierr;
1772 
1773   PetscFunctionBegin;
1774   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1775   ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1776   for (i = 0; i < nleaves; i++) {
1777     const PetscInt leaf = ilocal ? ilocal[i] : i;
1778     ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1779   }
1780   ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1781   if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1782   ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1783   PetscFunctionReturn(0);
1784 #else
1785   PetscFunctionBegin;
1786   PetscFunctionReturn(0);
1787 #endif
1788 }
1789 
/*@
  PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots.

  sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
  nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
  Bcast on sfA, then a Bcast on sfB, on connected nodes.

.seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  /* Re-index sfA's remote points by sfB's root numbering so they can serve as rootdata in a Bcast over sfB.
     rank == -1 marks sfB roots that have no sfA leaf above them. */
  if (localPointsA) {
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue;
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA;
  }
  /* NOTE(review): when localPointsA is NULL, remotePointsA (indexed by contiguous sfA leaf 0..numLeavesA-1)
     is used directly as sfB root data; this appears to assume it covers all connected sfB roots — confirm */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  /* Each sfB leaf learns which sfA root (if any) sits above its sfB root; leafdataB-minleaf lets
     the buffer be indexed directly by leaf number */
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* denseB: every sfB leaf is connected to an sfA root AND sfB's leaves are contiguous, so the
     received buffer can be used as-is without compaction */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    localPointsBA  = NULL;
    remotePointsBA = leafdataB; /* ownership passes to sfBA via PETSC_OWN_POINTER below */
  } else {
    /* Compact: keep only leaves with a path up to an sfA root */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1878 
/*@
  PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF.

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
  second SF must have a degree of 1, i.e., no roots have more than one leaf connected.

  sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
  roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
  a Reduce on sfB, on connected roots.

.seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
@*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
  /* TODO: Check roots of sfB have degree of 1 */

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);
  /* Lay out sfA's remote points into a buffer indexable by sfB's leaf range; rank == -1 marks
     entries in the range that carry no sfA data */
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* Contiguous sfA leaves: leaf number is simply i */
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  /* Initialize to rank == -1 so roots of sfB that receive nothing are recognizable after the reduce */
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Once we implement the TODO above (check all roots of sfB have degree of 1), we can replace the MPI_MAXLOC
     with MPIU_REPLACE. In that case, MPI_MAXLOC and MPIU_REPLACE have the same effect.
   */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,MPI_MAXLOC);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,MPI_MAXLOC);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place: keep only sfB roots that received an sfA root, recording their index as the leaf */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1967 
/*
  PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF

  Input Parameters:
. sf - The global PetscSF

  Output Parameters:
. out - The local PetscSF
 */
PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
{
  MPI_Comm           comm;
  PetscMPIInt        myrank;
  const PetscInt     *ilocal;
  const PetscSFNode  *iremote;
  PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
  PetscSFNode        *liremote;
  PetscSF            lsf;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Defer to a type-specific implementation when the SF provides one */
  if (sf->ops->CreateLocalSF) {
    ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
  } else {
    /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
    ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(comm,&myrank);CHKERRQ(ierr);

    /* Find out local edges and build a local SF */
    ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
    /* First pass: count leaves whose root lives on this rank, so the buffers can be sized exactly */
    for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
    ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
    ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);

    /* Second pass: copy the intra-process edges, remapping the remote rank to 0 (self) */
    for (i=j=0; i<nleaves; i++) {
      if (iremote[i].rank == (PetscInt)myrank) {
        lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
        liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
        liremote[j].index = iremote[i].index;
        j++;
      }
    }
    ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
    *out = lsf;
  }
  PetscFunctionReturn(0);
}
2018 
2019 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2020 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
2021 {
2022   PetscErrorCode     ierr;
2023   PetscMemType       rootmtype,leafmtype;
2024 
2025   PetscFunctionBegin;
2026   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2027   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
2028   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2029   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
2030   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
2031   if (sf->ops->BcastToZero) {
2032     ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
2033   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
2034   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2035   PetscFunctionReturn(0);
2036 }
2037 
2038