xref: /petsc/src/vec/is/sf/interface/sf.c (revision 3ee9839e4674d2e0e9bcea975330f3458ebcb61e)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
#if defined(PETSC_USE_DEBUG)
/* Debug-only guard used at the top of routines that require a graph: raises an
   error if neither PetscSFSetGraph() nor PetscSFSetGraphWithPattern() has been
   called on the SF argument. Compiles to a no-op in optimized builds. */
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif
13 
/* String names for the PetscSFDuplicateOption enum, followed by the enum's type name
   and value prefix; the trailing 0 terminates the list (PETSc options-array convention) */
const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",0};
15 
16 /*@
17    PetscSFCreate - create a star forest communication context
18 
19    Collective
20 
21    Input Arguments:
22 .  comm - communicator on which the star forest will operate
23 
24    Output Arguments:
25 .  sf - new star forest context
26 
27    Options Database Keys:
28 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
29 .  -sf_type window    -Use MPI-3 one-sided window for communication
30 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
31 
32    Level: intermediate
33 
34    Notes:
   When one knows the communication graph is one of the predefined graphs, such as MPI_Alltoall, MPI_Allgatherv,
36    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
37    SFs are optimized and they have better performance than general SFs.
38 
39 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
40 @*/
41 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
42 {
43   PetscErrorCode ierr;
44   PetscSF        b;
45 
46   PetscFunctionBegin;
47   PetscValidPointer(sf,2);
48   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
49 
50   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
51 
52   b->nroots    = -1;
53   b->nleaves   = -1;
54   b->minleaf   = PETSC_MAX_INT;
55   b->maxleaf   = PETSC_MIN_INT;
56   b->nranks    = -1;
57   b->rankorder = PETSC_TRUE;
58   b->ingroup   = MPI_GROUP_NULL;
59   b->outgroup  = MPI_GROUP_NULL;
60   b->graphset  = PETSC_FALSE;
61 
62   *sf = b;
63   PetscFunctionReturn(0);
64 }
65 
66 /*@
67    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
68 
69    Collective
70 
71    Input Arguments:
72 .  sf - star forest
73 
74    Level: advanced
75 
76 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
77 @*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the implementation release its own communication state first */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Return graph metadata to the same not-set state PetscSFCreate() leaves it in */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  /* mine/remote may alias these owned buffers (see PetscSFSetGraph copy modes);
     only the *_alloc pointers are owned and freed here */
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  /* Device-side copy of rmine; freed and nulled so it can be rebuilt lazily */
  if (sf->rmine_d) {cudaError_t err = cudaFree(sf->rmine_d);CHKERRCUDA(err);sf->rmine_d=NULL;}
#endif
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRQ(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRQ(ierr);}
  /* The multi-SF and root layout are derived from the graph; drop them so they
     are rebuilt on demand for the next graph */
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
108 
109 /*@C
110    PetscSFSetType - Set the PetscSF communication implementation
111 
112    Collective on PetscSF
113 
114    Input Parameters:
115 +  sf - the PetscSF context
116 -  type - a known method
117 
118    Options Database Key:
119 .  -sf_type <type> - Sets the method; use -help for a list
120    of available methods (for instance, window, pt2pt, neighbor)
121 
122    Notes:
123    See "include/petscsf.h" for available methods (for instance)
124 +    PETSCSFWINDOW - MPI-2/3 one-sided
125 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
126 
127   Level: intermediate
128 
129 .seealso: PetscSFType, PetscSFCreate()
130 @*/
PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
{
  PetscErrorCode ierr,(*r)(PetscSF);
  PetscBool      match;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidCharPointer(type,2);

  /* Setting the current type again is a no-op */
  ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
  if (match) PetscFunctionReturn(0);

  /* Look up the registered constructor for the requested type */
  ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
  if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
  /* Destroy the previous PetscSF implementation context */
  if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
  /* Clear the old function table before the constructor installs its entries */
  ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
  ierr = (*r)(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
152 
153 /*@C
154   PetscSFGetType - Get the PetscSF communication implementation
155 
156   Not Collective
157 
158   Input Parameter:
159 . sf  - the PetscSF context
160 
161   Output Parameter:
162 . type - the PetscSF type name
163 
164   Level: intermediate
165 
166 .seealso: PetscSFSetType(), PetscSFCreate()
167 @*/
168 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
169 {
170   PetscFunctionBegin;
171   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
172   PetscValidPointer(type,2);
173   *type = ((PetscObject)sf)->type_name;
174   PetscFunctionReturn(0);
175 }
176 
177 /*@
178    PetscSFDestroy - destroy star forest
179 
180    Collective
181 
182    Input Arguments:
183 .  sf - address of star forest
184 
185    Level: intermediate
186 
187 .seealso: PetscSFCreate(), PetscSFReset()
188 @*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Destroying a NULL SF is allowed and does nothing */
  if (!*sf) PetscFunctionReturn(0);
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Decrement the reference count; only the last reference tears the object down */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  /* Let the implementation free any context not already released by Reset */
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
202 
/* Debug-only sanity check of the graph set on 'sf': each leaf's remote rank must
   lie in [0,size) and each remote index and leaf location must be non-negative.
   Compiles to a no-op when PETSC_USE_DEBUG is not defined. */
static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
{
#if defined(PETSC_USE_DEBUG)
  PetscInt           i, nleaves;
  PetscMPIInt        size;
  const PetscInt    *ilocal;
  const PetscSFNode *iremote;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* Nothing to validate before the graph is set */
  if (!sf->graphset) PetscFunctionReturn(0);
  ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
  for (i = 0; i < nleaves; i++) {
    const PetscInt rank = iremote[i].rank;
    const PetscInt remote = iremote[i].index;
    const PetscInt leaf = ilocal ? ilocal[i] : i; /* NULL ilocal means contiguous leaves */
    if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
    if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
    if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
  }
  PetscFunctionReturn(0);
#else
  PetscFunctionBegin;
  PetscFunctionReturn(0);
#endif
}
230 
231 /*@
232    PetscSFSetUp - set up communication structures
233 
234    Collective
235 
236    Input Arguments:
237 .  sf - star forest communication object
238 
239    Level: beginner
240 
241 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
242 @*/
PetscErrorCode PetscSFSetUp(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  /* Setup is idempotent; later calls return immediately */
  if (sf->setupcalled) PetscFunctionReturn(0);
  /* Validate ranks/indices (debug builds only) before building communication structures */
  ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
  /* Fall back to the default implementation if the user never chose a type */
  if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);}
  ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
  ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  sf->setupcalled = PETSC_TRUE;
  PetscFunctionReturn(0);
}
259 
260 /*@
261    PetscSFSetFromOptions - set PetscSF options using the options database
262 
263    Logically Collective
264 
265    Input Arguments:
266 .  sf - star forest
267 
268    Options Database Keys:
269 +  -sf_type - implementation type, see PetscSFSetType()
270 -  -sf_rank_order - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
271 
272    Level: intermediate
273 
274 .seealso: PetscSFWindowSetSyncType()
275 @*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Default to the current type if one is already set, otherwise PETSCSFBASIC */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  /* Always (re)set the type so an implementation is installed even without the option */
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
  /* PetscOptionsObject is declared by the PetscObjectOptionsBegin() macro above */
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
294 
295 /*@
296    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
297 
298    Logically Collective
299 
300    Input Arguments:
301 +  sf - star forest
302 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
303 
304    Level: advanced
305 
306 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
307 @*/
308 PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
309 {
310 
311   PetscFunctionBegin;
312   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
313   PetscValidLogicalCollectiveBool(sf,flg,2);
314   if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
315   sf->rankorder = flg;
316   PetscFunctionReturn(0);
317 }
318 
319 /*@
320    PetscSFSetGraph - Set a parallel star forest
321 
322    Collective
323 
324    Input Arguments:
325 +  sf - star forest
326 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
327 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
328 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
329 during setup in debug mode)
330 .  localmode - copy mode for ilocal
331 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
332 during setup in debug mode)
333 -  remotemode - copy mode for iremote
334 
335    Level: intermediate
336 
337    Notes:
338     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
339 
340    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
341    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
342    needed
343 
344    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
345    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
346 
347 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
348 @*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  /* Discard any previous graph and all derived data */
  ierr = PetscSFReset(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    /* Cache the leaf range and detect the identity permutation [0,nleaves) */
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* Identity ilocal encodes contiguous storage: drop it, freeing when owned */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* Contiguous leaves: range is [0,nleaves-1]; an empty graph yields [0,-1] */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  /* Take ownership of / copy / borrow the leaf-location array per localmode */
  if (ilocal) {
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      /* Caller keeps ownership; mine_alloc stays NULL so PetscSFReset() won't free it */
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same three copy modes for the remote (root) endpoints */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    /* Caller keeps ownership; remote_alloc stays NULL so PetscSFReset() won't free it */
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
429 
430 /*@
431   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
432 
433   Collective
434 
435   Input Parameters:
436 + sf      - The PetscSF
437 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
438 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
439 
440   Notes:
441   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
442   n and N are local and global sizes of x respectively.
443 
444   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
445   sequential vectors y on all ranks.
446 
447   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
448   sequential vector y on rank 0.
449 
450   In above cases, entries of x are roots and entries of y are leaves.
451 
452   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
453   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
454   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one can not send multiple items. One just needs to create a new MPI datatype for the multiple data
456   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
457 
458   In this case, roots and leaves are symmetric.
459 
460   Level: intermediate
461  @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    type = PETSCSFALLTOALL;
    /* map is ignored for ALLTOALL: build the canonical layout with 'size' roots per rank */
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      /* Uneven local sizes require the "v" (variable-count) variants */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    /* Every rank receives all N roots as leaves */
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves; other ranks have an empty leaf range [0,-1] */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* Every rank has 'size' leaves and 'size' roots */
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
527 
528 /*@
529    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
530 
531    Collective
532 
   Input Arguments:
.  sf - star forest to invert

   Output Arguments:
.  isf - inverse of sf

   Level: advanced
540 
541    Notes:
542    All roots must have degree 1.
543 
544    The local space may be a permutation, but cannot be sparse.
545 
546 .seealso: PetscSFSetGraph()
547 @*/
548 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
549 {
550   PetscErrorCode ierr;
551   PetscMPIInt    rank;
552   PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
553   const PetscInt *ilocal;
554   PetscSFNode    *roots,*leaves;
555 
556   PetscFunctionBegin;
557   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
558   PetscSFCheckGraphSet(sf,1);
559   PetscValidPointer(isf,2);
560 
561   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
562   maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
563 
564   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
565   ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
566   for (i=0; i<maxlocal; i++) {
567     leaves[i].rank  = rank;
568     leaves[i].index = i;
569   }
570   for (i=0; i <nroots; i++) {
571     roots[i].rank  = -1;
572     roots[i].index = -1;
573   }
574   ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
575   ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
576 
577   /* Check whether our leaves are sparse */
578   for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
579   if (count == nroots) newilocal = NULL;
580   else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
581     ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
582     for (i=0,count=0; i<nroots; i++) {
583       if (roots[i].rank >= 0) {
584         newilocal[count]   = i;
585         roots[count].rank  = roots[i].rank;
586         roots[count].index = roots[i].index;
587         count++;
588       }
589     }
590   }
591 
592   ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
593   ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
594   ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
595   PetscFunctionReturn(0);
596 }
597 
598 /*@
599    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
600 
601    Collective
602 
603    Input Arguments:
604 +  sf - communication object to duplicate
605 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
606 
607    Output Arguments:
608 .  newsf - new communication object
609 
610    Level: beginner
611 
612 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
613 @*/
PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
{
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,opt,2);
  PetscValidPointer(newsf,3);
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
  /* Propagate the type if one has been set on the source SF */
  ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
  if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf,1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graphs are deep-copied into the new SF */
      PetscInt          nroots,nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
      ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
    } else {
      /* Patterned graphs are reconstructed from the stored layout and pattern */
      ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
    }
  }
  /* Let the implementation copy any type-specific configuration */
  if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
641 
642 /*@C
643    PetscSFGetGraph - Get the graph specifying a parallel star forest
644 
645    Not Collective
646 
647    Input Arguments:
648 .  sf - star forest
649 
650    Output Arguments:
651 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
652 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
653 .  ilocal - locations of leaves in leafdata buffers
654 -  iremote - remote locations of root vertices for each leaf on the current process
655 
656    Notes:
657    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
658 
659    Level: intermediate
660 
661 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
662 @*/
663 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
664 {
665   PetscErrorCode ierr;
666 
667   PetscFunctionBegin;
668   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
669   if (sf->ops->GetGraph) {
670     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
671   } else {
672     if (nroots) *nroots = sf->nroots;
673     if (nleaves) *nleaves = sf->nleaves;
674     if (ilocal) *ilocal = sf->mine;
675     if (iremote) *iremote = sf->remote;
676   }
677   PetscFunctionReturn(0);
678 }
679 
680 /*@
681    PetscSFGetLeafRange - Get the active leaf ranges
682 
683    Not Collective
684 
685    Input Arguments:
686 .  sf - star forest
687 
688    Output Arguments:
689 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
690 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
691 
692    Level: developer
693 
694 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
695 @*/
696 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
697 {
698 
699   PetscFunctionBegin;
700   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
701   PetscSFCheckGraphSet(sf,1);
702   if (minleaf) *minleaf = sf->minleaf;
703   if (maxleaf) *maxleaf = sf->maxleaf;
704   PetscFunctionReturn(0);
705 }
706 
707 /*@C
708    PetscSFView - view a star forest
709 
710    Collective
711 
712    Input Arguments:
713 +  sf - star forest
714 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
715 
716    Level: beginner
717 
718 .seealso: PetscSFCreate(), PetscSFSetGraph()
719 @*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to stdout on the SF's communicator when no viewer is supplied */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Finish setup first (when possible) so rank data is available for viewing */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      /* One line per leaf: leaf location <- (remote rank, remote root index);
         a NULL sf->mine means contiguous leaf locations 0..nleaves-1 */
      for (i=0; i<sf->nleaves; i++) {
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Sort a copy of the rank list so output is ordered by rank number */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
777 
778 /*@C
779    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
780 
781    Not Collective
782 
783    Input Arguments:
784 .  sf - star forest
785 
786    Output Arguments:
787 +  nranks - number of ranks referenced by local part
788 .  ranks - array of ranks
789 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
790 .  rmine - concatenated array holding local indices referencing each remote rank
791 -  rremote - concatenated array holding remote indices referenced for each remote rank
792 
793    Level: developer
794 
795 .seealso: PetscSFGetLeafRanks()
796 @*/
797 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
798 {
799   PetscErrorCode ierr;
800 
801   PetscFunctionBegin;
802   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
803   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
804   if (sf->ops->GetRootRanks) {
805     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
806   } else {
807     /* The generic implementation */
808     if (nranks)  *nranks  = sf->nranks;
809     if (ranks)   *ranks   = sf->ranks;
810     if (roffset) *roffset = sf->roffset;
811     if (rmine)   *rmine   = sf->rmine;
812     if (rremote) *rremote = sf->rremote;
813   }
814   PetscFunctionReturn(0);
815 }
816 
817 /*@C
818    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
819 
820    Not Collective
821 
822    Input Arguments:
823 .  sf - star forest
824 
825    Output Arguments:
826 +  niranks - number of leaf ranks referencing roots on this process
827 .  iranks - array of ranks
828 .  ioffset - offset in irootloc for each rank (length niranks+1)
829 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
830 
831    Level: developer
832 
833 .seealso: PetscSFGetRootRanks()
834 @*/
835 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
836 {
837   PetscErrorCode ierr;
838 
839   PetscFunctionBegin;
840   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
841   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
842   if (sf->ops->GetLeafRanks) {
843     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
844   } else {
845     PetscSFType type;
846     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
847     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
848   }
849   PetscFunctionReturn(0);
850 }
851 
852 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
853   PetscInt i;
854   for (i=0; i<n; i++) {
855     if (needle == list[i]) return PETSC_TRUE;
856   }
857   return PETSC_FALSE;
858 }
859 
/*@C
   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.

   Collective

   Input Arguments:
+  sf - PetscSF to set up; PetscSFSetGraph() must have been called
-  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)

   Notes:
   Fills sf->nranks, sf->ranks (distinguished ranks first, then the rest, each section sorted
   ascending), sf->roffset, sf->rmine and sf->rremote from the graph set with PetscSFSetGraph().

   Level: developer

.seealso: PetscSFGetRootRanks()
@*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  /* Count the leaves pointing at each distinct remote rank using a hash table keyed on the rank */
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank (PetscTable cannot store key 0) */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  /* Drain the table into parallel arrays: ranks[] (which ranks) and rcount[] (leaves per rank) */
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate dgroup's members into rank numbers of sf's communicator, stored in groupranks[] */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRQ(ierr);}
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i; ) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for ( ; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt    tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each section (distinguished / non-distinguished) ascending by rank, keeping counts aligned */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Prefix-sum the counts into roffset[]; rcount[] is then reset and reused as a per-rank fill cursor */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Scatter each leaf's local and remote index into the per-rank segments of rmine[]/rremote[] */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit: reuse the last looked-up rank index while consecutive leaves target the same rank */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks (distinguished section first, then the rest) */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
963 
/*@C
   PetscSFGetGroups - gets incoming and outgoing process groups

   Collective

   Input Argument:
.  sf - star forest

   Output Arguments:
+  incoming - group of origin processes for incoming edges (leaves that reference my roots)
-  outgoing - group of destination processes for outgoing edges (roots that I reference)

   Notes:
   Both groups are computed lazily on first call and cached on the SF (sf->ingroup, sf->outgroup);
   subsequent calls return the cached handles.

   Level: developer

.seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
@*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining groups");
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks: build an auxiliary SF with one local root and one
       leaf per referenced rank, so indegree[0] counts the processes that reference me */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);

    /* Enumerate the incoming ranks: every leaf sends my rank number, so the gather at root 0
       collects the rank of each process that references me */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) {
    /* The outgoing group is just the ranks this process references, already listed in sf->ranks */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1027 
/*@
   PetscSFGetMultiSF - gets the inner SF implementing gathers and scatters

   Collective

   Input Argument:
.  sf - star forest that may contain roots with 0 or with more than 1 vertex

   Output Arguments:
.  multi - star forest with split roots, such that each root has degree exactly 1

   Level: developer

   Notes:
   The multi-SF is built lazily on first call and cached on the SF (sf->multi).

   In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
   directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
   edge, it is a candidate for future optimization that might involve its removal.

.seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
@*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    PetscFunctionReturn(0);
  }
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    /* inoffset[] is the prefix sum of root degrees: inoffset[i] is the first multi-root slot owned by root i */
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    /* Fetch-and-add of ones hands each leaf a distinct slot index within its root's segment */
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
#if 0
#if defined(PETSC_USE_DEBUG)                                 /* Check that the expected number of increments occurred */
    for (i=0; i<sf->nroots; i++) {
      if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
    }
#endif
#endif
    /* Retarget each leaf to the multi-root slot it was assigned above */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      /* Deposit the sending rank into each multi-root slot so roots can sort their incoming edges by rank */
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Broadcast each leaf's new (rank-sorted) slot index back to the leaves */
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      /* Replace the multi graph with the rank-sorted version */
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1121 
1122 /*@C
1123    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1124 
1125    Collective
1126 
1127    Input Arguments:
1128 +  sf - original star forest
1129 .  nselected  - number of selected roots on this process
1130 -  selected   - indices of the selected roots on this process
1131 
1132    Output Arguments:
1133 .  newsf - new star forest
1134 
1135    Level: advanced
1136 
1137    Note:
1138    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1139    be done by calling PetscSFGetGraph().
1140 
1141 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1142 @*/
1143 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1144 {
1145   PetscInt          i,n,*roots,*rootdata,*leafdata,nroots,nleaves,connected_leaves,*new_ilocal;
1146   const PetscSFNode *iremote;
1147   PetscSFNode       *new_iremote;
1148   PetscSF           tmpsf;
1149   MPI_Comm          comm;
1150   PetscErrorCode    ierr;
1151 
1152   PetscFunctionBegin;
1153   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1154   PetscSFCheckGraphSet(sf,1);
1155   if (nselected) PetscValidPointer(selected,3);
1156   PetscValidPointer(newsf,4);
1157 
1158   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1159 
1160   /* Uniq selected[] and put results in roots[] */
1161   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1162   ierr = PetscMalloc1(nselected,&roots);CHKERRQ(ierr);
1163   ierr = PetscArraycpy(roots,selected,nselected);CHKERRQ(ierr);
1164   ierr = PetscSortedRemoveDupsInt(&nselected,roots);CHKERRQ(ierr);
1165   if (nselected && (roots[0] < 0 || roots[nselected-1] >= sf->nroots)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max root indices %D/%D are not in [0,%D)",roots[0],roots[nselected-1],sf->nroots);
1166 
1167   if (sf->ops->CreateEmbeddedSF) {
1168     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,roots,newsf);CHKERRQ(ierr);
1169   } else {
1170     /* A generic version of creating embedded sf. Note that we called PetscSFSetGraph() twice, which is certainly expensive */
1171     /* Find out which leaves (not leaf data items) are still connected to roots in the embedded sf */
1172     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,NULL,&iremote);CHKERRQ(ierr);
1173     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&tmpsf);CHKERRQ(ierr);
1174     ierr = PetscSFSetGraph(tmpsf,nroots,nleaves,NULL/*contiguous*/,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);CHKERRQ(ierr);
1175     ierr = PetscCalloc2(nroots,&rootdata,nleaves,&leafdata);CHKERRQ(ierr);
1176     for (i=0; i<nselected; ++i) rootdata[roots[i]] = 1;
1177     ierr = PetscSFBcastBegin(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1178     ierr = PetscSFBcastEnd(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1179     ierr = PetscSFDestroy(&tmpsf);CHKERRQ(ierr);
1180 
1181     /* Build newsf with leaves that are still connected */
1182     connected_leaves = 0;
1183     for (i=0; i<nleaves; ++i) connected_leaves += leafdata[i];
1184     ierr = PetscMalloc1(connected_leaves,&new_ilocal);CHKERRQ(ierr);
1185     ierr = PetscMalloc1(connected_leaves,&new_iremote);CHKERRQ(ierr);
1186     for (i=0, n=0; i<nleaves; ++i) {
1187       if (leafdata[i]) {
1188         new_ilocal[n]        = sf->mine ? sf->mine[i] : i;
1189         new_iremote[n].rank  = sf->remote[i].rank;
1190         new_iremote[n].index = sf->remote[i].index;
1191         ++n;
1192       }
1193     }
1194 
1195     if (n != connected_leaves) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"There is a size mismatch in the SF embedding, %d != %d",n,connected_leaves);
1196     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1197     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1198     ierr = PetscSFSetGraph(*newsf,nroots,connected_leaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1199     ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
1200   }
1201   ierr = PetscFree(roots);CHKERRQ(ierr);
1202   PetscFunctionReturn(0);
1203 }
1204 
1205 /*@C
1206   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1207 
1208   Collective
1209 
1210   Input Arguments:
1211 + sf - original star forest
1212 . nselected  - number of selected leaves on this process
1213 - selected   - indices of the selected leaves on this process
1214 
1215   Output Arguments:
1216 .  newsf - new star forest
1217 
1218   Level: advanced
1219 
1220 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1221 @*/
1222 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1223 {
1224   const PetscSFNode *iremote;
1225   PetscSFNode       *new_iremote;
1226   const PetscInt    *ilocal;
1227   PetscInt          i,nroots,*leaves,*new_ilocal;
1228   MPI_Comm          comm;
1229   PetscErrorCode    ierr;
1230 
1231   PetscFunctionBegin;
1232   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1233   PetscSFCheckGraphSet(sf,1);
1234   if (nselected) PetscValidPointer(selected,3);
1235   PetscValidPointer(newsf,4);
1236 
1237   /* Uniq selected[] and put results in leaves[] */
1238   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1239   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1240   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1241   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1242   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1243 
1244   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1245   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1246     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1247   } else {
1248     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1249     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1250     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1251     for (i=0; i<nselected; ++i) {
1252       const PetscInt l     = leaves[i];
1253       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1254       new_iremote[i].rank  = iremote[l].rank;
1255       new_iremote[i].index = iremote[l].index;
1256     }
1257     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1258     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1259     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1260   }
1261   ierr = PetscFree(leaves);CHKERRQ(ierr);
1262   PetscFunctionReturn(0);
1263 }
1264 
/*@C
   PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()

   Collective on PetscSF

   Input Arguments:
+  sf - star forest on which to communicate
.  unit - data type associated with each node
.  rootdata - buffer to broadcast
-  op - operation to use for reduction

   Output Arguments:
.  leafdata - buffer to be reduced with values from each leaf's respective root

   Level: intermediate

.seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd()
@*/
PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscMemType   rootmtype,leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  /* Detect whether each buffer is host or device memory so the implementation can choose pack/unpack paths */
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  /*  Shall we assume rootdata, leafdata are ready to use (instead of being computed by some asynchronous kernels)?
    To be similar to MPI, I'd like to have this assumption, since MPI does not have a concept of stream.
    But currently this assumption is not enforced in Petsc. To be safe, I do synchronization here. Otherwise, if
    we do not sync now and call the Pack kernel directly on the default NULL stream (assume petsc objects are also
    computed on it), we have to sync the NULL stream before calling MPI routines. So, it looks a cudaDeviceSynchronize
    is inevitable. We do it now and put pack/unpack kernels to non-NULL streams.
   */
  if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
#endif
  /* Dispatch to the implementation; the matching End call completes the communication */
  ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1308 
1309 /*@C
1310    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1311 
1312    Collective
1313 
1314    Input Arguments:
1315 +  sf - star forest
1316 .  unit - data type
1317 .  rootdata - buffer to broadcast
1318 -  op - operation to use for reduction
1319 
1320    Output Arguments:
1321 .  leafdata - buffer to be reduced with values from each leaf's respective root
1322 
1323    Level: intermediate
1324 
1325 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1326 @*/
1327 PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1328 {
1329   PetscErrorCode ierr;
1330   PetscMemType   rootmtype,leafmtype;
1331 
1332   PetscFunctionBegin;
1333   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1334   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1335   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1336   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1337   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1338   ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1339   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1340   PetscFunctionReturn(0);
1341 }
1342 
1343 /*@C
1344    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1345 
1346    Collective
1347 
1348    Input Arguments:
1349 +  sf - star forest
1350 .  unit - data type
1351 .  leafdata - values to reduce
1352 -  op - reduction operation
1353 
1354    Output Arguments:
1355 .  rootdata - result of reduction of values from all leaves of each root
1356 
1357    Level: intermediate
1358 
1359 .seealso: PetscSFBcastBegin()
1360 @*/
1361 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1362 {
1363   PetscErrorCode ierr;
1364   PetscMemType   rootmtype,leafmtype;
1365 
1366   PetscFunctionBegin;
1367   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1368   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1369   ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1370   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1371   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1372 #if defined(PETSC_HAVE_CUDA)
1373   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1374 #endif
1375   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1376   ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 /*@C
1381    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1382 
1383    Collective
1384 
1385    Input Arguments:
1386 +  sf - star forest
1387 .  unit - data type
1388 .  leafdata - values to reduce
1389 -  op - reduction operation
1390 
1391    Output Arguments:
1392 .  rootdata - result of reduction of values from all leaves of each root
1393 
1394    Level: intermediate
1395 
1396 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1397 @*/
1398 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1399 {
1400   PetscErrorCode ierr;
1401   PetscMemType   rootmtype,leafmtype;
1402 
1403   PetscFunctionBegin;
1404   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1405   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1406   ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1407   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1408   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1409   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1410   ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1411   PetscFunctionReturn(0);
1412 }
1413 
/*@C
   PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Arguments:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

   Note:
   The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
   might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
   not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
   integers.

.seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscMemType   rootmtype,leafmtype,leafupdatemtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  /* Determine where each buffer lives; leafdata and leafupdate must reside in the same memory space */
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
  if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
#if defined(PETSC_HAVE_CUDA)
  /* Ensure asynchronously produced device buffers are complete before packing; see note in PetscSFBcastAndOpBegin() */
  if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
#endif
  ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1459 
1460 /*@C
1461    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1462 
1463    Collective
1464 
1465    Input Arguments:
1466 +  sf - star forest
1467 .  unit - data type
1468 .  leafdata - leaf values to use in reduction
1469 -  op - operation to use for reduction
1470 
1471    Output Arguments:
1472 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1473 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1474 
1475    Level: advanced
1476 
1477 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1478 @*/
1479 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1480 {
1481   PetscErrorCode ierr;
1482   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1483 
1484   PetscFunctionBegin;
1485   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1486   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1487   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1488   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1489   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1490   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1491   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1492   ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1493   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1494   PetscFunctionReturn(0);
1495 }
1496 
/*@C
   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: advanced

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

   *degree is set to NULL here; the array only becomes available after the matching
   PetscSFComputeDegreeEnd().

.seealso: PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    /* Degrees not cached yet: start a reduction of all-ones leaf data; the sum arriving at each root is its degree */
    PetscInt i, nroots = sf->nroots, maxlocal = sf->maxleaf+1;  /* TODO: We should use PetscSFGetLeafRange() */
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  /* Result is not valid until the End call; hand back NULL in the meantime */
  *degree = NULL;
  PetscFunctionReturn(0);
}
1535 
1536 /*@C
1537    PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()
1538 
1539    Collective
1540 
1541    Input Arguments:
1542 .  sf - star forest
1543 
1544    Output Arguments:
1545 .  degree - degree of each root vertex
1546 
1547    Level: developer
1548 
1549    Notes:
1550    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1551 
1552 .seealso:
1553 @*/
1554 PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
1555 {
1556   PetscErrorCode ierr;
1557 
1558   PetscFunctionBegin;
1559   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1560   PetscSFCheckGraphSet(sf,1);
1561   PetscValidPointer(degree,2);
1562   if (!sf->degreeknown) {
1563     if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
1564     ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
1565     ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
1566     sf->degreeknown = PETSC_TRUE;
1567   }
1568   *degree = sf->degree;
1569   PetscFunctionReturn(0);
1570 }
1571 
1572 
1573 /*@C
1574    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1575    Each multi-root is assigned index of the corresponding original root.
1576 
1577    Collective
1578 
1579    Input Arguments:
1580 +  sf - star forest
1581 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1582 
1583    Output Arguments:
1584 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1585 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1586 
1587    Level: developer
1588 
1589    Notes:
1590    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1591 
1592 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1593 @*/
1594 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1595 {
1596   PetscSF             msf;
1597   PetscInt            i, j, k, nroots, nmroots;
1598   PetscErrorCode      ierr;
1599 
1600   PetscFunctionBegin;
1601   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1602   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1603   if (nroots) PetscValidIntPointer(degree,2);
1604   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1605   PetscValidPointer(multiRootsOrigNumbering,4);
1606   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1607   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1608   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1609   for (i=0,j=0,k=0; i<nroots; i++) {
1610     if (!degree[i]) continue;
1611     for (j=0; j<degree[i]; j++,k++) {
1612       (*multiRootsOrigNumbering)[k] = i;
1613     }
1614   }
1615 #if defined(PETSC_USE_DEBUG)
1616   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1617 #endif
1618   if (nMultiRoots) *nMultiRoots = nmroots;
1619   PetscFunctionReturn(0);
1620 }
1621 
1622 /*@C
1623    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1624 
1625    Collective
1626 
1627    Input Arguments:
1628 +  sf - star forest
1629 .  unit - data type
1630 -  leafdata - leaf data to gather to roots
1631 
1632    Output Argument:
1633 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1634 
1635    Level: intermediate
1636 
1637 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1638 @*/
1639 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1640 {
1641   PetscErrorCode ierr;
1642   PetscSF        multi;
1643 
1644   PetscFunctionBegin;
1645   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1646   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1647   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1648   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1649   PetscFunctionReturn(0);
1650 }
1651 
1652 /*@C
1653    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1654 
1655    Collective
1656 
1657    Input Arguments:
1658 +  sf - star forest
1659 .  unit - data type
1660 -  leafdata - leaf data to gather to roots
1661 
1662    Output Argument:
1663 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1664 
1665    Level: intermediate
1666 
1667 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1668 @*/
1669 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1670 {
1671   PetscErrorCode ierr;
1672   PetscSF        multi;
1673 
1674   PetscFunctionBegin;
1675   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1676   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1677   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1678   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1679   PetscFunctionReturn(0);
1680 }
1681 
1682 /*@C
1683    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1684 
1685    Collective
1686 
1687    Input Arguments:
1688 +  sf - star forest
1689 .  unit - data type
1690 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1691 
1692    Output Argument:
1693 .  leafdata - leaf data to be update with personal data from each respective root
1694 
1695    Level: intermediate
1696 
1697 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1698 @*/
1699 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1700 {
1701   PetscErrorCode ierr;
1702   PetscSF        multi;
1703 
1704   PetscFunctionBegin;
1705   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1706   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1707   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1708   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1709   PetscFunctionReturn(0);
1710 }
1711 
1712 /*@C
1713    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1714 
1715    Collective
1716 
1717    Input Arguments:
1718 +  sf - star forest
1719 .  unit - data type
1720 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1721 
1722    Output Argument:
1723 .  leafdata - leaf data to be update with personal data from each respective root
1724 
1725    Level: intermediate
1726 
1727 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1728 @*/
1729 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1730 {
1731   PetscErrorCode ierr;
1732   PetscSF        multi;
1733 
1734   PetscFunctionBegin;
1735   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1736   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1737   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1738   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1739   PetscFunctionReturn(0);
1740 }
1741 
1742 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1743 {
1744 #if defined(PETSC_USE_DEBUG)
1745   PetscInt        i, n, nleaves;
1746   const PetscInt *ilocal = NULL;
1747   PetscHSetI      seen;
1748   PetscErrorCode  ierr;
1749 
1750   PetscFunctionBegin;
1751   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1752   ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1753   for (i = 0; i < nleaves; i++) {
1754     const PetscInt leaf = ilocal ? ilocal[i] : i;
1755     ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1756   }
1757   ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1758   if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1759   ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1760   PetscFunctionReturn(0);
1761 #else
1762   PetscFunctionBegin;
1763   PetscFunctionReturn(0);
1764 #endif
1765 }
1766 /*@
1767   PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view
1768 
1769   Input Parameters:
1770 + sfA - The first PetscSF, whose local space may be a permutation, but can not be sparse.
1771 - sfB - The second PetscSF, whose number of roots must be equal to number of leaves of sfA on each processor
1772 
1773   Output Parameters:
1774 . sfBA - The composite SF. Doing a Bcast on the new SF is equvalent to doing Bcast on sfA, then Bcast on sfB
1775 
1776   Level: developer
1777 
1778   Notes:
1779   For the resulting composed SF to be valid, the input SFs must be true star forests: the leaves must be unique. This is
1780   checked in debug mode.
1781 
1782 .seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
1783 @*/
1784 PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
1785 {
1786   PetscErrorCode    ierr;
1787   MPI_Comm          comm;
1788   const PetscSFNode *remotePointsA,*remotePointsB;
1789   PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
1790   const PetscInt    *localPointsA,*localPointsB,*localPointsBA;
1791   PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf;
1792 
1793   PetscFunctionBegin;
1794   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
1795   PetscSFCheckGraphSet(sfA,1);
1796   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
1797   PetscSFCheckGraphSet(sfB,2);
1798   PetscValidPointer(sfBA,3);
1799   ierr = PetscObjectGetComm((PetscObject)sfA,&comm);CHKERRQ(ierr);
1800   ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
1801   ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
1802   ierr = PetscSFGetLeafRange(sfA,&minleaf,&maxleaf);CHKERRQ(ierr);
1803   if (numRootsB != numLeavesA) SETERRQ(comm,PETSC_ERR_ARG_INCOMP,"The second SF's number of roots must be equal to the first SF's number of leaves");
1804   if (maxleaf+1 != numLeavesA || minleaf) SETERRQ(comm,PETSC_ERR_ARG_INCOMP,"The first SF can not have sparse local space");
1805   /* The above check is fast, but not sufficient, since we cannot guarantee that the SF has unique leaves. So in debug
1806    mode, check properly. */
1807   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
1808   if (localPointsA) {
1809     /* Local space is dense permutation of identity. Need to rewire order of the remote points */
1810     ierr = PetscMalloc1(numLeavesA,&reorderedRemotePointsA);CHKERRQ(ierr);
1811     for (i=0; i<numLeavesA; i++) reorderedRemotePointsA[localPointsA[i]-minleaf] = remotePointsA[i];
1812     remotePointsA = reorderedRemotePointsA;
1813   }
1814   ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
1815   ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
1816   ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
1817   ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
1818   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
1819 
1820   /* sfB's leaves must be unique, otherwise BcastAndOp(B o A) != BcastAndOp(B) o BcastAndOp(A) */
1821   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
1822   if (minleaf == 0 && maxleaf + 1 == numLeavesB) { /* Local space of sfB is an identity or permutation */
1823     localPointsBA  = NULL;
1824     remotePointsBA = leafdataB;
1825   } else {
1826     localPointsBA  = localPointsB;
1827     ierr = PetscMalloc1(numLeavesB,&remotePointsBA);CHKERRQ(ierr);
1828     for (i=0; i<numLeavesB; i++) remotePointsBA[i] = leafdataB[localPointsB[i]-minleaf];
1829     ierr = PetscFree(leafdataB);CHKERRQ(ierr);
1830   }
1831   ierr = PetscSFCreate(comm, sfBA);CHKERRQ(ierr);
1832   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesB,localPointsBA,PETSC_COPY_VALUES,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
1833   PetscFunctionReturn(0);
1834 }
1835 
1836 /*@
1837   PetscSFComposeInverse - Compose a new PetscSF by putting inverse of the second SF under the first one
1838 
1839   Input Parameters:
1840 + sfA - The first PetscSF, whose local space may be a permutation, but can not be sparse.
1841 - sfB - The second PetscSF, whose number of leaves must be equal to number of leaves of sfA on each processor. All roots must have degree 1.
1842 
1843   Output Parameters:
1844 . sfBA - The composite SF. Doing a Bcast on the new SF is equvalent to doing Bcast on sfA, then Bcast on inverse of sfB
1845 
1846   Level: developer
1847 
1848 .seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph()
1849 @*/
1850 PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
1851 {
1852   PetscErrorCode    ierr;
1853   MPI_Comm          comm;
1854   const PetscSFNode *remotePointsA,*remotePointsB;
1855   PetscSFNode       *remotePointsBA;
1856   const PetscInt    *localPointsA,*localPointsB;
1857   PetscInt          numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf;
1858 
1859   PetscFunctionBegin;
1860   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
1861   PetscSFCheckGraphSet(sfA,1);
1862   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
1863   PetscSFCheckGraphSet(sfB,2);
1864   PetscValidPointer(sfBA,3);
1865   ierr = PetscObjectGetComm((PetscObject) sfA, &comm);CHKERRQ(ierr);
1866   ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
1867   ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);
1868   ierr = PetscSFGetLeafRange(sfA,&minleaf,&maxleaf);CHKERRQ(ierr);
1869   if (maxleaf+1-minleaf != numLeavesA) SETERRQ(comm,PETSC_ERR_ARG_INCOMP,"The first SF can not have sparse local space");
1870   if (numLeavesA != numLeavesB) SETERRQ(comm,PETSC_ERR_ARG_INCOMP,"The second SF's number of leaves must be equal to the first SF's number of leaves");
1871   ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
1872   ierr = PetscSFReduceBegin(sfB,MPIU_2INT,remotePointsA,remotePointsBA,MPIU_REPLACE);CHKERRQ(ierr);
1873   ierr = PetscSFReduceEnd(sfB,MPIU_2INT,remotePointsA,remotePointsBA,MPIU_REPLACE);CHKERRQ(ierr);
1874   ierr = PetscSFCreate(comm,sfBA);CHKERRQ(ierr);
1875   ierr = PetscSFSetGraph(*sfBA,numRootsA,numRootsB,NULL,PETSC_COPY_VALUES,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
1876   PetscFunctionReturn(0);
1877 }
1878 
1879 /*
1880   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
1881 
1882   Input Parameters:
1883 . sf - The global PetscSF
1884 
1885   Output Parameters:
1886 . out - The local PetscSF
1887  */
1888 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
1889 {
1890   MPI_Comm           comm;
1891   PetscMPIInt        myrank;
1892   const PetscInt     *ilocal;
1893   const PetscSFNode  *iremote;
1894   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
1895   PetscSFNode        *liremote;
1896   PetscSF            lsf;
1897   PetscErrorCode     ierr;
1898 
1899   PetscFunctionBegin;
1900   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1901   if (sf->ops->CreateLocalSF) {
1902     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
1903   } else {
1904     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
1905     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1906     ierr = MPI_Comm_rank(comm,&myrank);CHKERRQ(ierr);
1907 
1908     /* Find out local edges and build a local SF */
1909     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1910     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
1911     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
1912     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
1913 
1914     for (i=j=0; i<nleaves; i++) {
1915       if (iremote[i].rank == (PetscInt)myrank) {
1916         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
1917         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
1918         liremote[j].index = iremote[i].index;
1919         j++;
1920       }
1921     }
1922     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
1923     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1924     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
1925     *out = lsf;
1926   }
1927   PetscFunctionReturn(0);
1928 }
1929 
/* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
{
  PetscErrorCode     ierr;
  PetscMemType       rootmtype,leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* NOTE(review): both the begin and the end of this operation are logged under PETSCSF_BcastAndOpBegin,
     presumably because no dedicated BcastToZero event exists -- confirm this is intentional */
  ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  /* Determine where each buffer lives (host/device) for the type-specific implementation */
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  /* Unlike the public Bcast routines there is no generic fallback; the SF type must implement BcastToZero */
  if (sf->ops->BcastToZero) {
    ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
  ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1948 
1949