xref: /petsc/src/vec/is/sf/interface/sf.c (revision 9c8bf5420c6f7064591f60b3ab9f6265d71985cc)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
/* In debug builds, raise an error unless the graph of (sf) has been set via
   PetscSFSetGraph() or PetscSFSetGraphWithPattern(); (arg) is the argument number
   of (sf) in the calling function, used only in the error message. Expands to a
   no-op in optimized builds. */
#if defined(PETSC_USE_DEBUG)
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif
13 
14 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",0};
15 
/*@
   PetscSFCreate - create a star forest communication context

   Collective

   Input Arguments:
.  comm - communicator on which the star forest will operate

   Output Arguments:
.  sf - new star forest context

   Options Database Keys:
+  -sf_type basic     - Use MPI persistent Isend/Irecv for communication (Default)
.  -sf_type window    - Use MPI-3 one-sided window for communication
-  -sf_type neighbor  - Use MPI-3 neighborhood collectives for communication

   Level: intermediate

   Notes:
   When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
   MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
   SFs are optimized and they have better performance than general SFs.

.seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
@*/
PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
{
  PetscErrorCode ierr;
  PetscSF        b;

  PetscFunctionBegin;
  PetscValidPointer(sf,2);
  ierr = PetscSFInitializePackage();CHKERRQ(ierr);

  ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);

  /* Sentinel values marking "graph not set yet"; PetscSFReset() restores these same values */
  b->nroots    = -1;
  b->nleaves   = -1;
  b->minleaf   = PETSC_MAX_INT;
  b->maxleaf   = PETSC_MIN_INT;
  b->nranks    = -1;
  b->rankorder = PETSC_TRUE; /* deterministic gathers by default; see PetscSFSetRankOrder() */
  b->ingroup   = MPI_GROUP_NULL;
  b->outgroup  = MPI_GROUP_NULL;
  b->graphset  = PETSC_FALSE;

  *sf = b;
  PetscFunctionReturn(0);
}
65 
/*@
   PetscSFReset - Reset a star forest so that different sizes or neighbors can be used

   Collective

   Input Arguments:
.  sf - star forest

   Level: advanced

.seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
@*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the implementation release its own resources before the generic fields are cleared */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Restore the graph-related fields to the sentinels set by PetscSFCreate() */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  /* rmine_d is a device-side mirror of rmine; it is allocated lazily so it may be NULL here */
  if (sf->rmine_d) {cudaError_t err = cudaFree(sf->rmine_d);CHKERRCUDA(err);sf->rmine_d=NULL;}
#endif
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRQ(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRQ(ierr);}
  /* The multi-SF and root layout are derived objects; they are rebuilt on demand */
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
108 
/*@C
   PetscSFSetType - Set the PetscSF communication implementation

   Collective on PetscSF

   Input Parameters:
+  sf - the PetscSF context
-  type - a known method

   Options Database Key:
.  -sf_type <type> - Sets the method; use -help for a list
   of available methods (for instance, window, pt2pt, neighbor)

   Notes:
   See "include/petscsf.h" for available methods (for instance)
+    PETSCSFWINDOW - MPI-2/3 one-sided
-    PETSCSFBASIC - basic implementation using MPI-1 two-sided

  Level: intermediate

.seealso: PetscSFType, PetscSFCreate()
@*/
PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
{
  PetscErrorCode ierr,(*r)(PetscSF);
  PetscBool      match;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidCharPointer(type,2);

  /* Setting the same type again is a no-op */
  ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
  if (match) PetscFunctionReturn(0);

  /* Look up the registered constructor before tearing anything down */
  ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
  if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
  /* Destroy the previous PetscSF implementation context */
  if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
  /* Clear the ops table so methods not provided by the new type are NULL */
  ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
  ierr = (*r)(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
152 
153 /*@C
154   PetscSFGetType - Get the PetscSF communication implementation
155 
156   Not Collective
157 
158   Input Parameter:
159 . sf  - the PetscSF context
160 
161   Output Parameter:
162 . type - the PetscSF type name
163 
164   Level: intermediate
165 
166 .seealso: PetscSFSetType(), PetscSFCreate()
167 @*/
168 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
169 {
170   PetscFunctionBegin;
171   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
172   PetscValidPointer(type,2);
173   *type = ((PetscObject)sf)->type_name;
174   PetscFunctionReturn(0);
175 }
176 
/*@
   PetscSFDestroy - destroy star forest

   Collective

   Input Arguments:
.  sf - address of star forest

   Level: intermediate

.seealso: PetscSFCreate(), PetscSFReset()
@*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0);
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Only the last reference actually destroys the object; earlier calls just drop their reference */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  /* PetscHeaderDestroy frees the object and sets *sf to NULL */
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
202 
203 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
204 {
205 #if defined(PETSC_USE_DEBUG)
206   PetscInt           i, nleaves;
207   PetscMPIInt        size;
208   const PetscInt    *ilocal;
209   const PetscSFNode *iremote;
210   PetscErrorCode     ierr;
211 
212   PetscFunctionBegin;
213   if (!sf->graphset) PetscFunctionReturn(0);
214   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
215   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
216   for (i = 0; i < nleaves; i++) {
217     const PetscInt rank = iremote[i].rank;
218     const PetscInt remote = iremote[i].index;
219     const PetscInt leaf = ilocal ? ilocal[i] : i;
220     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
221     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
222     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
223   }
224   PetscFunctionReturn(0);
225 #else
226   PetscFunctionBegin;
227   PetscFunctionReturn(0);
228 #endif
229 }
230 
/*@
   PetscSFSetUp - set up communication structures

   Collective

   Input Arguments:
.  sf - star forest communication object

   Level: beginner

.seealso: PetscSFSetFromOptions(), PetscSFSetType()
@*/
PetscErrorCode PetscSFSetUp(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  if (sf->setupcalled) PetscFunctionReturn(0);
  /* Validate graph ranks/indices (debug builds only) */
  ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
  /* Fall back to the default implementation if the user never chose one */
  if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);}
  ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
  ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  sf->setupcalled = PETSC_TRUE;
  PetscFunctionReturn(0);
}
259 
/*@
   PetscSFSetFromOptions - set PetscSF options using the options database

   Logically Collective

   Input Arguments:
.  sf - star forest

   Options Database Keys:
+  -sf_type - implementation type, see PetscSFSetType()
-  -sf_rank_order - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise

   Level: intermediate

.seealso: PetscSFWindowSetSyncType()
@*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Default to the current type if one is set, otherwise to the basic implementation */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
  /* PetscOptionsObject is declared by the PetscObjectOptionsBegin() macro above */
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
294 
295 /*@
296    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
297 
298    Logically Collective
299 
300    Input Arguments:
301 +  sf - star forest
302 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
303 
304    Level: advanced
305 
306 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
307 @*/
308 PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
309 {
310 
311   PetscFunctionBegin;
312   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
313   PetscValidLogicalCollectiveBool(sf,flg,2);
314   if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
315   sf->rankorder = flg;
316   PetscFunctionReturn(0);
317 }
318 
/*@
   PetscSFSetGraph - Set a parallel star forest

   Collective

   Input Arguments:
+  sf - star forest
.  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
.  nleaves - number of leaf vertices on the current process, each of these references a root on any process
.  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
during setup in debug mode)
.  localmode - copy mode for ilocal
.  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
during setup in debug mode)
-  remotemode - copy mode for iremote

   Level: intermediate

   Notes:
    In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode

   Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
   encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
   needed

   Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
   indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).

.seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
@*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  /* A new graph invalidates everything derived from the old one */
  ierr = PetscSFReset(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    /* One pass computes the leaf range and detects the identity permutation */
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* Identity ilocal encodes contiguous storage; discard it (freeing if ownership was transferred) */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  /* Take ownership of ilocal according to localmode; sf->mine_alloc tracks what we must free */
  if (ilocal) {
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      /* Caller retains ownership; we must not free it in PetscSFReset() */
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same ownership handling for iremote */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
429 
/*@
  PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern

  Collective

  Input Parameters:
+ sf      - The PetscSF
. map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
- pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL

  Notes:
  It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
  n and N are local and global sizes of x respectively.

  With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
  sequential vectors y on all ranks.

  With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
  sequential vector y on rank 0.

  In above cases, entries of x are roots and entries of y are leaves.

  With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
  creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
  of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one can not send multiple items. One just needs to create a new MPI datatype for the multiple data
  items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.

  In this case, roots and leaves are symmetric.

  Level: intermediate
 @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  /* NOTE(review): PETSc error messages conventionally omit the trailing \n -- confirm and drop it here */
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    type = PETSCSFALLTOALL;
    /* Build our own root layout: one root per rank locally, size*size roots globally */
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
    /* max(n) == -max(-n) == min(n) implies all local sizes are equal */
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves (the gathered vector lives there) */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
527 
/*@
   PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map

   Collective

   Input Arguments:

.  sf - star forest to invert

   Output Arguments:
.  isf - inverse of sf
   Level: advanced

   Notes:
   All roots must have degree 1.

   The local space may be a permutation, but cannot be sparse.

.seealso: PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Each local slot advertises its own (rank,index); reducing this to the roots tells
     every root which leaf points at it, which is exactly the inverted edge */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* (-1,-1) marks roots that receive no leaf, i.e. unreferenced roots */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* every root was hit: contiguous leaves in the inverse */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* newilocal ownership transfers to *isf; roots is copied, so we free both roots and leaves below */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
597 
/*@
   PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph

   Collective

   Input Arguments:
+  sf - communication object to duplicate
-  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)

   Output Arguments:
.  newsf - new communication object

   Level: beginner

.seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
{
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,opt,2);
  PetscValidPointer(newsf,3);
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
  /* sf may not have a type yet; only propagate one if it exists */
  ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
  if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf,1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      PetscInt          nroots,nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
      ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
    } else {
      /* Patterned graphs are regenerated from the root layout rather than copied element-wise */
      ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
    }
  }
  /* Let the implementation copy any type-specific state (e.g. window flavor, sync type) */
  if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
641 
642 /*@C
643    PetscSFGetGraph - Get the graph specifying a parallel star forest
644 
645    Not Collective
646 
647    Input Arguments:
648 .  sf - star forest
649 
650    Output Arguments:
651 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
652 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
653 .  ilocal - locations of leaves in leafdata buffers
654 -  iremote - remote locations of root vertices for each leaf on the current process
655 
656    Notes:
657    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
658 
659    When called from Fortran, the returned iremote array is a copy and must deallocated after use. Consequently, if you
660    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
661 
662    Level: intermediate
663 
664 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
665 @*/
666 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
667 {
668   PetscErrorCode ierr;
669 
670   PetscFunctionBegin;
671   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
672   if (sf->ops->GetGraph) {
673     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
674   } else {
675     if (nroots) *nroots = sf->nroots;
676     if (nleaves) *nleaves = sf->nleaves;
677     if (ilocal) *ilocal = sf->mine;
678     if (iremote) *iremote = sf->remote;
679   }
680   PetscFunctionReturn(0);
681 }
682 
683 /*@
684    PetscSFGetLeafRange - Get the active leaf ranges
685 
686    Not Collective
687 
688    Input Arguments:
689 .  sf - star forest
690 
691    Output Arguments:
692 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
693 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
694 
695    Level: developer
696 
697 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
698 @*/
699 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
700 {
701 
702   PetscFunctionBegin;
703   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
704   PetscSFCheckGraphSet(sf,1);
705   if (minleaf) *minleaf = sf->minleaf;
706   if (maxleaf) *maxleaf = sf->maxleaf;
707   PetscFunctionReturn(0);
708 }
709 
/*@C
   PetscSFView - view a star forest

   Collective

   Input Arguments:
+  sf - star forest
-  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD

   Level: beginner

.seealso: PetscSFCreate(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Ensure rank arrays exist before the detailed view below tries to print them */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      /* One line per leaf: local location <- (remote rank, remote index) */
      for (i=0; i<sf->nleaves; i++) {
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Sort a copy of the rank list so output is in ascending rank order without disturbing sf->ranks */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
780 
781 /*@C
782    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
783 
784    Not Collective
785 
786    Input Arguments:
787 .  sf - star forest
788 
789    Output Arguments:
790 +  nranks - number of ranks referenced by local part
791 .  ranks - array of ranks
792 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
793 .  rmine - concatenated array holding local indices referencing each remote rank
794 -  rremote - concatenated array holding remote indices referenced for each remote rank
795 
796    Level: developer
797 
798 .seealso: PetscSFGetLeafRanks()
799 @*/
800 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
801 {
802   PetscErrorCode ierr;
803 
804   PetscFunctionBegin;
805   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
806   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
807   if (sf->ops->GetRootRanks) {
808     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
809   } else {
810     /* The generic implementation */
811     if (nranks)  *nranks  = sf->nranks;
812     if (ranks)   *ranks   = sf->ranks;
813     if (roffset) *roffset = sf->roffset;
814     if (rmine)   *rmine   = sf->rmine;
815     if (rremote) *rremote = sf->rremote;
816   }
817   PetscFunctionReturn(0);
818 }
819 
820 /*@C
821    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
822 
823    Not Collective
824 
825    Input Arguments:
826 .  sf - star forest
827 
828    Output Arguments:
829 +  niranks - number of leaf ranks referencing roots on this process
830 .  iranks - array of ranks
831 .  ioffset - offset in irootloc for each rank (length niranks+1)
832 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
833 
834    Level: developer
835 
836 .seealso: PetscSFGetRootRanks()
837 @*/
838 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
839 {
840   PetscErrorCode ierr;
841 
842   PetscFunctionBegin;
843   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
844   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
845   if (sf->ops->GetLeafRanks) {
846     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
847   } else {
848     PetscSFType type;
849     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
850     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
851   }
852   PetscFunctionReturn(0);
853 }
854 
855 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
856   PetscInt i;
857   for (i=0; i<n; i++) {
858     if (needle == list[i]) return PETSC_TRUE;
859   }
860   return PETSC_FALSE;
861 }
862 
/*@C
   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.

   Collective

   Input Arguments:
+  sf - PetscSF to set up; PetscSFSetGraph() must have been called
-  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)

   Level: developer

.seealso: PetscSFGetRootRanks()
@*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
  /* Count, per remote root rank, how many of this process's leaves reference it */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  /* Allocate the per-rank arrays returned by PetscSFGetRootRanks(); rmine/rremote hold one entry per leaf */
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  /* Drain the table into parallel arrays ranks[] (which rank) and rcount[] (how many leaves reference it) */
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate every member of dgroup into a rank in the SF's communicator; groupranks[] is consumed by InList() below */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRQ(ierr);}
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  /* Two-pointer partition: sf->ndranks advances from the front, i retreats from the back, swapping when
     a misplaced pair is found; rcount[] is kept aligned with ranks[] throughout */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i; ) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for ( ; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt    tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition segment by rank so PetscFindMPIInt() (binary search) works below */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Build CSR-style offsets into rmine/rremote, and reset rcount[] for reuse as per-rank fill cursors */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Bucket each leaf into its rank's segment: rmine[] gets the local leaf index, rremote[] the remote root index */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    /* orank caches the previous leaf's rank so consecutive leaves with the same rank skip the search */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks */
      /* Search the distinguished segment first, then the non-distinguished one */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
966 
/*@C
   PetscSFGetGroups - gets incoming and outgoing process groups

   Collective

   Input Argument:
.  sf - star forest

   Output Arguments:
+  incoming - group of origin processes for incoming edges (leaves that reference my roots)
-  outgoing - group of destination processes for outgoing edges (roots that I reference)

   Level: developer

   Notes:
   The groups are built lazily on first call, cached on the SF, and owned by it; callers must not free them.

.seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
@*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining groups");
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks */
    /* Build an auxiliary SF with one leaf per communicating rank, all pointing at root 0 of that rank;
       its root in-degree is then the number of ranks whose leaves reference our roots */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);

    /* Enumerate the incoming ranks */
    /* Each process sends its own rank along every edge; gathering at the root collects exactly the incoming ranks */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) {
    /* The outgoing group is simply the ranks our leaves reference, already available in sf->ranks */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1030 
/*@
   PetscSFGetMultiSF - gets the inner SF implementing gathers and scatters

   Collective

   Input Argument:
.  sf - star forest that may contain roots with 0 or with more than 1 vertex

   Output Arguments:
.  multi - star forest with split roots, such that each root has degree exactly 1

   Level: developer

   Notes:
   The multi-SF is built lazily on first call, cached on sf, and owned by it; do not destroy it.

   In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
   directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
   edge, it is a candidate for future optimization that might involve its removal.

.seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
@*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    PetscFunctionReturn(0);
  }
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    /* inoffset[] is the prefix sum of root degrees: root i's split copies occupy [inoffset[i],inoffset[i+1]) in the multi-root space */
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    /* Fetch-and-add of 1 at each root hands every incoming leaf a distinct slot number within its root's range */
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
#if 0
#if defined(PETSC_USE_DEBUG)                                 /* Check that the expected number of increments occurred */
    for (i=0; i<sf->nroots; i++) {
      if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
    }
#endif
#endif
    /* Each leaf now targets its unique multi-root slot (outoffset at the leaf's local index) */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      /* Reduce each leaf's owning rank onto its multi-root slot so the root knows which rank filled each slot */
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Broadcast the rank-sorted slot numbers back to the leaves and rebuild the graph with them */
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1124 
1125 /*@C
1126    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1127 
1128    Collective
1129 
1130    Input Arguments:
1131 +  sf - original star forest
1132 .  nselected  - number of selected roots on this process
1133 -  selected   - indices of the selected roots on this process
1134 
1135    Output Arguments:
1136 .  newsf - new star forest
1137 
1138    Level: advanced
1139 
1140    Note:
1141    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1142    be done by calling PetscSFGetGraph().
1143 
1144 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1145 @*/
1146 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1147 {
1148   PetscInt          i,n,*roots,*rootdata,*leafdata,nroots,nleaves,connected_leaves,*new_ilocal;
1149   const PetscSFNode *iremote;
1150   PetscSFNode       *new_iremote;
1151   PetscSF           tmpsf;
1152   MPI_Comm          comm;
1153   PetscErrorCode    ierr;
1154 
1155   PetscFunctionBegin;
1156   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1157   PetscSFCheckGraphSet(sf,1);
1158   if (nselected) PetscValidPointer(selected,3);
1159   PetscValidPointer(newsf,4);
1160 
1161   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1162 
1163   /* Uniq selected[] and put results in roots[] */
1164   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1165   ierr = PetscMalloc1(nselected,&roots);CHKERRQ(ierr);
1166   ierr = PetscArraycpy(roots,selected,nselected);CHKERRQ(ierr);
1167   ierr = PetscSortedRemoveDupsInt(&nselected,roots);CHKERRQ(ierr);
1168   if (nselected && (roots[0] < 0 || roots[nselected-1] >= sf->nroots)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max root indices %D/%D are not in [0,%D)",roots[0],roots[nselected-1],sf->nroots);
1169 
1170   if (sf->ops->CreateEmbeddedSF) {
1171     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,roots,newsf);CHKERRQ(ierr);
1172   } else {
1173     /* A generic version of creating embedded sf. Note that we called PetscSFSetGraph() twice, which is certainly expensive */
1174     /* Find out which leaves (not leaf data items) are still connected to roots in the embedded sf */
1175     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,NULL,&iremote);CHKERRQ(ierr);
1176     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&tmpsf);CHKERRQ(ierr);
1177     ierr = PetscSFSetGraph(tmpsf,nroots,nleaves,NULL/*contiguous*/,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);CHKERRQ(ierr);
1178     ierr = PetscCalloc2(nroots,&rootdata,nleaves,&leafdata);CHKERRQ(ierr);
1179     for (i=0; i<nselected; ++i) rootdata[roots[i]] = 1;
1180     ierr = PetscSFBcastBegin(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1181     ierr = PetscSFBcastEnd(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1182     ierr = PetscSFDestroy(&tmpsf);CHKERRQ(ierr);
1183 
1184     /* Build newsf with leaves that are still connected */
1185     connected_leaves = 0;
1186     for (i=0; i<nleaves; ++i) connected_leaves += leafdata[i];
1187     ierr = PetscMalloc1(connected_leaves,&new_ilocal);CHKERRQ(ierr);
1188     ierr = PetscMalloc1(connected_leaves,&new_iremote);CHKERRQ(ierr);
1189     for (i=0, n=0; i<nleaves; ++i) {
1190       if (leafdata[i]) {
1191         new_ilocal[n]        = sf->mine ? sf->mine[i] : i;
1192         new_iremote[n].rank  = sf->remote[i].rank;
1193         new_iremote[n].index = sf->remote[i].index;
1194         ++n;
1195       }
1196     }
1197 
1198     if (n != connected_leaves) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"There is a size mismatch in the SF embedding, %d != %d",n,connected_leaves);
1199     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1200     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1201     ierr = PetscSFSetGraph(*newsf,nroots,connected_leaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1202     ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
1203   }
1204   ierr = PetscFree(roots);CHKERRQ(ierr);
1205   PetscFunctionReturn(0);
1206 }
1207 
1208 /*@C
1209   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1210 
1211   Collective
1212 
1213   Input Arguments:
1214 + sf - original star forest
1215 . nselected  - number of selected leaves on this process
1216 - selected   - indices of the selected leaves on this process
1217 
1218   Output Arguments:
1219 .  newsf - new star forest
1220 
1221   Level: advanced
1222 
1223 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1224 @*/
1225 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1226 {
1227   const PetscSFNode *iremote;
1228   PetscSFNode       *new_iremote;
1229   const PetscInt    *ilocal;
1230   PetscInt          i,nroots,*leaves,*new_ilocal;
1231   MPI_Comm          comm;
1232   PetscErrorCode    ierr;
1233 
1234   PetscFunctionBegin;
1235   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1236   PetscSFCheckGraphSet(sf,1);
1237   if (nselected) PetscValidPointer(selected,3);
1238   PetscValidPointer(newsf,4);
1239 
1240   /* Uniq selected[] and put results in leaves[] */
1241   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1242   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1243   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1244   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1245   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1246 
1247   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1248   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1249     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1250   } else {
1251     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1252     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1253     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1254     for (i=0; i<nselected; ++i) {
1255       const PetscInt l     = leaves[i];
1256       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1257       new_iremote[i].rank  = iremote[l].rank;
1258       new_iremote[i].index = iremote[l].index;
1259     }
1260     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1261     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1262     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1263   }
1264   ierr = PetscFree(leaves);CHKERRQ(ierr);
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 /*@C
1269    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1270 
1271    Collective on PetscSF
1272 
1273    Input Arguments:
1274 +  sf - star forest on which to communicate
1275 .  unit - data type associated with each node
1276 .  rootdata - buffer to broadcast
1277 -  op - operation to use for reduction
1278 
1279    Output Arguments:
1280 .  leafdata - buffer to be reduced with values from each leaf's respective root
1281 
1282    Level: intermediate
1283 
1284 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd()
1285 @*/
1286 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1287 {
1288   PetscErrorCode ierr;
1289   PetscMemType   rootmtype,leafmtype;
1290 
1291   PetscFunctionBegin;
1292   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1293   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1294   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1295   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1296   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1297 #if defined(PETSC_HAVE_CUDA)
1298   /*  Shall we assume rootdata, leafdata are ready to use (instead of being computed by some asynchronous kernels)?
1299     To be similar to MPI, I'd like to have this assumption, since MPI does not have a concept of stream.
1300     But currently this assumption is not enforecd in Petsc. To be safe, I do synchronization here. Otherwise, if
1301     we do not sync now and call the Pack kernel directly on the default NULL stream (assume petsc objects are also
1302     computed on it), we have to sync the NULL stream before calling MPI routines. So, it looks a cudaDeviceSynchronize
1303     is inevitable. We do it now and put pack/unpack kernels to non-NULL streams.
1304    */
1305   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1306 #endif
1307   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1308   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1309   PetscFunctionReturn(0);
1310 }
1311 
1312 /*@C
1313    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1314 
1315    Collective
1316 
1317    Input Arguments:
1318 +  sf - star forest
1319 .  unit - data type
1320 .  rootdata - buffer to broadcast
1321 -  op - operation to use for reduction
1322 
1323    Output Arguments:
1324 .  leafdata - buffer to be reduced with values from each leaf's respective root
1325 
1326    Level: intermediate
1327 
1328 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1329 @*/
1330 PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1331 {
1332   PetscErrorCode ierr;
1333   PetscMemType   rootmtype,leafmtype;
1334 
1335   PetscFunctionBegin;
1336   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1337   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1338   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1339   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1340   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1341   ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1342   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1343   PetscFunctionReturn(0);
1344 }
1345 
1346 /*@C
1347    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1348 
1349    Collective
1350 
1351    Input Arguments:
1352 +  sf - star forest
1353 .  unit - data type
1354 .  leafdata - values to reduce
1355 -  op - reduction operation
1356 
1357    Output Arguments:
1358 .  rootdata - result of reduction of values from all leaves of each root
1359 
1360    Level: intermediate
1361 
1362 .seealso: PetscSFBcastBegin()
1363 @*/
1364 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1365 {
1366   PetscErrorCode ierr;
1367   PetscMemType   rootmtype,leafmtype;
1368 
1369   PetscFunctionBegin;
1370   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1371   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1372   ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1373   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1374   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1375 #if defined(PETSC_HAVE_CUDA)
1376   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1377 #endif
1378   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1379   ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1380   PetscFunctionReturn(0);
1381 }
1382 
1383 /*@C
1384    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1385 
1386    Collective
1387 
1388    Input Arguments:
1389 +  sf - star forest
1390 .  unit - data type
1391 .  leafdata - values to reduce
1392 -  op - reduction operation
1393 
1394    Output Arguments:
1395 .  rootdata - result of reduction of values from all leaves of each root
1396 
1397    Level: intermediate
1398 
1399 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1400 @*/
1401 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1402 {
1403   PetscErrorCode ierr;
1404   PetscMemType   rootmtype,leafmtype;
1405 
1406   PetscFunctionBegin;
1407   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1408   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1409   ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1410   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1411   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1412   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1413   ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1414   PetscFunctionReturn(0);
1415 }
1416 
1417 /*@C
1418    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1419 
1420    Collective
1421 
1422    Input Arguments:
1423 +  sf - star forest
1424 .  unit - data type
1425 .  leafdata - leaf values to use in reduction
1426 -  op - operation to use for reduction
1427 
1428    Output Arguments:
1429 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1430 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1431 
1432    Level: advanced
1433 
1434    Note:
1435    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1436    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1437    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1438    integers.
1439 
1440 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1441 @*/
1442 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1443 {
1444   PetscErrorCode ierr;
1445   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1446 
1447   PetscFunctionBegin;
1448   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1449   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1450   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1451   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1452   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1453   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1454   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1455 #if defined(PETSC_HAVE_CUDA)
1456   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1457 #endif
1458   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1459   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1460   PetscFunctionReturn(0);
1461 }
1462 
1463 /*@C
1464    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1465 
1466    Collective
1467 
1468    Input Arguments:
1469 +  sf - star forest
1470 .  unit - data type
1471 .  leafdata - leaf values to use in reduction
1472 -  op - operation to use for reduction
1473 
1474    Output Arguments:
1475 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1476 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1477 
1478    Level: advanced
1479 
1480 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1481 @*/
1482 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1483 {
1484   PetscErrorCode ierr;
1485   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1486 
1487   PetscFunctionBegin;
1488   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1489   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1490   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1491   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1492   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1493   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1494   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1495   ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1496   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1497   PetscFunctionReturn(0);
1498 }
1499 
1500 /*@C
1501    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()
1502 
1503    Collective
1504 
1505    Input Arguments:
1506 .  sf - star forest
1507 
1508    Output Arguments:
1509 .  degree - degree of each root vertex
1510 
1511    Level: advanced
1512 
1513    Notes:
1514    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1515 
1516 .seealso: PetscSFGatherBegin()
1517 @*/
1518 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
1519 {
1520   PetscErrorCode ierr;
1521 
1522   PetscFunctionBegin;
1523   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1524   PetscSFCheckGraphSet(sf,1);
1525   PetscValidPointer(degree,2);
1526   if (!sf->degreeknown) {
1527     PetscInt i, nroots = sf->nroots, maxlocal = sf->maxleaf+1;  /* TODO: We should use PetscSFGetLeafRange() */
1528     if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1529     ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
1530     ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1531     for (i=0; i<nroots; i++) sf->degree[i] = 0;
1532     for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
1533     ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
1534   }
1535   *degree = NULL;
1536   PetscFunctionReturn(0);
1537 }
1538 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: developer

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFComputeDegreeBegin(), PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    /* degreetmp is allocated by PetscSFComputeDegreeBegin(); its absence means Begin was never called */
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    sf->degreeknown = PETSC_TRUE; /* later Begin/End pairs skip communication and return the cached array */
  }
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1574 
1575 
1576 /*@C
1577    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1578    Each multi-root is assigned index of the corresponding original root.
1579 
1580    Collective
1581 
1582    Input Arguments:
1583 +  sf - star forest
1584 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1585 
1586    Output Arguments:
1587 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1588 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1589 
1590    Level: developer
1591 
1592    Notes:
1593    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1594 
1595 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1596 @*/
1597 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1598 {
1599   PetscSF             msf;
1600   PetscInt            i, j, k, nroots, nmroots;
1601   PetscErrorCode      ierr;
1602 
1603   PetscFunctionBegin;
1604   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1605   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1606   if (nroots) PetscValidIntPointer(degree,2);
1607   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1608   PetscValidPointer(multiRootsOrigNumbering,4);
1609   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1610   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1611   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1612   for (i=0,j=0,k=0; i<nroots; i++) {
1613     if (!degree[i]) continue;
1614     for (j=0; j<degree[i]; j++,k++) {
1615       (*multiRootsOrigNumbering)[k] = i;
1616     }
1617   }
1618 #if defined(PETSC_USE_DEBUG)
1619   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1620 #endif
1621   if (nMultiRoots) *nMultiRoots = nmroots;
1622   PetscFunctionReturn(0);
1623 }
1624 
1625 /*@C
1626    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1627 
1628    Collective
1629 
1630    Input Arguments:
1631 +  sf - star forest
1632 .  unit - data type
1633 -  leafdata - leaf data to gather to roots
1634 
1635    Output Argument:
1636 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1637 
1638    Level: intermediate
1639 
1640 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1641 @*/
1642 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1643 {
1644   PetscErrorCode ierr;
1645   PetscSF        multi;
1646 
1647   PetscFunctionBegin;
1648   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1649   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1650   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1651   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1652   PetscFunctionReturn(0);
1653 }
1654 
1655 /*@C
1656    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1657 
1658    Collective
1659 
1660    Input Arguments:
1661 +  sf - star forest
1662 .  unit - data type
1663 -  leafdata - leaf data to gather to roots
1664 
1665    Output Argument:
1666 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1667 
1668    Level: intermediate
1669 
1670 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1671 @*/
1672 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1673 {
1674   PetscErrorCode ierr;
1675   PetscSF        multi;
1676 
1677   PetscFunctionBegin;
1678   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1679   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1680   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1681   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1682   PetscFunctionReturn(0);
1683 }
1684 
/*@C
   PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeBegin(), PetscSFScatterEnd()
@*/
PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* A scatter is a broadcast over the multi-SF, delivering each multi-root's unit to its leaf */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1714 
/*@C
   PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeEnd(), PetscSFScatterBegin()
@*/
PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Complete the broadcast on the multi-SF started by PetscSFScatterBegin() */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1744 
1745 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1746 {
1747 #if defined(PETSC_USE_DEBUG)
1748   PetscInt        i, n, nleaves;
1749   const PetscInt *ilocal = NULL;
1750   PetscHSetI      seen;
1751   PetscErrorCode  ierr;
1752 
1753   PetscFunctionBegin;
1754   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1755   ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1756   for (i = 0; i < nleaves; i++) {
1757     const PetscInt leaf = ilocal ? ilocal[i] : i;
1758     ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1759   }
1760   ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1761   if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1762   ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1763   PetscFunctionReturn(0);
1764 #else
1765   PetscFunctionBegin;
1766   PetscFunctionReturn(0);
1767 #endif
1768 }
/*@
  PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view

  Input Parameters:
+ sfA - The first PetscSF, whose local space may be a permutation, but can not be sparse.
- sfB - The second PetscSF, whose number of roots must be equal to number of leaves of sfA on each processor

  Output Parameters:
. sfBA - The composite SF. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then a Bcast on sfB

  Level: developer

  Notes:
  For the resulting composed SF to be valid, the input SFs must be true star forests: the leaves must be unique. This is
  checked in debug mode.

.seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  MPI_Comm          comm;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB,*localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscObjectGetComm((PetscObject)sfA,&comm);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  ierr = PetscSFGetLeafRange(sfA,&minleaf,&maxleaf);CHKERRQ(ierr);
  if (numRootsB != numLeavesA) SETERRQ(comm,PETSC_ERR_ARG_INCOMP,"The second SF's number of roots must be equal to the first SF's number of leaves");
  if (maxleaf+1 != numLeavesA || minleaf) SETERRQ(comm,PETSC_ERR_ARG_INCOMP,"The first SF can not have sparse local space");
  /* The above check is fast, but not sufficient, since we cannot guarantee that the SF has unique leaves. So in debug
   mode, check properly. */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  if (localPointsA) {
    /* Local space is dense permutation of identity. Need to rewire order of the remote points */
    ierr = PetscMalloc1(numLeavesA,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numLeavesA; i++) reorderedRemotePointsA[localPointsA[i]-minleaf] = remotePointsA[i];
    remotePointsA = reorderedRemotePointsA;
  }
  /* Broadcast sfA's (reordered) remote points through sfB so each sfB leaf learns which sfA root it chains back to */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  /* leafdataB-minleaf shifts the buffer so it can be indexed by sfB's absolute leaf numbers */
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* sfB's leaves must be unique, otherwise BcastAndOp(B o A) != BcastAndOp(B) o BcastAndOp(A) */
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
  if (minleaf == 0 && maxleaf + 1 == numLeavesB) { /* Local space of sfB is an identity or permutation */
    localPointsBA  = NULL;
    remotePointsBA = leafdataB; /* ownership of leafdataB transfers to the composite SF below */
  } else {
    localPointsBA  = localPointsB;
    ierr = PetscMalloc1(numLeavesB,&remotePointsBA);CHKERRQ(ierr);
    for (i=0; i<numLeavesB; i++) remotePointsBA[i] = leafdataB[localPointsB[i]-minleaf];
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  ierr = PetscSFCreate(comm, sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesB,localPointsBA,PETSC_COPY_VALUES,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1838 
/*@
  PetscSFComposeInverse - Compose a new PetscSF by putting inverse of the second SF under the first one

  Input Parameters:
+ sfA - The first PetscSF, whose local space may be a permutation, but can not be sparse.
- sfB - The second PetscSF, whose number of leaves must be equal to number of leaves of sfA on each processor. All roots must have degree 1.

  Output Parameters:
. sfBA - The composite SF. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then a Bcast on inverse of sfB

  Level: developer

.seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  MPI_Comm          comm;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscValidPointer(sfBA,3);
  ierr = PetscObjectGetComm((PetscObject) sfA, &comm);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);
  ierr = PetscSFGetLeafRange(sfA,&minleaf,&maxleaf);CHKERRQ(ierr);
  if (maxleaf+1-minleaf != numLeavesA) SETERRQ(comm,PETSC_ERR_ARG_INCOMP,"The first SF can not have sparse local space");
  if (numLeavesA != numLeavesB) SETERRQ(comm,PETSC_ERR_ARG_INCOMP,"The second SF's number of leaves must be equal to the first SF's number of leaves");
  /* Reduce sfA's remote points through sfB with MPIU_REPLACE: since every root of sfB has
     degree 1, each sfB root receives exactly one sfA edge, which realizes composition with
     the inverse of sfB */
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,remotePointsA,remotePointsBA,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,remotePointsA,remotePointsBA,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFCreate(comm,sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numRootsB,NULL,PETSC_COPY_VALUES,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1881 
1882 /*
1883   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
1884 
1885   Input Parameters:
1886 . sf - The global PetscSF
1887 
1888   Output Parameters:
1889 . out - The local PetscSF
1890  */
1891 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
1892 {
1893   MPI_Comm           comm;
1894   PetscMPIInt        myrank;
1895   const PetscInt     *ilocal;
1896   const PetscSFNode  *iremote;
1897   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
1898   PetscSFNode        *liremote;
1899   PetscSF            lsf;
1900   PetscErrorCode     ierr;
1901 
1902   PetscFunctionBegin;
1903   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1904   if (sf->ops->CreateLocalSF) {
1905     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
1906   } else {
1907     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
1908     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1909     ierr = MPI_Comm_rank(comm,&myrank);CHKERRQ(ierr);
1910 
1911     /* Find out local edges and build a local SF */
1912     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1913     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
1914     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
1915     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
1916 
1917     for (i=j=0; i<nleaves; i++) {
1918       if (iremote[i].rank == (PetscInt)myrank) {
1919         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
1920         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
1921         liremote[j].index = iremote[i].index;
1922         j++;
1923       }
1924     }
1925     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
1926     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1927     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
1928     *out = lsf;
1929   }
1930   PetscFunctionReturn(0);
1931 }
1932 
/* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
{
  PetscErrorCode     ierr;
  PetscMemType       rootmtype,leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* NOTE(review): reuses the PETSCSF_BcastAndOpBegin log event for both begin and end of this
     whole operation; presumably no dedicated BcastToZero event exists -- confirm */
  ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  /* BcastToZero is optional for SF implementations; error out rather than silently fall back */
  if (sf->ops->BcastToZero) {
    ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
  ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1951 
1952