xref: /petsc/src/vec/is/sf/interface/sf.c (revision c823fc1f4e9a95d0fa9549bc75dfa3b1db6e9a8a)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
5 #if defined(PETSC_HAVE_CUDA)
6   #include <cuda_runtime.h>
7 #endif
8 
9 #if defined(PETSC_HAVE_HIP)
10   #include <hip/hip_runtime.h>
11 #endif
12 
#if defined(PETSC_USE_DEBUG)
/* In debug builds, raise an error unless a graph has been set on sf;
   arg is the position of sf in the calling function's argument list */
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
/* In optimized builds the check compiles away entirely */
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif
21 
22 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",NULL};
23 
/* Determine whether data points to host or device memory.

   type is set to PETSC_MEMTYPE_HOST unless a CUDA or HIP runtime is
   initialized and the pointer is recognized as device memory.  A NULL
   data pointer is reported as host memory. */
PETSC_STATIC_INLINE PetscErrorCode PetscGetMemType(const void *data,PetscMemType *type)
{
  PetscFunctionBegin;
  PetscValidPointer(type,2);
  *type = PETSC_MEMTYPE_HOST; /* Default when the pointer is NULL or not a device pointer */
#if defined(PETSC_HAVE_CUDA)
  if (PetscCUDAInitialized && data) {
    cudaError_t                  cerr;
    struct cudaPointerAttributes attr;
    enum cudaMemoryType          mtype;
    cerr = cudaPointerGetAttributes(&attr,data); /* Do not check error since before CUDA 11.0, passing a host pointer returns cudaErrorInvalidValue */
    cudaGetLastError(); /* Reset the last error */
    #if (CUDART_VERSION < 10000)
      mtype = attr.memoryType; /* Field was renamed to 'type' in CUDA 10.0 */
    #else
      mtype = attr.type;
    #endif
    if (cerr == cudaSuccess && mtype == cudaMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif

#if defined(PETSC_HAVE_HIP)
  if (PetscHIPInitialized && data) {
    hipError_t                   cerr;
    struct hipPointerAttribute_t attr;
    enum hipMemoryType           mtype;
    cerr = hipPointerGetAttributes(&attr,data); /* Do not check error since hipPointerGetAttributes() may fail on plain host pointers; such pointers are simply reported as host memory */
    hipGetLastError(); /* Reset the last error */
    mtype = attr.memoryType;
    if (cerr == hipSuccess && mtype == hipMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif
  PetscFunctionReturn(0);
}
58 
59 /*@
60    PetscSFCreate - create a star forest communication context
61 
62    Collective
63 
64    Input Arguments:
65 .  comm - communicator on which the star forest will operate
66 
67    Output Arguments:
68 .  sf - new star forest context
69 
70    Options Database Keys:
71 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
72 .  -sf_type window    -Use MPI-3 one-sided window for communication
73 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
74 
75    Level: intermediate
76 
77    Notes:
   When one knows the communication graph is one of the predefined graphs, such as MPI_Alltoall, MPI_Allgatherv,
79    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
80    SFs are optimized and they have better performance than general SFs.
81 
82 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
83 @*/
PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
{
  PetscErrorCode ierr;
  PetscSF        b;

  PetscFunctionBegin;
  PetscValidPointer(sf,2);
  ierr = PetscSFInitializePackage();CHKERRQ(ierr);

  ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);

  /* Negative counts mark "no graph set yet"; min/max leaf start as an empty (inverted) range */
  b->nroots    = -1;
  b->nleaves   = -1;
  b->minleaf   = PETSC_MAX_INT;
  b->maxleaf   = PETSC_MIN_INT;
  b->nranks    = -1;
  b->rankorder = PETSC_TRUE;
  b->ingroup   = MPI_GROUP_NULL;
  b->outgroup  = MPI_GROUP_NULL;
  b->graphset  = PETSC_FALSE;
#if defined(PETSC_HAVE_DEVICE)
  b->use_gpu_aware_mpi    = use_gpu_aware_mpi; /* Inherit the global setting */
  b->use_stream_aware_mpi = PETSC_FALSE;
  b->use_default_stream   = PETSC_TRUE; /* The assumption is true for PETSc internal use of SF */
  #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
    b->backend = PETSCSF_BACKEND_KOKKOS;
  #elif defined(PETSC_HAVE_CUDA)
    b->backend = PETSCSF_BACKEND_CUDA;
  #endif
#endif
  *sf = b;
  PetscFunctionReturn(0);
}
117 
118 /*@
119    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
120 
121    Collective
122 
123    Input Arguments:
124 .  sf - star forest
125 
126    Level: advanced
127 
128 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
129 @*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the implementation release its own resources first */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Restore the sentinel values established in PetscSFCreate() */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
  /* Free both device-side buffers in sf->rmine_d */
  for (PetscInt i=0; i<2; i++) {ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,sf->rmine_d[i]);CHKERRQ(ierr);}
#endif
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRQ(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRQ(ierr);}
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
  /* Allow PetscSFSetUp() to run again with a new graph */
  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
160 
161 /*@C
162    PetscSFSetType - Set the PetscSF communication implementation
163 
164    Collective on PetscSF
165 
166    Input Parameters:
167 +  sf - the PetscSF context
168 -  type - a known method
169 
170    Options Database Key:
171 .  -sf_type <type> - Sets the method; use -help for a list
172    of available methods (for instance, window, basic, neighbor)
173 
174    Notes:
175    See "include/petscsf.h" for available methods (for instance)
176 +    PETSCSFWINDOW - MPI-2/3 one-sided
177 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
178 
179   Level: intermediate
180 
181 .seealso: PetscSFType, PetscSFCreate()
182 @*/
PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
{
  PetscErrorCode ierr,(*r)(PetscSF);
  PetscBool      match;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidCharPointer(type,2);

  /* No-op when sf already has the requested type */
  ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
  if (match) PetscFunctionReturn(0);

  /* Look up the registered constructor for this type name */
  ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
  if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
  /* Destroy the previous PetscSF implementation context */
  if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
  /* Clear the old ops table before the new constructor fills it in */
  ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
  ierr = (*r)(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
204 
205 /*@C
206   PetscSFGetType - Get the PetscSF communication implementation
207 
208   Not Collective
209 
210   Input Parameter:
211 . sf  - the PetscSF context
212 
213   Output Parameter:
214 . type - the PetscSF type name
215 
216   Level: intermediate
217 
218 .seealso: PetscSFSetType(), PetscSFCreate()
219 @*/
220 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
221 {
222   PetscFunctionBegin;
223   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
224   PetscValidPointer(type,2);
225   *type = ((PetscObject)sf)->type_name;
226   PetscFunctionReturn(0);
227 }
228 
229 /*@
230    PetscSFDestroy - destroy star forest
231 
232    Collective
233 
234    Input Arguments:
235 .  sf - address of star forest
236 
237    Level: intermediate
238 
239 .seealso: PetscSFCreate(), PetscSFReset()
240 @*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0);
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Only the last reference actually tears the object down */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  ierr = PetscSFDestroy(&(*sf)->vscat.lsf);CHKERRQ(ierr);
  /* vscat.unit is a derived datatype only when bs > 1 (see PetscSFDuplicate()); predefined types must not be freed */
  if ((*sf)->vscat.bs > 1) {ierr = MPI_Type_free(&(*sf)->vscat.unit);CHKERRQ(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
256 
257 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
258 {
259   PetscInt           i, nleaves;
260   PetscMPIInt        size;
261   const PetscInt    *ilocal;
262   const PetscSFNode *iremote;
263   PetscErrorCode     ierr;
264 
265   PetscFunctionBegin;
266   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
267   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
268   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
269   for (i = 0; i < nleaves; i++) {
270     const PetscInt rank = iremote[i].rank;
271     const PetscInt remote = iremote[i].index;
272     const PetscInt leaf = ilocal ? ilocal[i] : i;
273     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
274     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
275     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
276   }
277   PetscFunctionReturn(0);
278 }
279 
280 
281 
282 
283 /*@
284    PetscSFSetUp - set up communication structures
285 
286    Collective
287 
288    Input Arguments:
289 .  sf - star forest communication object
290 
291    Level: beginner
292 
293 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
294 @*/
PetscErrorCode PetscSFSetUp(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  if (sf->setupcalled) PetscFunctionReturn(0); /* Setup is idempotent */
  ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr); /* Validate ranks/indices (debug builds only) */
  if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);} /* Zero all sf->ops */
  if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_CUDA)
  /* Install backend-specific device memory (de)allocation hooks */
  if (sf->backend == PETSCSF_BACKEND_CUDA) {
    sf->ops->Malloc = PetscSFMalloc_Cuda;
    sf->ops->Free   = PetscSFFree_Cuda;
  }
#endif

#if defined(PETSC_HAVE_KOKKOS)
  if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
    sf->ops->Malloc = PetscSFMalloc_Kokkos;
    sf->ops->Free   = PetscSFFree_Kokkos;
  }
#endif
  ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
  sf->setupcalled = PETSC_TRUE;
  PetscFunctionReturn(0);
}
324 
325 /*@
326    PetscSFSetFromOptions - set PetscSF options using the options database
327 
328    Logically Collective
329 
330    Input Arguments:
331 .  sf - star forest
332 
333    Options Database Keys:
334 +  -sf_type               - implementation type, see PetscSFSetType()
335 .  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
336 .  -sf_use_default_stream - Assume callers of SF computed the input root/leafdata with the default cuda stream. SF will also
337                             use the default stream to process data. Therefore, no stream synchronization is needed between SF and its caller (default: true).
338                             If true, this option only works with -use_gpu_aware_mpi 1.
339 .  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
340                                If true, this option only works with -use_gpu_aware_mpi 1.
341 
342 -  -sf_backend cuda | kokkos -Select the device backend SF uses. Currently SF has two backends: cuda and Kokkos.
343                               On CUDA devices, one can choose cuda or kokkos with the default being cuda. On other devices,
344                               the only available is kokkos.
345 
346    Level: intermediate
347 @*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Default to the current type if one is set, otherwise basic */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
  {
    char        backendstr[32] = {0};
    PetscBool   isCuda = PETSC_FALSE,isKokkos = PETSC_FALSE,set;
    /* Change the defaults set in PetscSFCreate() with command line options */
    ierr = PetscOptionsBool("-sf_use_default_stream","Assume SF's input and output root/leafdata is computed on the default stream","PetscSFSetFromOptions",sf->use_default_stream,&sf->use_default_stream,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-sf_use_stream_aware_mpi","Assume the underlying MPI is cuda-stream aware","PetscSFSetFromOptions",sf->use_stream_aware_mpi,&sf->use_stream_aware_mpi,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsString("-sf_backend","Select the device backend SF uses","PetscSFSetFromOptions",NULL,backendstr,sizeof(backendstr),&set);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("cuda",backendstr,&isCuda);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("kokkos",backendstr,&isKokkos);CHKERRQ(ierr);
  #if defined(PETSC_HAVE_CUDA)
    /* With CUDA available, accept cuda or kokkos; reject anything else given explicitly */
    if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
    else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
    else if (set) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported on cuda devices. You may choose cuda or kokkos (if installed)", backendstr);
  #elif defined(PETSC_HAVE_KOKKOS)
    /* Without CUDA, kokkos is the only valid explicit choice */
    if (set && !isKokkos) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You can only choose kokkos", backendstr);
  #endif
  }
#endif
  /* PetscOptionsObject is declared by the PetscObjectOptionsBegin() macro above */
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
385 
386 /*@
387    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
388 
389    Logically Collective
390 
391    Input Arguments:
392 +  sf - star forest
393 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
394 
395    Level: advanced
396 
397 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
398 @*/
PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveBool(sf,flg,2);
  /* The multi-SF is built lazily on first gather/scatter; once it exists the ordering is baked in */
  if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
  sf->rankorder = flg;
  PetscFunctionReturn(0);
}
408 
409 /*@
410    PetscSFSetGraph - Set a parallel star forest
411 
412    Collective
413 
414    Input Arguments:
415 +  sf - star forest
416 .  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
417 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
418 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
419 during setup in debug mode)
420 .  localmode - copy mode for ilocal
421 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
422 during setup in debug mode)
423 -  remotemode - copy mode for iremote
424 
425    Level: intermediate
426 
427    Notes:
428     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
429 
430    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
431    encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
432    needed
433 
434    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
435    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
436 
437 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
438 @*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    /* Compute the leaf range and detect the identity permutation [0,nleaves) */
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* Identity ilocal encodes contiguous storage; discard it (freeing it if ownership was transferred) */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* Contiguous leaves: the range is simply [0,nleaves) */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  /* Copy, adopt, or alias ilocal according to localmode */
  if (ilocal) {
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      sf->mine_alloc = NULL; /* Caller keeps ownership; PetscSFReset() must not free it */
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same ownership handling for iremote */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
522 
523 /*@
524   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
525 
526   Collective
527 
528   Input Parameters:
529 + sf      - The PetscSF
530 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
531 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
532 
533   Notes:
534   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
535   n and N are local and global sizes of x respectively.
536 
537   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
538   sequential vectors y on all ranks.
539 
540   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
541   sequential vector y on rank 0.
542 
543   In above cases, entries of x are roots and entries of y are leaves.
544 
545   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
546   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
547   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one can not send multiple items. One just needs to create a new MPI datatype for the multiple data
549   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
550 
551   In this case, roots and leaves are symmetric.
552 
553   Level: intermediate
554  @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* map is ignored; build a layout with `size` roots per rank (size*size globally) */
    type = PETSCSFALLTOALL;
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      /* Varying local sizes require the "v" (vector) variants */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 has leaves for a gather */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
620 
621 /*@
622    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
623 
624    Collective
625 
626    Input Arguments:
627 
628 .  sf - star forest to invert
629 
630    Output Arguments:
631 .  isf - inverse of sf
632    Level: advanced
633 
634    Notes:
635    All roots must have degree 1.
636 
637    The local space may be a permutation, but cannot be sparse.
638 
639 .seealso: PetscSFSetGraph()
640 @*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Each leaf slot records its own (rank,index) so a reduce tells each root who points at it */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* Roots start at (-1,-1); any root left untouched by the reduce has no leaf */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* Every root was hit: the inverse has contiguous leaves */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* newilocal ownership transfers to the new SF; roots is copied, then both temporaries are freed together */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
690 
691 /*@
692    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
693 
694    Collective
695 
696    Input Arguments:
697 +  sf - communication object to duplicate
698 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
699 
700    Output Arguments:
701 .  newsf - new communication object
702 
703    Level: beginner
704 
705 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
706 @*/
PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
{
  PetscSFType    type;
  PetscErrorCode ierr;
  MPI_Datatype   dtype=MPIU_SCALAR;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,opt,2);
  PetscValidPointer(newsf,3);
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
  ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
  if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf,1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graph: deep-copy ilocal/iremote into the new SF */
      PetscInt          nroots,nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
      ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
    } else {
      /* Patterned graph: rebuild from the layout and pattern */
      ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
    }
  }
  /* Since oldtype is committed, so is newtype, according to MPI */
  if (sf->vscat.bs > 1) {ierr = MPI_Type_dup(sf->vscat.unit,&dtype);CHKERRQ(ierr);}
  (*newsf)->vscat.bs     = sf->vscat.bs;
  (*newsf)->vscat.unit   = dtype;
  (*newsf)->vscat.to_n   = sf->vscat.to_n;
  (*newsf)->vscat.from_n = sf->vscat.from_n;
  /* Do not copy lsf. Build it on demand since it is rarely used */

#if defined(PETSC_HAVE_DEVICE)
  /* Carry over the device configuration */
  (*newsf)->backend              = sf->backend;
  (*newsf)->use_default_stream   = sf->use_default_stream;
  (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
  (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
#endif
  if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
  /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
  PetscFunctionReturn(0);
}
750 
751 /*@C
752    PetscSFGetGraph - Get the graph specifying a parallel star forest
753 
754    Not Collective
755 
756    Input Arguments:
757 .  sf - star forest
758 
759    Output Arguments:
760 +  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
761 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
762 .  ilocal - locations of leaves in leafdata buffers
763 -  iremote - remote locations of root vertices for each leaf on the current process
764 
765    Notes:
766    We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet
767 
768    When called from Fortran, the returned iremote array is a copy and must be deallocated after use. Consequently, if you
769    want to update the graph, you must call PetscSFSetGraph after modifying the iremote array.
770 
771    Level: intermediate
772 
773 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
774 @*/
775 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
776 {
777   PetscErrorCode ierr;
778 
779   PetscFunctionBegin;
780   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
781   if (sf->ops->GetGraph) {
782     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
783   } else {
784     if (nroots) *nroots = sf->nroots;
785     if (nleaves) *nleaves = sf->nleaves;
786     if (ilocal) *ilocal = sf->mine;
787     if (iremote) *iremote = sf->remote;
788   }
789   PetscFunctionReturn(0);
790 }
791 
792 /*@
793    PetscSFGetLeafRange - Get the active leaf ranges
794 
795    Not Collective
796 
797    Input Arguments:
798 .  sf - star forest
799 
800    Output Arguments:
801 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
802 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
803 
804    Level: developer
805 
806 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
807 @*/
808 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
809 {
810   PetscFunctionBegin;
811   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
812   PetscSFCheckGraphSet(sf,1);
813   if (minleaf) *minleaf = sf->minleaf;
814   if (maxleaf) *maxleaf = sf->maxleaf;
815   PetscFunctionReturn(0);
816 }
817 
818 /*@C
819    PetscSFViewFromOptions - View from Options
820 
821    Collective on PetscSF
822 
823    Input Parameters:
824 +  A - the star forest
825 .  obj - Optional object
826 -  name - command line option
827 
828    Level: intermediate
829 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
830 @*/
831 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
832 {
833   PetscErrorCode ierr;
834 
835   PetscFunctionBegin;
836   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
837   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
838   PetscFunctionReturn(0);
839 }
840 
/*@C
   PetscSFView - view a star forest

   Collective

   Input Arguments:
+  sf - star forest
-  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD

   Level: beginner

.seealso: PetscSFCreate(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to stdout on the SF's communicator when no viewer is supplied */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Setting up builds the per-rank tables (sf->ranks/roffset/rmine/rremote) printed below */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    /* Let the implementation print type-specific state first */
    if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        /* Early exit: pop the tab pushed above so the viewer state stays balanced */
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      /* Per-rank summary, then one line per leaf edge: leaf <- (owner rank, root index) */
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        /* sf->mine may be NULL, meaning leaves are the contiguous range 0..nleaves-1 */
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Print edges grouped by remote rank in ascending rank order; perm[] maps the
           sorted position back to the original index into roffset/rmine/rremote */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          /* roffset[] is CSR-style: edges for rank i occupy [roffset[i],roffset[i+1]) */
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
911 
912 /*@C
913    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
914 
915    Not Collective
916 
917    Input Arguments:
918 .  sf - star forest
919 
920    Output Arguments:
921 +  nranks - number of ranks referenced by local part
922 .  ranks - array of ranks
923 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
924 .  rmine - concatenated array holding local indices referencing each remote rank
925 -  rremote - concatenated array holding remote indices referenced for each remote rank
926 
927    Level: developer
928 
929 .seealso: PetscSFGetLeafRanks()
930 @*/
931 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
932 {
933   PetscErrorCode ierr;
934 
935   PetscFunctionBegin;
936   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
937   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
938   if (sf->ops->GetRootRanks) {
939     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
940   } else {
941     /* The generic implementation */
942     if (nranks)  *nranks  = sf->nranks;
943     if (ranks)   *ranks   = sf->ranks;
944     if (roffset) *roffset = sf->roffset;
945     if (rmine)   *rmine   = sf->rmine;
946     if (rremote) *rremote = sf->rremote;
947   }
948   PetscFunctionReturn(0);
949 }
950 
951 /*@C
952    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
953 
954    Not Collective
955 
956    Input Arguments:
957 .  sf - star forest
958 
959    Output Arguments:
960 +  niranks - number of leaf ranks referencing roots on this process
961 .  iranks - array of ranks
962 .  ioffset - offset in irootloc for each rank (length niranks+1)
963 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
964 
965    Level: developer
966 
967 .seealso: PetscSFGetRootRanks()
968 @*/
969 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
970 {
971   PetscErrorCode ierr;
972 
973   PetscFunctionBegin;
974   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
975   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
976   if (sf->ops->GetLeafRanks) {
977     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
978   } else {
979     PetscSFType type;
980     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
981     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
982   }
983   PetscFunctionReturn(0);
984 }
985 
986 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
987   PetscInt i;
988   for (i=0; i<n; i++) {
989     if (needle == list[i]) return PETSC_TRUE;
990   }
991   return PETSC_FALSE;
992 }
993 
/*@C
   PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.

   Collective

   Input Arguments:
+  sf - PetscSF to set up; PetscSFSetGraph() must have been called
-  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)

   Level: developer

.seealso: PetscSFGetRootRanks()
@*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
  /* Count, per remote rank, how many of our leaves reference a root on that rank */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  /* rmine/rremote are sized by nleaves: every leaf edge appears exactly once across all ranks */
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  /* Drain the table into parallel arrays: ranks[i] with its leaf count rcount[i] */
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate dgroup's rank numbering into the SF communicator's numbering */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRQ(ierr);}
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished.
     Two-pointer scheme: sf->ndranks advances from the front, i retreats from the back,
     swapping misplaced pairs until the pointers meet */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i;) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for (; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition by rank so PetscFindMPIInt() below can binary-search within it */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Build CSR-style offsets: leaves for rank i occupy [roffset[i],roffset[i+1]) in rmine/rremote */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0; /* reset: reused below as a per-rank fill cursor */
  }
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks: first among the distinguished
         ranks, then among the rest (each partition is sorted separately) */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    /* sf->mine may be NULL, meaning leaves are the contiguous range 0..nleaves-1 */
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1097 
/*@C
   PetscSFGetGroups - gets incoming and outgoing process groups

   Collective

   Input Argument:
.  sf - star forest

   Output Arguments:
+  incoming - group of origin processes for incoming edges (leaves that reference my roots)
-  outgoing - group of destination processes for outgoing edges (roots that I reference)

   Level: developer

.seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
@*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  /* Groups are built lazily and cached on the SF (sf->ingroup / sf->outgroup) */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks */
    /* Build an auxiliary SF with one local root and one leaf per outgoing rank;
       its root degree equals the number of ranks that reference this process */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
    /* Enumerate the incoming ranks */
    /* Each process gathers the rank id of every process that points at it */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) {
    /* The outgoing group is simply the ranks this process references (sf->ranks) */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRQ(ierr);
    ierr = MPI_Group_free(&group);CHKERRQ(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1160 
1161 /*@
1162    PetscSFGetMultiSF - gets the inner SF implemeting gathers and scatters
1163 
1164    Collective
1165 
1166    Input Argument:
1167 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1168 
1169    Output Arguments:
1170 .  multi - star forest with split roots, such that each root has degree exactly 1
1171 
1172    Level: developer
1173 
1174    Notes:
1175 
1176    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1177    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1178    edge, it is a candidate for future optimization that might involve its removal.
1179 
1180 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1181 @*/
1182 PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
1183 {
1184   PetscErrorCode ierr;
1185 
1186   PetscFunctionBegin;
1187   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1188   PetscValidPointer(multi,2);
1189   if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
1190     ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1191     *multi = sf->multi;
1192     PetscFunctionReturn(0);
1193   }
1194   if (!sf->multi) {
1195     const PetscInt *indegree;
1196     PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
1197     PetscSFNode    *remote;
1198     maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
1199     ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
1200     ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
1201     ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
1202     inoffset[0] = 0;
1203     for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
1204     for (i=0; i<maxlocal; i++) outones[i] = 1;
1205     ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1206     ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1207     for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
1208     if (PetscDefined(USE_DEBUG)) { /* Check that the expected number of increments occurred */
1209       for (i=0; i<sf->nroots; i++) {
1210         if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
1211       }
1212     }
1213     ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
1214     for (i=0; i<sf->nleaves; i++) {
1215       remote[i].rank  = sf->remote[i].rank;
1216       remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
1217     }
1218     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1219     ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1220     if (sf->rankorder) {        /* Sort the ranks */
1221       PetscMPIInt rank;
1222       PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
1223       PetscSFNode *newremote;
1224       ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
1225       for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
1226       ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
1227       for (i=0; i<maxlocal; i++) outranks[i] = rank;
1228       ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
1229       ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
1230       /* Sort the incoming ranks at each vertex, build the inverse map */
1231       for (i=0; i<sf->nroots; i++) {
1232         PetscInt j;
1233         for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
1234         ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
1235         for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
1236       }
1237       ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
1238       ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
1239       ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
1240       for (i=0; i<sf->nleaves; i++) {
1241         newremote[i].rank  = sf->remote[i].rank;
1242         newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
1243       }
1244       ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1245       ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
1246     }
1247     ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
1248   }
1249   *multi = sf->multi;
1250   PetscFunctionReturn(0);
1251 }
1252 
1253 /*@C
1254    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1255 
1256    Collective
1257 
1258    Input Arguments:
1259 +  sf - original star forest
1260 .  nselected  - number of selected roots on this process
1261 -  selected   - indices of the selected roots on this process
1262 
1263    Output Arguments:
1264 .  esf - new star forest
1265 
1266    Level: advanced
1267 
1268    Note:
1269    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1270    be done by calling PetscSFGetGraph().
1271 
1272 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1273 @*/
1274 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *esf)
1275 {
1276   PetscInt          i,j,n,nroots,nleaves,esf_nleaves,*new_ilocal,minleaf,maxleaf,maxlocal;
1277   const PetscInt    *ilocal;
1278   signed char       *rootdata,*leafdata,*leafmem;
1279   const PetscSFNode *iremote;
1280   PetscSFNode       *new_iremote;
1281   MPI_Comm          comm;
1282   PetscErrorCode    ierr;
1283 
1284   PetscFunctionBegin;
1285   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1286   PetscSFCheckGraphSet(sf,1);
1287   if (nselected) PetscValidPointer(selected,3);
1288   PetscValidPointer(esf,4);
1289 
1290   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1291   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1292   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1293   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1294 
1295   if (PetscDefined(USE_DEBUG)) {  /* Error out if selected[] has dups or  out of range indices */
1296     PetscBool dups;
1297     ierr = PetscCheckDupsInt(nselected,selected,&dups);CHKERRQ(ierr);
1298     if (dups) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"selected[] has dups");
1299     for (i=0; i<nselected; i++)
1300       if (selected[i] < 0 || selected[i] >= nroots) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"selected root indice %D is out of [0,%D)",selected[i],nroots);
1301   }
1302 
1303   if (sf->ops->CreateEmbeddedSF) {
1304     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,selected,esf);CHKERRQ(ierr);
1305   } else {
1306     /* A generic version of creating embedded sf */
1307     ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
1308     maxlocal = maxleaf - minleaf + 1;
1309     ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafmem);CHKERRQ(ierr);
1310     leafdata = leafmem - minleaf;
1311     /* Tag selected roots and bcast to leaves */
1312     for (i=0; i<nselected; i++) rootdata[selected[i]] = 1;
1313     ierr = PetscSFBcastBegin(sf,MPI_SIGNED_CHAR,rootdata,leafdata);CHKERRQ(ierr);
1314     ierr = PetscSFBcastEnd(sf,MPI_SIGNED_CHAR,rootdata,leafdata);CHKERRQ(ierr);
1315 
1316     /* Build esf with leaves that are still connected */
1317     esf_nleaves = 0;
1318     for (i=0; i<nleaves; i++) {
1319       j = ilocal ? ilocal[i] : i;
1320       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1321          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1322       */
1323       esf_nleaves += (leafdata[j] ? 1 : 0);
1324     }
1325     ierr = PetscMalloc1(esf_nleaves,&new_ilocal);CHKERRQ(ierr);
1326     ierr = PetscMalloc1(esf_nleaves,&new_iremote);CHKERRQ(ierr);
1327     for (i=n=0; i<nleaves; i++) {
1328       j = ilocal ? ilocal[i] : i;
1329       if (leafdata[j]) {
1330         new_ilocal[n]        = j;
1331         new_iremote[n].rank  = iremote[i].rank;
1332         new_iremote[n].index = iremote[i].index;
1333         ++n;
1334       }
1335     }
1336     ierr = PetscSFCreate(comm,esf);CHKERRQ(ierr);
1337     ierr = PetscSFSetFromOptions(*esf);CHKERRQ(ierr);
1338     ierr = PetscSFSetGraph(*esf,nroots,esf_nleaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1339     ierr = PetscFree2(rootdata,leafmem);CHKERRQ(ierr);
1340   }
1341   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1342   PetscFunctionReturn(0);
1343 }
1344 
1345 /*@C
1346   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1347 
1348   Collective
1349 
1350   Input Arguments:
1351 + sf - original star forest
1352 . nselected  - number of selected leaves on this process
1353 - selected   - indices of the selected leaves on this process
1354 
1355   Output Arguments:
1356 .  newsf - new star forest
1357 
1358   Level: advanced
1359 
1360 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1361 @*/
1362 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1363 {
1364   const PetscSFNode *iremote;
1365   PetscSFNode       *new_iremote;
1366   const PetscInt    *ilocal;
1367   PetscInt          i,nroots,*leaves,*new_ilocal;
1368   MPI_Comm          comm;
1369   PetscErrorCode    ierr;
1370 
1371   PetscFunctionBegin;
1372   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1373   PetscSFCheckGraphSet(sf,1);
1374   if (nselected) PetscValidPointer(selected,3);
1375   PetscValidPointer(newsf,4);
1376 
1377   /* Uniq selected[] and put results in leaves[] */
1378   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1379   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1380   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1381   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1382   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1383 
1384   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1385   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1386     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1387   } else {
1388     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1389     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1390     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1391     for (i=0; i<nselected; ++i) {
1392       const PetscInt l     = leaves[i];
1393       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1394       new_iremote[i].rank  = iremote[l].rank;
1395       new_iremote[i].index = iremote[l].index;
1396     }
1397     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1398     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1399   }
1400   ierr = PetscFree(leaves);CHKERRQ(ierr);
1401   PetscFunctionReturn(0);
1402 }
1403 
1404 /*@C
1405    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastAndOpEnd()
1406 
1407    Collective on PetscSF
1408 
1409    Input Arguments:
1410 +  sf - star forest on which to communicate
1411 .  unit - data type associated with each node
1412 .  rootdata - buffer to broadcast
1413 -  op - operation to use for reduction
1414 
1415    Output Arguments:
1416 .  leafdata - buffer to be reduced with values from each leaf's respective root
1417 
1418    Level: intermediate
1419 
1420    Notes:
1421     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1422     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1423     use PetscSFBcastAndOpWithMemTypeBegin() instead.
1424 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd(), PetscSFBcastAndOpWithMemTypeBegin()
1425 @*/
1426 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1427 {
1428   PetscErrorCode ierr;
1429   PetscMemType   rootmtype,leafmtype;
1430 
1431   PetscFunctionBegin;
1432   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1433   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1434   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1435   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1436   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1437   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1438   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1439   PetscFunctionReturn(0);
1440 }
1441 
1442 /*@C
1443    PetscSFBcastAndOpWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call to PetscSFBcastAndOpEnd()
1444 
1445    Collective on PetscSF
1446 
1447    Input Arguments:
1448 +  sf - star forest on which to communicate
1449 .  unit - data type associated with each node
1450 .  rootmtype - memory type of rootdata
1451 .  rootdata - buffer to broadcast
1452 .  leafmtype - memory type of leafdata
1453 -  op - operation to use for reduction
1454 
1455    Output Arguments:
1456 .  leafdata - buffer to be reduced with values from each leaf's respective root
1457 
1458    Level: intermediate
1459 
1460 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd(),PetscSFBcastAndOpBegin()
1461 @*/
1462 PetscErrorCode PetscSFBcastAndOpWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
1463 {
1464   PetscErrorCode ierr;
1465 
1466   PetscFunctionBegin;
1467   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1468   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1469   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1470   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1471   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);}
1472   PetscFunctionReturn(0);
1473 }
1474 
/*@C
   PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  rootdata - buffer to broadcast
-  op - operation to use for reduction

   Output Arguments:
.  leafdata - buffer to be reduced with values from each leaf's respective root

   Level: intermediate

.seealso: PetscSFSetGraph(), PetscSFReduceEnd()
@*/
PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Skip SF-level event logging when this SF is driven by a VecScatter (sf->vscat.logging) */
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);}
  ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootdata,leafdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1504 
1505 /*@C
1506    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1507 
1508    Collective
1509 
1510    Input Arguments:
1511 +  sf - star forest
1512 .  unit - data type
1513 .  leafdata - values to reduce
1514 -  op - reduction operation
1515 
1516    Output Arguments:
1517 .  rootdata - result of reduction of values from all leaves of each root
1518 
1519    Level: intermediate
1520 
1521    Notes:
1522     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1523     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1524     use PetscSFReduceWithMemTypeBegin() instead.
1525 
1526 .seealso: PetscSFBcastBegin(), PetscSFReduceWithMemTypeBegin()
1527 @*/
1528 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1529 {
1530   PetscErrorCode ierr;
1531   PetscMemType   rootmtype,leafmtype;
1532 
1533   PetscFunctionBegin;
1534   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1535   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1536   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1537   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1538   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1539   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1540   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1541   PetscFunctionReturn(0);
1542 }
1543 
1544 /*@C
1545    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to PetscSFReduceEnd()
1546 
1547    Collective
1548 
1549    Input Arguments:
1550 +  sf - star forest
1551 .  unit - data type
1552 .  leafmtype - memory type of leafdata
1553 .  leafdata - values to reduce
1554 .  rootmtype - memory type of rootdata
1555 -  op - reduction operation
1556 
1557    Output Arguments:
1558 .  rootdata - result of reduction of values from all leaves of each root
1559 
1560    Level: intermediate
1561 
1562 .seealso: PetscSFBcastBegin(), PetscSFReduceBegin()
1563 @*/
1564 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
1565 {
1566   PetscErrorCode ierr;
1567 
1568   PetscFunctionBegin;
1569   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1570   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1571   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1572   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1573   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1574   PetscFunctionReturn(0);
1575 }
1576 
/*@C
   PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - values to reduce
-  op - reduction operation

   Output Arguments:
.  rootdata - result of reduction of values from all leaves of each root

   Level: intermediate

.seealso: PetscSFSetGraph(), PetscSFBcastEnd()
@*/
PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Skip SF-level event logging when this SF is driven by a VecScatter (sf->vscat.logging) */
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
  ierr = (*sf->ops->ReduceEnd)(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1606 
/*@C
   PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Arguments:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

   Note:
   The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
   might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
   not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
   integers.

.seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscMemType   rootmtype,leafmtype,leafupdatemtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  /* Determine where each user buffer lives (host or device) */
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
  /* The dispatch below passes a single leaf memory type, hence leafdata and leafupdate must agree */
  if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
  ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1649 
/*@C
   PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Arguments:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

.seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootdata,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1680 
/*@C
   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex; set to NULL here, the actual result becomes available only after PetscSFComputeDegreeEnd()

   Level: advanced

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal;
    /* sf->degree non-NULL means a Begin is already outstanding */
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    maxlocal = sf->maxleaf-sf->minleaf+1; /* size of the contiguous leaf index range */
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    /* Sum a 1 from every leaf into its root; the offset degreetmp-minleaf makes leaf index l address degreetmp[l-minleaf] */
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  *degree = NULL;
  PetscFunctionReturn(0);
}
1720 
/*@C
   PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()

   Collective

   Input Arguments:
.  sf - star forest

   Output Arguments:
.  degree - degree of each root vertex

   Level: developer

   Notes:
   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFComputeDegreeBegin()
@*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    /* Complete the MPI_SUM reduction started in PetscSFComputeDegreeBegin(); the same shifted buffer must be passed */
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    sf->degreeknown = PETSC_TRUE; /* cache the result; later Begin/End pairs return it without communication */
  }
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1756 
1757 
1758 /*@C
1759    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1760    Each multi-root is assigned index of the corresponding original root.
1761 
1762    Collective
1763 
1764    Input Arguments:
1765 +  sf - star forest
1766 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1767 
1768    Output Arguments:
1769 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1770 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1771 
1772    Level: developer
1773 
1774    Notes:
1775    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1776 
1777 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1778 @*/
1779 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1780 {
1781   PetscSF             msf;
1782   PetscInt            i, j, k, nroots, nmroots;
1783   PetscErrorCode      ierr;
1784 
1785   PetscFunctionBegin;
1786   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1787   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1788   if (nroots) PetscValidIntPointer(degree,2);
1789   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1790   PetscValidPointer(multiRootsOrigNumbering,4);
1791   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1792   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1793   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1794   for (i=0,j=0,k=0; i<nroots; i++) {
1795     if (!degree[i]) continue;
1796     for (j=0; j<degree[i]; j++,k++) {
1797       (*multiRootsOrigNumbering)[k] = i;
1798     }
1799   }
1800   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1801   if (nMultiRoots) *nMultiRoots = nmroots;
1802   PetscFunctionReturn(0);
1803 }
1804 
/*@C
   PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  leafdata - leaf data to gather to roots

   Output Argument:
.  multirootdata - root buffer to gather into, amount of space per root is equal to its degree

   Level: intermediate

.seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
@*/
PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Gather is a reduce with MPIU_REPLACE over the multi-SF, whose roots are duplicated per
     connected leaf, so each leaf lands in its own slot and no values are combined */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1834 
/*@C
   PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  leafdata - leaf data to gather to roots

   Output Argument:
.  multirootdata - root buffer to gather into, amount of space per root is equal to its degree

   Level: intermediate

.seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
@*/
PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Complete the MPIU_REPLACE reduce over the multi-SF started in PetscSFGatherBegin() */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1863 
/*@C
   PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeBegin(), PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Scatter is a broadcast over the multi-SF, whose roots are duplicated per connected leaf,
     so each leaf receives its own personal slot of the multi-root buffer */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1893 
/*@C
   PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()

   Collective

   Input Arguments:
+  sf - star forest
.  unit - data type
-  multirootdata - root buffer to send to each leaf, one unit of data per leaf

   Output Argument:
.  leafdata - leaf data to be updated with personal data from each respective root

   Level: intermediate

.seealso: PetscSFComputeDegreeEnd(), PetscSFGatherEnd()
@*/
PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSF        multi = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Complete the broadcast over the multi-SF started in PetscSFScatterBegin() */
  ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1922 
1923 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1924 {
1925   PetscInt        i, n, nleaves;
1926   const PetscInt *ilocal = NULL;
1927   PetscHSetI      seen;
1928   PetscErrorCode  ierr;
1929 
1930   PetscFunctionBegin;
1931   if (PetscDefined(USE_DEBUG)) {
1932     ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1933     ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1934     for (i = 0; i < nleaves; i++) {
1935       const PetscInt leaf = ilocal ? ilocal[i] : i;
1936       ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1937     }
1938     ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1939     if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1940     ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1941   }
1942   PetscFunctionReturn(0);
1943 }
1944 
1945 /*@
1946   PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view
1947 
1948   Input Parameters:
1949 + sfA - The first PetscSF
1950 - sfB - The second PetscSF
1951 
1952   Output Parameters:
1953 . sfBA - The composite SF
1954 
1955   Level: developer
1956 
1957   Notes:
1958   Currently, the two SFs must be defined on congruent communicators and they must be true star
1959   forests, i.e. the same leaf is not connected with different roots.
1960 
1961   sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
1962   a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
1963   nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
1964   Bcast on sfA, then a Bcast on sfB, on connected nodes.
1965 
1966 .seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
1967 @*/
1968 PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
1969 {
1970   PetscErrorCode    ierr;
1971   const PetscSFNode *remotePointsA,*remotePointsB;
1972   PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
1973   const PetscInt    *localPointsA,*localPointsB;
1974   PetscInt          *localPointsBA;
1975   PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
1976   PetscBool         denseB;
1977 
1978   PetscFunctionBegin;
1979   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
1980   PetscSFCheckGraphSet(sfA,1);
1981   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
1982   PetscSFCheckGraphSet(sfB,2);
1983   PetscCheckSameComm(sfA,1,sfB,2);
1984   PetscValidPointer(sfBA,3);
1985   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
1986   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
1987 
1988   ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
1989   ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
1990   if (localPointsA) {
1991     ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
1992     for (i=0; i<numRootsB; i++) {
1993       reorderedRemotePointsA[i].rank = -1;
1994       reorderedRemotePointsA[i].index = -1;
1995     }
1996     for (i=0; i<numLeavesA; i++) {
1997       if (localPointsA[i] >= numRootsB) continue;
1998       reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
1999     }
2000     remotePointsA = reorderedRemotePointsA;
2001   }
2002   ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
2003   ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
2004   ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
2005   ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
2006   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
2007 
2008   denseB = (PetscBool)!localPointsB;
2009   for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
2010     if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
2011     else numLeavesBA++;
2012   }
2013   if (denseB) {
2014     localPointsBA  = NULL;
2015     remotePointsBA = leafdataB;
2016   } else {
2017     ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
2018     ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
2019     for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
2020       const PetscInt l = localPointsB ? localPointsB[i] : i;
2021 
2022       if (leafdataB[l-minleaf].rank == -1) continue;
2023       remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
2024       localPointsBA[numLeavesBA] = l;
2025       numLeavesBA++;
2026     }
2027     ierr = PetscFree(leafdataB);CHKERRQ(ierr);
2028   }
2029   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
2030   ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
2031   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
2032   PetscFunctionReturn(0);
2033 }
2034 
2035 /*@
2036   PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one
2037 
2038   Input Parameters:
2039 + sfA - The first PetscSF
2040 - sfB - The second PetscSF
2041 
2042   Output Parameters:
2043 . sfBA - The composite SF.
2044 
2045   Level: developer
2046 
2047   Notes:
2048   Currently, the two SFs must be defined on congruent communicators and they must be true star
2049   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
2050   second SF must have a degree of 1, i.e., no roots have more than one leaf connected.
2051 
2052   sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
2053   a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
2054   roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
2055   a Reduce on sfB, on connected roots.
2056 
2057 .seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
2058 @*/
2059 PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
2060 {
2061   PetscErrorCode    ierr;
2062   const PetscSFNode *remotePointsA,*remotePointsB;
2063   PetscSFNode       *remotePointsBA;
2064   const PetscInt    *localPointsA,*localPointsB;
2065   PetscSFNode       *reorderedRemotePointsA = NULL;
2066   PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
2067   MPI_Op            op;
2068 #if defined(PETSC_USE_64BIT_INDICES)
2069   PetscBool         iswin;
2070 #endif
2071 
2072   PetscFunctionBegin;
2073   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
2074   PetscSFCheckGraphSet(sfA,1);
2075   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
2076   PetscSFCheckGraphSet(sfB,2);
2077   PetscCheckSameComm(sfA,1,sfB,2);
2078   PetscValidPointer(sfBA,3);
2079   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
2080   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
2081 
2082   ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
2083   ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);
2084 
2085   /* TODO: Check roots of sfB have degree of 1 */
2086   /* Once we implement it, we can replace the MPI_MAXLOC
2087      with MPIU_REPLACE. In that case, MPI_MAXLOC and MPIU_REPLACE have the same effect.
2088      We use MPI_MAXLOC only to have a deterministic output from this routine if
2089      the root condition is not meet.
2090    */
2091   op = MPI_MAXLOC;
2092 #if defined(PETSC_USE_64BIT_INDICES)
2093   /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
2094   ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
2095   if (iswin) op = MPIU_REPLACE;
2096 #endif
2097 
2098   ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
2099   ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
2100   for (i=0; i<maxleaf - minleaf + 1; i++) {
2101     reorderedRemotePointsA[i].rank = -1;
2102     reorderedRemotePointsA[i].index = -1;
2103   }
2104   if (localPointsA) {
2105     for (i=0; i<numLeavesA; i++) {
2106       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
2107       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
2108     }
2109   } else {
2110     for (i=0; i<numLeavesA; i++) {
2111       if (i > maxleaf || i < minleaf) continue;
2112       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
2113     }
2114   }
2115 
2116   ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
2117   ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
2118   for (i=0; i<numRootsB; i++) {
2119     remotePointsBA[i].rank = -1;
2120     remotePointsBA[i].index = -1;
2121   }
2122 
2123   ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
2124   ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
2125   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
2126   for (i=0,numLeavesBA=0; i<numRootsB; i++) {
2127     if (remotePointsBA[i].rank == -1) continue;
2128     remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
2129     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
2130     localPointsBA[numLeavesBA] = i;
2131     numLeavesBA++;
2132   }
2133   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
2134   ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
2135   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
2136   PetscFunctionReturn(0);
2137 }
2138 
/*
  PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF

  Input Parameters:
. sf - The global PetscSF

  Output Parameters:
. out - The local PetscSF
 */
PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
{
  MPI_Comm           comm;
  PetscMPIInt        myrank;
  const PetscInt     *ilocal;
  const PetscSFNode  *iremote;
  PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
  PetscSFNode        *liremote;
  PetscSF            lsf;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (sf->ops->CreateLocalSF) {
    /* Let the implementation build the local SF if it provides its own routine */
    ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
  } else {
    /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
    ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(comm,&myrank);CHKERRQ(ierr);

    /* Find out local edges and build a local SF */
    ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
    /* First pass: count edges whose root lives on this rank */
    for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
    ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
    ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);

    /* Second pass: copy the intra-process edges, remapping the owning rank to 0 of PETSC_COMM_SELF */
    for (i=j=0; i<nleaves; i++) {
      if (iremote[i].rank == (PetscInt)myrank) {
        lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
        liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
        liremote[j].index = iremote[i].index;
        j++;
      }
    }
    ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
    ierr = PetscSFSetFromOptions(lsf);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
    *out = lsf;
  }
  PetscFunctionReturn(0);
}
2190 
/* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
{
  PetscErrorCode     ierr;
  PetscMemType       rootmtype,leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* NOTE(review): reuses the PETSCSF_BcastAndOpBegin event for the whole operation; apparently there is no dedicated event for this private routine */
  ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  /* BcastToZero is an optional type-specific operation; fail cleanly when the implementation lacks it */
  if (sf->ops->BcastToZero) {
    ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
  ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2209 
2210