xref: /petsc/src/vec/is/sf/interface/sf.c (revision e1b7d5bfea5a7bdb65ba2886c620bc0183e850fe)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petsc/private/viewerimpl.h>
4 #include <petscctable.h>
5 
6 #if defined(PETSC_HAVE_CUDA)
7   #include <cuda_runtime.h>
8 #endif
9 
10 #if defined(PETSC_HAVE_HIP)
11   #include <hip/hip_runtime.h>
12 #endif
13 
#if defined(PETSC_USE_DEBUG)
/* Debug-only guard: error out if PetscSFSetGraph()/PetscSFSetGraphWithPattern() has not been
   called on (sf); compiles to a no-op in optimized builds (see #else branch below) */
#  define PetscSFCheckGraphSet(sf,arg) do {                          \
    if (PetscUnlikely(!(sf)->graphset))                              \
      SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
  } while (0)
#else
#  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
#endif

/* String names for PetscSFDuplicateOption values, in enum order, for options processing */
const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",NULL};
24 
/* Classify the memory space of a pointer: sets *type to PETSC_MEMTYPE_DEVICE when the CUDA or
   HIP runtime reports device memory, otherwise PETSC_MEMTYPE_HOST (the default, also used when
   data is NULL or no device has been initialized) */
PETSC_STATIC_INLINE PetscErrorCode PetscGetMemType(const void *data,PetscMemType *type)
{
  PetscFunctionBegin;
  PetscValidPointer(type,2);
  *type = PETSC_MEMTYPE_HOST;
#if defined(PETSC_HAVE_CUDA)
  if (PetscDeviceInitialized(PETSC_DEVICE_CUDA) && data) {
    cudaError_t                  cerr;
    struct cudaPointerAttributes attr;
    enum cudaMemoryType          mtype;
    cerr = cudaPointerGetAttributes(&attr,data); /* Do not check error since before CUDA 11.0, passing a host pointer returns cudaErrorInvalidValue */
    cudaGetLastError(); /* Reset the last error */
    #if (CUDART_VERSION < 10000)
      mtype = attr.memoryType; /* the attribute field was renamed to 'type' in CUDA 10 */
    #else
      mtype = attr.type;
    #endif
    if (cerr == cudaSuccess && mtype == cudaMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif

#if defined(PETSC_HAVE_HIP)
  if (PetscDeviceInitialized(PETSC_DEVICE_HIP) && data) {
    hipError_t                   cerr;
    struct hipPointerAttribute_t attr;
    enum hipMemoryType           mtype;
    cerr = hipPointerGetAttributes(&attr,data); /* error deliberately not checked, mirroring the CUDA path above */
    hipGetLastError(); /* Reset the last error */
    mtype = attr.memoryType;
    if (cerr == hipSuccess && mtype == hipMemoryTypeDevice) *type = PETSC_MEMTYPE_DEVICE;
  }
#endif
  PetscFunctionReturn(0);
}
59 
60 /*@
61    PetscSFCreate - create a star forest communication context
62 
63    Collective
64 
65    Input Parameter:
66 .  comm - communicator on which the star forest will operate
67 
68    Output Parameter:
69 .  sf - new star forest context
70 
71    Options Database Keys:
72 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
73 .  -sf_type window    -Use MPI-3 one-sided window for communication
74 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
75 
76    Level: intermediate
77 
78    Notes:
79    When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
80    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
81    SFs are optimized and they have better performance than general SFs.
82 
83 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
84 @*/
85 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
86 {
87   PetscErrorCode ierr;
88   PetscSF        b;
89 
90   PetscFunctionBegin;
91   PetscValidPointer(sf,2);
92   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
93 
94   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
95 
96   b->nroots    = -1;
97   b->nleaves   = -1;
98   b->minleaf   = PETSC_MAX_INT;
99   b->maxleaf   = PETSC_MIN_INT;
100   b->nranks    = -1;
101   b->rankorder = PETSC_TRUE;
102   b->ingroup   = MPI_GROUP_NULL;
103   b->outgroup  = MPI_GROUP_NULL;
104   b->graphset  = PETSC_FALSE;
105 #if defined(PETSC_HAVE_DEVICE)
106   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
107   b->use_stream_aware_mpi = PETSC_FALSE;
108   b->unknown_input_stream= PETSC_FALSE;
109   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
110     b->backend = PETSCSF_BACKEND_KOKKOS;
111   #elif defined(PETSC_HAVE_CUDA)
112     b->backend = PETSCSF_BACKEND_CUDA;
113   #elif defined(PETSC_HAVE_HIP)
114     b->backend = PETSCSF_BACKEND_HIP;
115   #endif
116 
117   #if defined(PETSC_HAVE_NVSHMEM)
118     b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
119     b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
120     ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&b->use_nvshmem,NULL);CHKERRQ(ierr);
121     ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem_get",&b->use_nvshmem_get,NULL);CHKERRQ(ierr);
122   #endif
123 #endif
124   b->vscat.from_n = -1;
125   b->vscat.to_n   = -1;
126   b->vscat.unit   = MPIU_SCALAR;
127  *sf = b;
128   PetscFunctionReturn(0);
129 }
130 
/*@
   PetscSFReset - Reset a star forest so that different sizes or neighbors can be used

   Collective

   Input Parameter:
.  sf - star forest

   Level: advanced

.seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
@*/
PetscErrorCode PetscSFReset(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Let the implementation release its own state before the generic fields are torn down */
  if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
  /* Return the graph description to its "not set" state, matching PetscSFCreate() */
  sf->nroots   = -1;
  sf->nleaves  = -1;
  sf->minleaf  = PETSC_MAX_INT;
  sf->maxleaf  = PETSC_MIN_INT;
  sf->mine     = NULL;
  sf->remote   = NULL;
  sf->graphset = PETSC_FALSE;
  ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
  ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
  sf->nranks = -1;
  ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
  sf->degreeknown = PETSC_FALSE;
  ierr = PetscFree(sf->degree);CHKERRQ(ierr);
  if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRMPI(ierr);}
  if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRMPI(ierr);}
  /* NOTE(review): clearing sf->multi->multi before destroying presumably breaks a self/back
     reference so the multi-SF is not reset recursively -- confirm against PetscSFGetMultiSF() */
  if (sf->multi) sf->multi->multi = NULL;
  ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);

 #if defined(PETSC_HAVE_DEVICE)
  /* Release the cached device-side copies of the leaf index arrays */
  for (PetscInt i=0; i<2; i++) {ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,sf->rmine_d[i]);CHKERRQ(ierr);}
 #endif

  sf->setupcalled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
176 
177 /*@C
178    PetscSFSetType - Set the PetscSF communication implementation
179 
180    Collective on PetscSF
181 
182    Input Parameters:
183 +  sf - the PetscSF context
184 -  type - a known method
185 
186    Options Database Key:
187 .  -sf_type <type> - Sets the method; use -help for a list
188    of available methods (for instance, window, basic, neighbor)
189 
190    Notes:
191    See "include/petscsf.h" for available methods (for instance)
192 +    PETSCSFWINDOW - MPI-2/3 one-sided
193 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
194 
195   Level: intermediate
196 
197 .seealso: PetscSFType, PetscSFCreate()
198 @*/
199 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
200 {
201   PetscErrorCode ierr,(*r)(PetscSF);
202   PetscBool      match;
203 
204   PetscFunctionBegin;
205   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
206   PetscValidCharPointer(type,2);
207 
208   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
209   if (match) PetscFunctionReturn(0);
210 
211   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
212   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
213   /* Destroy the previous PetscSF implementation context */
214   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
215   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
216   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
217   ierr = (*r)(sf);CHKERRQ(ierr);
218   PetscFunctionReturn(0);
219 }
220 
/*@C
  PetscSFGetType - Get the PetscSF communication implementation

  Not Collective

  Input Parameter:
. sf  - the PetscSF context

  Output Parameter:
. type - the PetscSF type name

  Level: intermediate

.seealso: PetscSFSetType(), PetscSFCreate()
@*/
PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
  PetscValidPointer(type,2);
  /* Returns the name stored on the base object; NULL if PetscSFSetType() has not been called */
  *type = ((PetscObject)sf)->type_name;
  PetscFunctionReturn(0);
}
244 
/*@C
   PetscSFDestroy - destroy star forest

   Collective

   Input Parameter:
.  sf - address of star forest

   Level: intermediate

.seealso: PetscSFCreate(), PetscSFReset()
@*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0);
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Only drop our reference while other holders remain */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  ierr = PetscSFReset(*sf);CHKERRQ(ierr); /* free graph and communication state first */
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);} /* then implementation-specific state */
  ierr = PetscSFDestroy(&(*sf)->vscat.lsf);CHKERRQ(ierr);
  /* vscat.unit is a duplicated datatype only when bs > 1 (see PetscSFDuplicate()), so only free it then */
  if ((*sf)->vscat.bs > 1) {ierr = MPI_Type_free(&(*sf)->vscat.unit);CHKERRMPI(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
272 
273 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
274 {
275   PetscInt           i, nleaves;
276   PetscMPIInt        size;
277   const PetscInt    *ilocal;
278   const PetscSFNode *iremote;
279   PetscErrorCode     ierr;
280 
281   PetscFunctionBegin;
282   if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
283   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
284   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
285   for (i = 0; i < nleaves; i++) {
286     const PetscInt rank = iremote[i].rank;
287     const PetscInt remote = iremote[i].index;
288     const PetscInt leaf = ilocal ? ilocal[i] : i;
289     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
290     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
291     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
292   }
293   PetscFunctionReturn(0);
294 }
295 
296 /*@
297    PetscSFSetUp - set up communication structures
298 
299    Collective
300 
301    Input Parameter:
302 .  sf - star forest communication object
303 
304    Level: beginner
305 
306 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
307 @*/
308 PetscErrorCode PetscSFSetUp(PetscSF sf)
309 {
310   PetscErrorCode ierr;
311 
312   PetscFunctionBegin;
313   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
314   PetscSFCheckGraphSet(sf,1);
315   if (sf->setupcalled) PetscFunctionReturn(0);
316   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
317   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
318   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);} /* Zero all sf->ops */
319   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
320 #if defined(PETSC_HAVE_CUDA)
321   if (sf->backend == PETSCSF_BACKEND_CUDA) {
322     sf->ops->Malloc = PetscSFMalloc_CUDA;
323     sf->ops->Free   = PetscSFFree_CUDA;
324   }
325 #endif
326 #if defined(PETSC_HAVE_HIP)
327   if (sf->backend == PETSCSF_BACKEND_HIP) {
328     sf->ops->Malloc = PetscSFMalloc_HIP;
329     sf->ops->Free   = PetscSFFree_HIP;
330   }
331 #endif
332 
333 #
334 #if defined(PETSC_HAVE_KOKKOS)
335   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
336     sf->ops->Malloc = PetscSFMalloc_Kokkos;
337     sf->ops->Free   = PetscSFFree_Kokkos;
338   }
339 #endif
340   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
341   sf->setupcalled = PETSC_TRUE;
342   PetscFunctionReturn(0);
343 }
344 
/*@
   PetscSFSetFromOptions - set PetscSF options using the options database

   Logically Collective

   Input Parameter:
.  sf - star forest

   Options Database Keys:
+  -sf_type                  - implementation type, see PetscSFSetType()
.  -sf_rank_order            - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
.  -sf_unknown_input_stream  - Assume the SF root/leafdata is computed on arbitrary streams unknown to SF (default: false)
.  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
                               If true, this option only works with -use_gpu_aware_mpi 1.

-  -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently SF has these backends: cuda, hip and Kokkos.
                              On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
                              the only available is kokkos.

   Level: intermediate
@*/
PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
{
  PetscSFType    deft;
  char           type[256];
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
  /* Default to the current type if one is already set, otherwise to basic */
  deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
  ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
  ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
 #if defined(PETSC_HAVE_DEVICE)
  {
    char        backendstr[32] = {0};
    PetscBool   isCuda = PETSC_FALSE,isHip = PETSC_FALSE,isKokkos = PETSC_FALSE,set;
    /* Change the defaults set in PetscSFCreate() with command line options */
    ierr = PetscOptionsBool("-sf_unknown_input_stream","SF root/leafdata is computed on arbitary streams unknown to SF","PetscSFSetFromOptions",sf->unknown_input_stream,&sf->unknown_input_stream,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-sf_use_stream_aware_mpi","Assume the underlying MPI is cuda-stream aware","PetscSFSetFromOptions",sf->use_stream_aware_mpi,&sf->use_stream_aware_mpi,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsString("-sf_backend","Select the device backend SF uses","PetscSFSetFromOptions",NULL,backendstr,sizeof(backendstr),&set);CHKERRQ(ierr);
    /* Case-insensitive match of the backend name against the supported backends */
    ierr = PetscStrcasecmp("cuda",backendstr,&isCuda);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("kokkos",backendstr,&isKokkos);CHKERRQ(ierr);
    ierr = PetscStrcasecmp("hip",backendstr,&isHip);CHKERRQ(ierr);
  #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
    if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
    else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
    else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
    else if (set) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
  #elif defined(PETSC_HAVE_KOKKOS)
    /* Kokkos-only builds: kokkos is the sole valid backend, so only reject other choices */
    if (set && !isKokkos) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You can only choose kokkos", backendstr);
   #endif
  }
 #endif
  if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
407 
408 /*@
409    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
410 
411    Logically Collective
412 
413    Input Parameters:
414 +  sf - star forest
415 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
416 
417    Level: advanced
418 
419 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
420 @*/
421 PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
422 {
423   PetscFunctionBegin;
424   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
425   PetscValidLogicalCollectiveBool(sf,flg,2);
426   if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
427   sf->rankorder = flg;
428   PetscFunctionReturn(0);
429 }
430 
/*@C
   PetscSFSetGraph - Set a parallel star forest

   Collective

   Input Parameters:
+  sf - star forest
.  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
.  nleaves - number of leaf vertices on the current process, each of these references a root on any process
.  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
during setup in debug mode)
.  localmode - copy mode for ilocal
.  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
during setup in debug mode)
-  remotemode - copy mode for iremote

   Level: intermediate

   Notes:
    In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode

   Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
   encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
   needed

   Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
   indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).

.seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
@*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    /* Compute the leaf range in one pass and detect whether ilocal is the identity permutation */
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* Identity ilocal encodes contiguous storage: drop it, and free it when we would have owned it */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* NULL ilocal (or no leaves): leaves are contiguous at [0,nleaves) */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  if (ilocal) {
    switch (localmode) {
    case PETSC_COPY_VALUES: /* take a private copy that sf owns */
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER: /* take ownership of the caller's array */
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER: /* borrow; caller retains ownership, so mine_alloc stays NULL */
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same three copy-mode semantics for the remote (root) locations */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
544 
/*@
  PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern

  Collective

  Input Parameters:
+ sf      - The PetscSF
. map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
- pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL

  Notes:
  It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
  n and N are local and global sizes of x respectively.

  With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
  sequential vectors y on all ranks.

  With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
  sequential vector y on rank 0.

  In above cases, entries of x are roots and entries of y are leaves.

  With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
  creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
  of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
  not mean one cannot send multiple items. One just needs to create a new MPI datatype for the multiple data
  items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.

  In this case, roots and leaves are symmetric.

  Level: intermediate
 @*/
PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
{
  MPI_Comm       comm;
  PetscInt       n,N,res[2];
  PetscMPIInt    rank,size;
  PetscSFType    type;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
  if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);

  if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    /* Alltoall ignores the caller's map: build a layout with exactly `size` roots per rank */
    type = PETSCSFALLTOALL;
    ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
    ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
    ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
  } else {
    ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
    ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
    res[0] = n;
    res[1] = -n;
    /* Check if n are same over all ranks so that we can optimize it */
    ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRMPI(ierr);
    if (res[0] == -res[1]) { /* same n */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
    } else {
      /* Uneven local sizes require the "v" (variable-count) implementations */
      type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
    }
    ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
  }
  ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);

  sf->pattern = pattern;
  sf->mine    = NULL; /* Contiguous */

  /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
     Also set other easy stuff.
   */
  if (pattern == PETSCSF_PATTERN_ALLGATHER) {
    sf->nleaves      = N;
    sf->nroots       = n;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = N - 1;
  } else if (pattern == PETSCSF_PATTERN_GATHER) {
    /* Only rank 0 receives leaves with the gather pattern */
    sf->nleaves      = rank ? 0 : N;
    sf->nroots       = n;
    sf->nranks       = rank ? 0 : size;
    sf->minleaf      = 0;
    sf->maxleaf      = rank ? -1 : N - 1;
  } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
    sf->nleaves      = size;
    sf->nroots       = size;
    sf->nranks       = size;
    sf->minleaf      = 0;
    sf->maxleaf      = size - 1;
  }
  sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
642 
/*@
   PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map

   Collective

   Input Parameter:
.  sf - star forest to invert

   Output Parameter:
.  isf - inverse of sf

   Level: advanced

   Notes:
   All roots must have degree 1.

   The local space may be a permutation, but cannot be sparse.

.seealso: PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
  const PetscInt *ilocal;
  PetscSFNode    *roots,*leaves;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(isf,2);

  ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
  maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */

  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
  ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
  /* Each leaf advertises its own (rank,index) so roots learn where their leaf lives */
  for (i=0; i<maxlocal; i++) {
    leaves[i].rank  = rank;
    leaves[i].index = i;
  }
  /* (-1,-1) marks roots that receive no leaf; they remain -1 after the reduce */
  for (i=0; i <nroots; i++) {
    roots[i].rank  = -1;
    roots[i].index = -1;
  }
  /* Reduce (rank,index) pairs leaf->root; MPI_REPLACE is valid since every root has degree <= 1 */
  ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPI_REPLACE);CHKERRQ(ierr);

  /* Check whether our leaves are sparse */
  for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
  if (count == nroots) newilocal = NULL; /* every root was hit: inverse leaves are contiguous */
  else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
    ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
    for (i=0,count=0; i<nroots; i++) {
      if (roots[i].rank >= 0) {
        newilocal[count]   = i;
        roots[count].rank  = roots[i].rank;
        roots[count].index = roots[i].index;
        count++;
      }
    }
  }

  /* The inverse SF has maxlocal roots (old leaf space) and `count` leaves (old root space) */
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
  ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
712 
/*@
   PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph

   Collective

   Input Parameters:
+  sf - communication object to duplicate
-  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)

   Output Parameter:
.  newsf - new communication object

   Level: beginner

.seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
{
  PetscSFType    type;
  PetscErrorCode ierr;
  MPI_Datatype   dtype=MPIU_SCALAR;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidLogicalCollectiveEnum(sf,opt,2);
  PetscValidPointer(newsf,3);
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
  ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
  if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
  if (opt == PETSCSF_DUPLICATE_GRAPH) {
    PetscSFCheckGraphSet(sf,1);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* General graph: deep-copy the leaf and remote arrays */
      PetscInt          nroots,nleaves;
      const PetscInt    *ilocal;
      const PetscSFNode *iremote;
      ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
      ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
    } else {
      /* Patterned graph: recreate from the stored layout and pattern */
      ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
    }
  }
  /* Since oldtype is committed, so is newtype, according to MPI */
  if (sf->vscat.bs > 1) {ierr = MPI_Type_dup(sf->vscat.unit,&dtype);CHKERRMPI(ierr);}
  (*newsf)->vscat.bs     = sf->vscat.bs;
  (*newsf)->vscat.unit   = dtype;
  (*newsf)->vscat.to_n   = sf->vscat.to_n;
  (*newsf)->vscat.from_n = sf->vscat.from_n;
  /* Do not copy lsf. Build it on demand since it is rarely used */

#if defined(PETSC_HAVE_DEVICE)
  /* Carry over the device configuration chosen on the original */
  (*newsf)->backend              = sf->backend;
  (*newsf)->unknown_input_stream= sf->unknown_input_stream;
  (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
  (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
#endif
  if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
  /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
  PetscFunctionReturn(0);
}
772 
/*@C
   PetscSFGetGraph - Get the graph specifying a parallel star forest

   Not Collective

   Input Parameter:
.  sf - star forest

   Output Parameters:
+  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
.  nleaves - number of leaf vertices on the current process, each of these references a root on any process
.  ilocal - locations of leaves in leafdata buffers (if returned value is NULL, it means leaves are in contiguous storage)
-  iremote - remote locations of root vertices for each leaf on the current process

   Notes:
     We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet

   Fortran Notes:
     The returned iremote array is a copy and must be deallocated after use. Consequently, if you
     want to update the graph, you must call PetscSFSetGraph() after modifying the iremote array.

     To check for a NULL ilocal use
$      if (loc(ilocal) == loc(PETSC_NULL_INTEGER)) then

   Level: intermediate

.seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (sf->ops->GetGraph) {
    /* Let the implementation provide the graph (e.g. patterned SFs synthesize it) */
    ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
  } else {
    /* Each output is optional; NULL means the caller does not want that value */
    if (nroots) *nroots = sf->nroots;
    if (nleaves) *nleaves = sf->nleaves;
    if (ilocal) *ilocal = sf->mine;
    if (iremote) *iremote = sf->remote;
  }
  PetscFunctionReturn(0);
}
817 
818 /*@
819    PetscSFGetLeafRange - Get the active leaf ranges
820 
821    Not Collective
822 
823    Input Parameter:
824 .  sf - star forest
825 
826    Output Parameters:
827 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
828 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
829 
830    Level: developer
831 
832 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
833 @*/
834 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
835 {
836   PetscFunctionBegin;
837   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
838   PetscSFCheckGraphSet(sf,1);
839   if (minleaf) *minleaf = sf->minleaf;
840   if (maxleaf) *maxleaf = sf->maxleaf;
841   PetscFunctionReturn(0);
842 }
843 
844 /*@C
845    PetscSFViewFromOptions - View from Options
846 
847    Collective on PetscSF
848 
849    Input Parameters:
850 +  A - the star forest
851 .  obj - Optional object
852 -  name - command line option
853 
854    Level: intermediate
855 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
856 @*/
857 PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
858 {
859   PetscErrorCode ierr;
860 
861   PetscFunctionBegin;
862   PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
863   ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 
867 /*@C
868    PetscSFView - view a star forest
869 
870    Collective
871 
872    Input Parameters:
873 +  sf - star forest
874 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
875 
876    Level: beginner
877 
878 .seealso: PetscSFCreate(), PetscSFSetGraph()
879 @*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to the stdout viewer on the SF's communicator when none is supplied */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Set up first so the per-rank tables (sf->ranks, sf->roffset, sf->rmine, sf->rremote) printed below exist */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  /* Generic ASCII output; MATLAB-format output is left to the implementation's View op below */
  if (iascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      /* Without a graph there is nothing more to print; pop the tab before the early return */
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      /* Synchronized mode collects per-rank output in rank order */
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      /* One line per leaf: local leaf index <- (owning rank, root index) */
      for (i=0; i<sf->nleaves; i++) {
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Sort a copy of the rank list (with a permutation) so the detailed edge listing
           comes out in increasing-rank order, independent of the internal rank ordering */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          /* roffset[i+1]-roffset[i] is the number of edges this process has to rank sf->ranks[i] */
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  /* Give the implementation a chance to print type-specific information */
  if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
937 
938 /*@C
939    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
940 
941    Not Collective
942 
943    Input Parameter:
944 .  sf - star forest
945 
946    Output Parameters:
947 +  nranks - number of ranks referenced by local part
948 .  ranks - array of ranks
949 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
950 .  rmine - concatenated array holding local indices referencing each remote rank
951 -  rremote - concatenated array holding remote indices referenced for each remote rank
952 
953    Level: developer
954 
955 .seealso: PetscSFGetLeafRanks()
956 @*/
957 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
958 {
959   PetscErrorCode ierr;
960 
961   PetscFunctionBegin;
962   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
963   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
964   if (sf->ops->GetRootRanks) {
965     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
966   } else {
967     /* The generic implementation */
968     if (nranks)  *nranks  = sf->nranks;
969     if (ranks)   *ranks   = sf->ranks;
970     if (roffset) *roffset = sf->roffset;
971     if (rmine)   *rmine   = sf->rmine;
972     if (rremote) *rremote = sf->rremote;
973   }
974   PetscFunctionReturn(0);
975 }
976 
977 /*@C
978    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
979 
980    Not Collective
981 
982    Input Parameter:
983 .  sf - star forest
984 
985    Output Parameters:
986 +  niranks - number of leaf ranks referencing roots on this process
987 .  iranks - array of ranks
988 .  ioffset - offset in irootloc for each rank (length niranks+1)
989 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
990 
991    Level: developer
992 
993 .seealso: PetscSFGetRootRanks()
994 @*/
995 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
996 {
997   PetscErrorCode ierr;
998 
999   PetscFunctionBegin;
1000   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1001   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
1002   if (sf->ops->GetLeafRanks) {
1003     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
1004   } else {
1005     PetscSFType type;
1006     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
1007     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
1008   }
1009   PetscFunctionReturn(0);
1010 }
1011 
1012 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
1013   PetscInt i;
1014   for (i=0; i<n; i++) {
1015     if (needle == list[i]) return PETSC_TRUE;
1016   }
1017   return PETSC_FALSE;
1018 }
1019 
1020 /*@C
1021    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.
1022 
1023    Collective
1024 
1025    Input Parameters:
1026 +  sf - PetscSF to set up; PetscSFSetGraph() must have been called
1027 -  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)
1028 
1029    Level: developer
1030 
1031 .seealso: PetscSFGetRootRanks()
1032 @*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
  /* Count, per remote rank, how many of our leaves reference it, using a hash table keyed by rank */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  /* rmine/rremote are sized by nleaves: each leaf contributes exactly one entry, grouped by rank */
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    /* Translate the distinguished group's members into rank numbers of sf's communicator,
       stored in groupranks[] for the membership tests below */
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRMPI(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRMPI(ierr);}
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  /* Two-pointer swap pass: ndranks advances over distinguished ranks at the front,
     i retreats over non-distinguished ranks at the back; mismatched pairs are swapped */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i;) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for (; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition separately so both halves are searchable with PetscFindMPIInt below */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Build CSR-style offsets into rmine/rremote; rcount[] is reset to serve as a per-rank fill cursor */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Fill rmine (local leaf indices) and rremote (root indices), grouped by remote rank.
     orank caches the last rank looked up so consecutive leaves to the same rank skip the search */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1123 
1124 /*@C
1125    PetscSFGetGroups - gets incoming and outgoing process groups
1126 
1127    Collective
1128 
1129    Input Parameter:
1130 .  sf - star forest
1131 
1132    Output Parameters:
1133 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1134 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1135 
1136    Level: developer
1137 
1138 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
1139 @*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  /* sf->nranks is set by PetscSFSetUpRanks(); a negative value means setup has not run */
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  /* The incoming group is built lazily and cached in sf->ingroup */
  if (sf->ingroup == MPI_GROUP_NULL) {
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks */
    /* Auxiliary SF: one root per process; each of our nranks referenced ranks gets one leaf,
       so the root degree equals the number of processes whose leaves reference our roots */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
    /* Enumerate the incoming ranks */
    /* Each process sends its own rank up to the roots it references; gathering at our root
       collects the ranks of all processes that reference us */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  /* The outgoing group is just the ranks our leaves reference, already cached in sf->ranks */
  if (sf->outgroup == MPI_GROUP_NULL) {
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1186 
1187 /*@
1188    PetscSFGetMultiSF - gets the inner SF implementing gathers and scatters
1189 
1190    Collective
1191 
1192    Input Parameter:
1193 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1194 
1195    Output Parameter:
1196 .  multi - star forest with split roots, such that each root has degree exactly 1
1197 
1198    Level: developer
1199 
1200    Notes:
1201 
1202    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1203    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1204    edge, it is a candidate for future optimization that might involve its removal.
1205 
1206 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1207 @*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    sf->multi->multi = sf->multi; /* the multi-SF is its own multi-SF */
    PetscFunctionReturn(0);
  }
  /* Build and cache the multi-SF on first use */
  if (!sf->multi) {
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    /* inoffset[] is the prefix sum of root degrees: root i's split copies occupy
       multi-root slots [inoffset[i], inoffset[i+1]) */
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    /* Fetch-and-add of 1 from every leaf: each leaf atomically claims a distinct slot
       in its root's range; the claimed slot comes back in outoffset[] */
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
    if (PetscDefined(USE_DEBUG)) { /* Check that the expected number of increments occurred */
      for (i=0; i<sf->nroots; i++) {
        if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
      }
    }
    /* Same leaves as sf, but each now points at its private multi-root slot */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    sf->multi->multi = sf->multi;
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      /* Send each leaf's owning rank to its multi-root slot so roots learn who claimed each slot */
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPI_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPI_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Broadcast each leaf's new (rank-ordered) slot index back to the leaves */
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset,MPI_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset,MPI_REPLACE);CHKERRQ(ierr);
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1280 
1281 /*@C
1282    PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots, does not remap indices
1283 
1284    Collective
1285 
1286    Input Parameters:
1287 +  sf - original star forest
1288 .  nselected  - number of selected roots on this process
1289 -  selected   - indices of the selected roots on this process
1290 
1291    Output Parameter:
1292 .  esf - new star forest
1293 
1294    Level: advanced
1295 
1296    Note:
1297    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1298    be done by calling PetscSFGetGraph().
1299 
1300 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1301 @*/
1302 PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *esf)
1303 {
1304   PetscInt          i,j,n,nroots,nleaves,esf_nleaves,*new_ilocal,minleaf,maxleaf,maxlocal;
1305   const PetscInt    *ilocal;
1306   signed char       *rootdata,*leafdata,*leafmem;
1307   const PetscSFNode *iremote;
1308   PetscSFNode       *new_iremote;
1309   MPI_Comm          comm;
1310   PetscErrorCode    ierr;
1311 
1312   PetscFunctionBegin;
1313   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1314   PetscSFCheckGraphSet(sf,1);
1315   if (nselected) PetscValidPointer(selected,3);
1316   PetscValidPointer(esf,4);
1317 
1318   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1319   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1320   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1321   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1322 
1323   if (PetscDefined(USE_DEBUG)) {  /* Error out if selected[] has dups or  out of range indices */
1324     PetscBool dups;
1325     ierr = PetscCheckDupsInt(nselected,selected,&dups);CHKERRQ(ierr);
1326     if (dups) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"selected[] has dups");
1327     for (i=0; i<nselected; i++)
1328       if (selected[i] < 0 || selected[i] >= nroots) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"selected root indice %D is out of [0,%D)",selected[i],nroots);
1329   }
1330 
1331   if (sf->ops->CreateEmbeddedRootSF) {
1332     ierr = (*sf->ops->CreateEmbeddedRootSF)(sf,nselected,selected,esf);CHKERRQ(ierr);
1333   } else {
1334     /* A generic version of creating embedded sf */
1335     ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
1336     maxlocal = maxleaf - minleaf + 1;
1337     ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafmem);CHKERRQ(ierr);
1338     leafdata = leafmem - minleaf;
1339     /* Tag selected roots and bcast to leaves */
1340     for (i=0; i<nselected; i++) rootdata[selected[i]] = 1;
1341     ierr = PetscSFBcastBegin(sf,MPI_SIGNED_CHAR,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1342     ierr = PetscSFBcastEnd(sf,MPI_SIGNED_CHAR,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1343 
1344     /* Build esf with leaves that are still connected */
1345     esf_nleaves = 0;
1346     for (i=0; i<nleaves; i++) {
1347       j = ilocal ? ilocal[i] : i;
1348       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1349          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1350       */
1351       esf_nleaves += (leafdata[j] ? 1 : 0);
1352     }
1353     ierr = PetscMalloc1(esf_nleaves,&new_ilocal);CHKERRQ(ierr);
1354     ierr = PetscMalloc1(esf_nleaves,&new_iremote);CHKERRQ(ierr);
1355     for (i=n=0; i<nleaves; i++) {
1356       j = ilocal ? ilocal[i] : i;
1357       if (leafdata[j]) {
1358         new_ilocal[n]        = j;
1359         new_iremote[n].rank  = iremote[i].rank;
1360         new_iremote[n].index = iremote[i].index;
1361         ++n;
1362       }
1363     }
1364     ierr = PetscSFCreate(comm,esf);CHKERRQ(ierr);
1365     ierr = PetscSFSetFromOptions(*esf);CHKERRQ(ierr);
1366     ierr = PetscSFSetGraph(*esf,nroots,esf_nleaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1367     ierr = PetscFree2(rootdata,leafmem);CHKERRQ(ierr);
1368   }
1369   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1370   PetscFunctionReturn(0);
1371 }
1372 
1373 /*@C
1374   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1375 
1376   Collective
1377 
1378   Input Parameters:
1379 + sf - original star forest
1380 . nselected  - number of selected leaves on this process
1381 - selected   - indices of the selected leaves on this process
1382 
1383   Output Parameter:
1384 .  newsf - new star forest
1385 
1386   Level: advanced
1387 
1388 .seealso: PetscSFCreateEmbeddedRootSF(), PetscSFSetGraph(), PetscSFGetGraph()
1389 @*/
1390 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1391 {
1392   const PetscSFNode *iremote;
1393   PetscSFNode       *new_iremote;
1394   const PetscInt    *ilocal;
1395   PetscInt          i,nroots,*leaves,*new_ilocal;
1396   MPI_Comm          comm;
1397   PetscErrorCode    ierr;
1398 
1399   PetscFunctionBegin;
1400   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1401   PetscSFCheckGraphSet(sf,1);
1402   if (nselected) PetscValidPointer(selected,3);
1403   PetscValidPointer(newsf,4);
1404 
1405   /* Uniq selected[] and put results in leaves[] */
1406   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1407   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1408   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1409   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1410   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1411 
1412   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1413   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1414     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1415   } else {
1416     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1417     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1418     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1419     for (i=0; i<nselected; ++i) {
1420       const PetscInt l     = leaves[i];
1421       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1422       new_iremote[i].rank  = iremote[l].rank;
1423       new_iremote[i].index = iremote[l].index;
1424     }
1425     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1426     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1427   }
1428   ierr = PetscFree(leaves);CHKERRQ(ierr);
1429   PetscFunctionReturn(0);
1430 }
1431 
1432 /*@C
1433    PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastEnd()
1434 
1435    Collective on PetscSF
1436 
1437    Input Parameters:
1438 +  sf - star forest on which to communicate
1439 .  unit - data type associated with each node
1440 .  rootdata - buffer to broadcast
1441 -  op - operation to use for reduction
1442 
1443    Output Parameter:
1444 .  leafdata - buffer to be reduced with values from each leaf's respective root
1445 
1446    Level: intermediate
1447 
1448    Notes:
1449     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1450     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1451     use PetscSFBcastWithMemTypeBegin() instead.
1452 .seealso: PetscSFBcastEnd(), PetscSFBcastWithMemTypeBegin()
1453 @*/
1454 PetscErrorCode PetscSFBcastBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1455 {
1456   PetscErrorCode ierr;
1457   PetscMemType   rootmtype,leafmtype;
1458 
1459   PetscFunctionBegin;
1460   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1461   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1462   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
1463   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1464   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1465   ierr = (*sf->ops->BcastBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1466   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
1467   PetscFunctionReturn(0);
1468 }
1469 
1470 /*@C
1471    PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call to PetscSFBcastEnd()
1472 
1473    Collective on PetscSF
1474 
1475    Input Parameters:
1476 +  sf - star forest on which to communicate
1477 .  unit - data type associated with each node
1478 .  rootmtype - memory type of rootdata
1479 .  rootdata - buffer to broadcast
1480 .  leafmtype - memory type of leafdata
1481 -  op - operation to use for reduction
1482 
1483    Output Parameter:
1484 .  leafdata - buffer to be reduced with values from each leaf's respective root
1485 
1486    Level: intermediate
1487 
1488 .seealso: PetscSFBcastEnd(), PetscSFBcastBegin()
1489 @*/
PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Same event as PetscSFBcastBegin(); suppressed when driven internally by VecScatter */
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
  /* Caller supplied the memory types, so no PetscGetMemType() queries are needed here */
  ierr = (*sf->ops->BcastBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1502 
1503 /*@C
1504    PetscSFBcastEnd - end a broadcast & reduce operation started with PetscSFBcastBegin()
1505 
1506    Collective
1507 
1508    Input Parameters:
1509 +  sf - star forest
1510 .  unit - data type
1511 .  rootdata - buffer to broadcast
1512 -  op - operation to use for reduction
1513 
1514    Output Parameter:
1515 .  leafdata - buffer to be reduced with values from each leaf's respective root
1516 
1517    Level: intermediate
1518 
1519 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1520 @*/
PetscErrorCode PetscSFBcastEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastEnd,sf,0,0,0);CHKERRQ(ierr);}
  /* Completes the communication started by the matching *BcastBegin; memory types were
     recorded by the implementation at Begin time, so they are not passed again here */
  ierr = (*sf->ops->BcastEnd)(sf,unit,rootdata,leafdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastEnd,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1532 
1533 /*@C
1534    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1535 
1536    Collective
1537 
1538    Input Parameters:
1539 +  sf - star forest
1540 .  unit - data type
1541 .  leafdata - values to reduce
1542 -  op - reduction operation
1543 
1544    Output Parameter:
1545 .  rootdata - result of reduction of values from all leaves of each root
1546 
1547    Level: intermediate
1548 
1549    Notes:
1550     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1551     are host pointers or device pointers, which may incur a noticeable cost. If you already know this information, you should
1552     use PetscSFReduceWithMemTypeBegin() instead.
1553 
1554 .seealso: PetscSFBcastBegin(), PetscSFReduceWithMemTypeBegin()
1555 @*/
1556 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1557 {
1558   PetscErrorCode ierr;
1559   PetscMemType   rootmtype,leafmtype;
1560 
1561   PetscFunctionBegin;
1562   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1563   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1564   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1565   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1566   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1567   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1568   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1569   PetscFunctionReturn(0);
1570 }
1571 
1572 /*@C
1573    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to PetscSFReduceEnd()
1574 
1575    Collective
1576 
1577    Input Parameters:
1578 +  sf - star forest
1579 .  unit - data type
1580 .  leafmtype - memory type of leafdata
1581 .  leafdata - values to reduce
1582 .  rootmtype - memory type of rootdata
1583 -  op - reduction operation
1584 
1585    Output Parameter:
1586 .  rootdata - result of reduction of values from all leaves of each root
1587 
1588    Level: intermediate
1589 
1590 .seealso: PetscSFBcastBegin(), PetscSFReduceBegin()
1591 @*/
1592 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
1593 {
1594   PetscErrorCode ierr;
1595 
1596   PetscFunctionBegin;
1597   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1598   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1599   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1600   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1601   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1602   PetscFunctionReturn(0);
1603 }
1604 
1605 /*@C
1606    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1607 
1608    Collective
1609 
1610    Input Parameters:
1611 +  sf - star forest
1612 .  unit - data type
1613 .  leafdata - values to reduce
1614 -  op - reduction operation
1615 
1616    Output Parameter:
1617 .  rootdata - result of reduction of values from all leaves of each root
1618 
1619    Level: intermediate
1620 
1621 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1622 @*/
PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
  /* Completes the reduction started by the matching *ReduceBegin call */
  ierr = (*sf->ops->ReduceEnd)(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
  if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
1634 
1635 /*@C
1636    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1637 
1638    Collective
1639 
1640    Input Parameters:
1641 +  sf - star forest
1642 .  unit - data type
1643 .  leafdata - leaf values to use in reduction
1644 -  op - operation to use for reduction
1645 
1646    Output Parameters:
1647 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1648 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1649 
1650    Level: advanced
1651 
1652    Note:
1653    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1654    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1655    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1656    integers.
1657 
1658 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1659 @*/
PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscMemType   rootmtype,leafmtype,leafupdatemtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  /* Auto-detect host/device residence for all three buffers */
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
  /* Implementations take a single leaf memory type, so leafdata and leafupdate must reside
     in the same kind of memory */
  if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
  ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1677 
1678 /*@C
1679    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1680 
1681    Collective
1682 
1683    Input Parameters:
1684 +  sf - star forest
1685 .  unit - data type
1686 .  leafdata - leaf values to use in reduction
1687 -  op - operation to use for reduction
1688 
1689    Output Parameters:
1690 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1691 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1692 
1693    Level: advanced
1694 
1695 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1696 @*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  /* Completes the fetch-and-op started by the matching PetscSFFetchAndOpBegin() */
  ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootdata,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1708 
1709 /*@C
1710    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()
1711 
1712    Collective
1713 
1714    Input Parameter:
1715 .  sf - star forest
1716 
1717    Output Parameter:
1718 .  degree - degree of each root vertex
1719 
1720    Level: advanced
1721 
1722    Notes:
1723    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1724 
1725 .seealso: PetscSFGatherBegin()
1726 @*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  /* Result is cached on the SF; recompute only on the first call */
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal;
    /* sf->degree doubles as the in-progress flag: non-NULL here means a Begin without End */
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    maxlocal = sf->maxleaf-sf->minleaf+1;
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    /* Degree == number of leaves attached to each root == sum of 1 over all leaves;
       degreetmp is shifted by minleaf because leaf buffers are indexed from minleaf */
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  /* Split-phase API: the result is only available after PetscSFComputeDegreeEnd() */
  *degree = NULL;
  PetscFunctionReturn(0);
}
1748 
1749 /*@C
1750    PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()
1751 
1752    Collective
1753 
1754    Input Parameter:
1755 .  sf - star forest
1756 
1757    Output Parameter:
1758 .  degree - degree of each root vertex
1759 
1760    Level: developer
1761 
1762    Notes:
1763    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1764 
1765 .seealso: PetscSFComputeDegreeBegin()
1766 @*/
PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    /* degreetmp was allocated (with at least one entry) by PetscSFComputeDegreeBegin() */
    if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
    ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
    /* Cache the result; subsequent Begin/End pairs return it without communication */
    sf->degreeknown = PETSC_TRUE;
  }
  /* Array is owned by the SF and freed in PetscSFDestroy(); callers must not free it */
  *degree = sf->degree;
  PetscFunctionReturn(0);
}
1784 
1785 /*@C
1786    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1787    Each multi-root is assigned index of the corresponding original root.
1788 
1789    Collective
1790 
1791    Input Parameters:
1792 +  sf - star forest
1793 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1794 
1795    Output Parameters:
1796 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1797 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1798 
1799    Level: developer
1800 
1801    Notes:
1802    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1803 
1804 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1805 @*/
1806 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1807 {
1808   PetscSF             msf;
1809   PetscInt            i, j, k, nroots, nmroots;
1810   PetscErrorCode      ierr;
1811 
1812   PetscFunctionBegin;
1813   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1814   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1815   if (nroots) PetscValidIntPointer(degree,2);
1816   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1817   PetscValidPointer(multiRootsOrigNumbering,4);
1818   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1819   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1820   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1821   for (i=0,j=0,k=0; i<nroots; i++) {
1822     if (!degree[i]) continue;
1823     for (j=0; j<degree[i]; j++,k++) {
1824       (*multiRootsOrigNumbering)[k] = i;
1825     }
1826   }
1827   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1828   if (nMultiRoots) *nMultiRoots = nmroots;
1829   PetscFunctionReturn(0);
1830 }
1831 
1832 /*@C
1833    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1834 
1835    Collective
1836 
1837    Input Parameters:
1838 +  sf - star forest
1839 .  unit - data type
1840 -  leafdata - leaf data to gather to roots
1841 
1842    Output Parameter:
1843 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1844 
1845    Level: intermediate
1846 
1847 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1848 @*/
1849 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1850 {
1851   PetscErrorCode ierr;
1852   PetscSF        multi = NULL;
1853 
1854   PetscFunctionBegin;
1855   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1856   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1857   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1858   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPI_REPLACE);CHKERRQ(ierr);
1859   PetscFunctionReturn(0);
1860 }
1861 
1862 /*@C
1863    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1864 
1865    Collective
1866 
1867    Input Parameters:
1868 +  sf - star forest
1869 .  unit - data type
1870 -  leafdata - leaf data to gather to roots
1871 
1872    Output Parameter:
1873 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1874 
1875    Level: intermediate
1876 
1877 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1878 @*/
1879 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1880 {
1881   PetscErrorCode ierr;
1882   PetscSF        multi = NULL;
1883 
1884   PetscFunctionBegin;
1885   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1886   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1887   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPI_REPLACE);CHKERRQ(ierr);
1888   PetscFunctionReturn(0);
1889 }
1890 
1891 /*@C
1892    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1893 
1894    Collective
1895 
1896    Input Parameters:
1897 +  sf - star forest
1898 .  unit - data type
1899 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1900 
1901    Output Parameter:
1902 .  leafdata - leaf data to be updated with personal data from each respective root
1903 
1904    Level: intermediate
1905 
1906 .seealso: PetscSFComputeDegreeBegin(), PetscSFGatherBegin(), PetscSFScatterEnd()
1907 @*/
1908 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1909 {
1910   PetscErrorCode ierr;
1911   PetscSF        multi = NULL;
1912 
1913   PetscFunctionBegin;
1914   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1915   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1916   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1917   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1918   PetscFunctionReturn(0);
1919 }
1920 
1921 /*@C
1922    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1923 
1924    Collective
1925 
1926    Input Parameters:
1927 +  sf - star forest
1928 .  unit - data type
1929 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1930 
1931    Output Parameter:
1932 .  leafdata - leaf data to be updated with personal data from each respective root
1933 
1934    Level: intermediate
1935 
1936 .seealso: PetscSFComputeDegreeEnd(), PetscSFGatherEnd(), PetscSFScatterBegin()
1937 @*/
1938 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1939 {
1940   PetscErrorCode ierr;
1941   PetscSF        multi = NULL;
1942 
1943   PetscFunctionBegin;
1944   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1945   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1946   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1947   PetscFunctionReturn(0);
1948 }
1949 
/* Debug-only sanity check that no leaf index appears twice in the SF graph.
   A no-op in optimized builds. Errors with PETSC_ERR_ARG_OUTOFRANGE on duplicates. */
static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
{
  PetscInt        i, n, nleaves;
  const PetscInt *ilocal = NULL;
  PetscHSetI      seen;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
    /* Insert every leaf index into a hash set; duplicates collapse to one entry */
    ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
    for (i = 0; i < nleaves; i++) {
      const PetscInt leaf = ilocal ? ilocal[i] : i; /* NULL ilocal means contiguous leaves 0..nleaves-1 */
      ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
    }
    /* If any leaf repeated, the set is smaller than the leaf count */
    ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
    if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
    ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1971 
1972 /*@
1973   PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view
1974 
1975   Input Parameters:
1976 + sfA - The first PetscSF
1977 - sfB - The second PetscSF
1978 
1979   Output Parameters:
1980 . sfBA - The composite SF
1981 
1982   Level: developer
1983 
1984   Notes:
1985   Currently, the two SFs must be defined on congruent communicators and they must be true star
1986   forests, i.e. the same leaf is not connected with different roots.
1987 
1988   sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
1989   a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
1990   nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
1991   Bcast on sfA, then a Bcast on sfB, on connected nodes.
1992 
1993 .seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
1994 @*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  /* Composition assumes true star forests: each leaf has a single root (debug-only check) */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  /* Scatter sfA's remote points into sfB's root numbering. Slots with rank == -1 mark
     sfB roots that are not leaves of sfA, i.e. not reachable from sfA's roots. */
  if (localPointsA) {
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue; /* leaf of sfA outside sfB's root space: no path */
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA;
  }
  /* Broadcast (root of sfA)-addresses through sfB so each sfB leaf learns which sfA root
     (if any) reaches it; leafdataB is indexed from minleaf, hence the pointer shift */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* Count connected leaves, and detect whether the result is dense (contiguous leaves
     0..numLeavesBA-1 with every leaf connected), in which case ilocal can stay NULL */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    localPointsBA  = NULL;
    remotePointsBA = leafdataB; /* ownership of leafdataB passes to the new SF below */
  } else {
    /* Compact: keep only leaves that received a valid (rank != -1) sfA root address */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  /* PETSC_OWN_POINTER: the new SF takes ownership of localPointsBA/remotePointsBA */
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2061 
2062 /*@
2063   PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one
2064 
2065   Input Parameters:
2066 + sfA - The first PetscSF
2067 - sfB - The second PetscSF
2068 
2069   Output Parameters:
2070 . sfBA - The composite SF.
2071 
2072   Level: developer
2073 
2074   Notes:
2075   Currently, the two SFs must be defined on congruent communicators and they must be true star
2076   forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
2077   second SF must have a degree of 1, i.e., no roots have more than one leaf connected.
2078 
2079   sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
2080   a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
2081   roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
2082   a Reduce on sfB, on connected roots.
2083 
2084 .seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
2085 @*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
  MPI_Op            op;
#if defined(PETSC_USE_64BIT_INDICES)
  PetscBool         iswin;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  /* Both operands must be true star forests (debug-only check) */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);

  /* TODO: Check roots of sfB have degree of 1 */
  /* Once we implement it, we can replace the MPI_MAXLOC
     with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
     We use MPI_MAXLOC only to have a deterministic output from this routine if
     the root condition is not met.
   */
  op = MPI_MAXLOC;
#if defined(PETSC_USE_64BIT_INDICES)
  /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
  ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
  if (iswin) op = MPI_REPLACE;
#endif

  /* Lay out sfA's remote points over sfB's leaf range [minleaf,maxleaf]; slots with
     rank == -1 mark sfB leaves that are not leaves of sfA (no path through them) */
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue; /* sfA leaf outside sfB's leaf range */
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* NULL localPointsA: sfA's leaves are the contiguous range 0..numLeavesA-1 */
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  /* Pre-fill with rank == -1 so roots of sfB that receive nothing are recognizable below */
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Reduce (leaf -> root over sfB) the sfA root addresses; with degree-1 roots exactly one
     leaf contributes per root, so op effectively acts as a copy (see comment on op above) */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place: keep only sfB roots that received a valid sfA root address */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  /* PETSC_OWN_POINTER: the new SF takes ownership of localPointsBA/remotePointsBA */
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2165 
2166 /*
2167   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
2168 
2169   Input Parameters:
2170 . sf - The global PetscSF
2171 
2172   Output Parameters:
2173 . out - The local PetscSF
2174  */
PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
{
  MPI_Comm           comm;
  PetscMPIInt        myrank;
  const PetscInt     *ilocal;
  const PetscSFNode  *iremote;
  PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
  PetscSFNode        *liremote;
  PetscSF            lsf;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Prefer an implementation-specific fast path when the SF type provides one */
  if (sf->ops->CreateLocalSF) {
    ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
  } else {
    /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
    ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(comm,&myrank);CHKERRMPI(ierr);

    /* Find out local edges and build a local SF */
    ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
    /* First pass: count edges whose root lives on this rank */
    for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
    ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
    ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);

    /* Second pass: copy the local edges, remapping the root rank to 0 (PETSC_COMM_SELF) */
    for (i=j=0; i<nleaves; i++) {
      if (iremote[i].rank == (PetscInt)myrank) {
        lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
        liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
        liremote[j].index = iremote[i].index;
        j++;
      }
    }
    ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
    ierr = PetscSFSetFromOptions(lsf);CHKERRQ(ierr);
    /* PETSC_OWN_POINTER: lsf takes ownership of lilocal/liremote */
    ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
    *out = lsf;
  }
  PetscFunctionReturn(0);
}
2217 
2218 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
/* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
{
  PetscErrorCode     ierr;
  PetscMemType       rootmtype,leafmtype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* Logged under the generic Bcast event; there is no dedicated BcastToZero event */
  ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);
  /* Auto-detect host/device residence of the user buffers */
  ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
  ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
  /* BcastToZero is an optional ops-table entry; unlike Bcast there is no generic fallback */
  if (sf->ops->BcastToZero) {
    ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
  ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2236 
2237