
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

/*===================================================================================*/
/*              Internal routines for PetscSFPack                              */
/*===================================================================================*/

/* Return the root and leaf MPI requests for communication in the given direction. Since we use
   persistent requests, initialize them first if they have not been initialized yet.
*/
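/* For reference, a minimal sketch (not part of the library) of the persistent-request pattern
   this routine relies on, in plain MPI. The names buf, N, src, tag, comm and niter are
   placeholders supplied by the caller:

     int         buf[N];                              // placeholder receive buffer
     MPI_Request req;
     MPI_Recv_init(buf,N,MPI_INT,src,tag,comm,&req);  // create the persistent request once
     for (int it=0; it<niter; it++) {
       MPI_Start(&req);                               // start one round of communication
       MPI_Wait(&req,MPI_STATUS_IGNORE);              // complete it; req remains reusable
     }
     MPI_Request_free(&req);                          // free it only when done for good
 */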
static PetscErrorCode PetscSFPackGetReqs_Basic(PetscSF sf,PetscSFPack link,PetscSFDirection direction,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j,nrootranks,ndrootranks,nleafranks,ndleafranks,disp;
  const PetscInt *rootoffset,*leafoffset;
  PetscMPIInt    n;
  MPI_Comm       comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype   unit = link->unit;
  PetscMemType   rootmtype,leafmtype;

  PetscFunctionBegin;
  if (use_gpu_aware_mpi) {
    rootmtype = link->rootmtype;
    leafmtype = link->leafmtype;
  } else {
    rootmtype = PETSC_MEMTYPE_HOST;
    leafmtype = PETSC_MEMTYPE_HOST;
  }

  if (rootreqs && !link->rootreqsinited[direction][rootmtype]) {
    ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
    if (direction == PETSCSF_LEAF2ROOT_REDUCE) {
      for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
        ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
        ierr = MPI_Recv_init(link->rootbuf[rootmtype]+disp,n,unit,bas->iranks[i],link->tag,comm,&link->rootreqs[direction][rootmtype][j]);CHKERRQ(ierr);
      }
    } else if (direction == PETSCSF_ROOT2LEAF_BCAST) {
      for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
        ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
        ierr = MPI_Send_init(link->rootbuf[rootmtype]+disp,n,unit,bas->iranks[i],link->tag,comm,&link->rootreqs[direction][rootmtype][j]);CHKERRQ(ierr);
      }
    } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Out-of-range PetscSFDirection = %d\n",(int)direction);
    link->rootreqsinited[direction][rootmtype] = PETSC_TRUE;
  }

  if (leafreqs && !link->leafreqsinited[direction][leafmtype]) {
    ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
    if (direction == PETSCSF_LEAF2ROOT_REDUCE) {
      for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
        ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
        ierr = MPI_Send_init(link->leafbuf[leafmtype]+disp,n,unit,sf->ranks[i],link->tag,comm,&link->leafreqs[direction][leafmtype][j]);CHKERRQ(ierr);
      }
    } else if (direction == PETSCSF_ROOT2LEAF_BCAST) {
      for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
        ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
        ierr = MPI_Recv_init(link->leafbuf[leafmtype]+disp,n,unit,sf->ranks[i],link->tag,comm,&link->leafreqs[direction][leafmtype][j]);CHKERRQ(ierr);
      }
    } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Out-of-range PetscSFDirection = %d\n",(int)direction);
    link->leafreqsinited[direction][leafmtype] = PETSC_TRUE;
  }

  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype];
  PetscFunctionReturn(0);
}

/* Common part shared by SFBasic and SFNeighbor, based on the fact that both deal with sparse graphs. */
PETSC_INTERN PetscErrorCode PetscSFPackGet_Basic_Common(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,PetscInt nrootreqs,PetscInt nleafreqs,PetscSFPack *mylink)
{
  PetscErrorCode    ierr;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,nreqs,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt    *rootoffset,*leafoffset;
  PetscSFPack       *p,link;
  PetscBool         match;

  PetscFunctionBegin;
  ierr = PetscSFPackSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Look for types in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
  ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFPackSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* Allocate root, leaf, self buffers, and MPI requests */
  link->rootbuflen = rootoffset[nrootranks]-rootoffset[ndrootranks];
  link->leafbuflen = leafoffset[nleafranks]-leafoffset[ndleafranks];
  link->selfbuflen = rootoffset[ndrootranks];
  link->nrootreqs  = nrootreqs;
  link->nleafreqs  = nleafreqs;
  nreqs            = (nrootreqs+nleafreqs)*4; /* Quadruple the requests since there are two communication directions and two memory types */
  ierr             = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      link->rootreqs[i][j] = link->reqs + nrootreqs*(2*i+j);
      link->leafreqs[i][j] = link->reqs + nrootreqs*4 + nleafreqs*(2*i+j);
    }
  }
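
  /* A sketch of the resulting layout of link->reqs, derived from the loops above (R stands for
     nrootreqs and L for nleafreqs; i is the direction index, j the memory-type index):

       [0,R)    rootreqs[0][0]      [4R,4R+L)      leafreqs[0][0]
       [R,2R)   rootreqs[0][1]      [4R+L,4R+2L)   leafreqs[0][1]
       [2R,3R)  rootreqs[1][0]      [4R+2L,4R+3L)  leafreqs[1][0]
       [3R,4R)  rootreqs[1][1]      [4R+3L,4R+4L)  leafreqs[1][1]
   */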

found:
  link->rootmtype = rootmtype;
  link->leafmtype = leafmtype;
#if defined(PETSC_HAVE_CUDA)
  ierr = PetscSFPackSetUp_Device(sf,link,unit);CHKERRQ(ierr);
  if (!use_gpu_aware_mpi) {
    /* If not using GPU aware MPI, we always need buffers on host. In case root/leafdata is on device, we copy root/leafdata to/from
       these buffers for MPI. We only need buffers for remote neighbors since self-to-self communication is not done via MPI.
     */
    if (!link->rootbuf[PETSC_MEMTYPE_HOST]) {ierr = PetscMallocWithMemType(PETSC_MEMTYPE_HOST,link->rootbuflen*link->unitbytes,(void**)&link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);}
    if (!link->leafbuf[PETSC_MEMTYPE_HOST]) {ierr = PetscMallocWithMemType(PETSC_MEMTYPE_HOST,link->leafbuflen*link->unitbytes,(void**)&link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);}
  }
#endif
  if (!link->rootbuf[rootmtype]) {ierr = PetscMallocWithMemType(rootmtype,link->rootbuflen*link->unitbytes,(void**)&link->rootbuf[rootmtype]);CHKERRQ(ierr);}
  if (!link->leafbuf[leafmtype]) {ierr = PetscMallocWithMemType(leafmtype,link->leafbuflen*link->unitbytes,(void**)&link->leafbuf[leafmtype]);CHKERRQ(ierr);}
  if (!link->selfbuf[rootmtype]) {ierr = PetscMallocWithMemType(rootmtype,link->selfbuflen*link->unitbytes,(void**)&link->selfbuf[rootmtype]);CHKERRQ(ierr);}
  if (rootmtype != leafmtype && !link->selfbuf[leafmtype]) {ierr = PetscMallocWithMemType(leafmtype,link->selfbuflen*link->unitbytes,(void**)&link->selfbuf[leafmtype]);CHKERRQ(ierr);}
  link->rootdata = rootdata;
  link->leafdata = leafdata;
  link->next     = bas->inuse;
  bas->inuse     = link;
  *mylink        = link;
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFPackGet_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,PetscSFDirection direction,PetscSFPack *mylink)
{
  PetscErrorCode    ierr;
  PetscInt          nrootranks,ndrootranks,nleafranks,ndleafranks;

  PetscFunctionBegin;
  ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,NULL,NULL);CHKERRQ(ierr);
  ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,NULL,NULL,NULL);CHKERRQ(ierr);
  ierr = PetscSFPackGet_Basic_Common(sf,unit,rootmtype,rootdata,leafmtype,leafdata,nrootranks-ndrootranks,nleafranks-ndleafranks,mylink);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*===================================================================================*/
/*              SF public interface implementations                                  */
/*===================================================================================*/
PETSC_INTERN PetscErrorCode PetscSFSetUp_Basic(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       *rlengths,*ilengths,i;
  PetscMPIInt    rank,niranks,*iranks,tag;
  MPI_Comm       comm;
  MPI_Group      group;
  MPI_Request    *rootreqs,*leafreqs;

  PetscFunctionBegin;
  ierr = MPI_Comm_group(PETSC_COMM_SELF,&group);CHKERRQ(ierr);
  ierr = PetscSFSetUpRanks(sf,group);CHKERRQ(ierr);
  ierr = MPI_Group_free(&group);CHKERRQ(ierr);
  ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)sf,&tag);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  /*
   * Inform roots about how many leaves reference them and from which ranks
   */
  ierr = PetscMalloc1(sf->nranks,&rlengths);CHKERRQ(ierr);
  /* Determine the number of incoming messages, their sending ranks, and their lengths */
  for (i=0; i<sf->nranks; i++) {
    rlengths[i] = sf->roffset[i+1] - sf->roffset[i]; /* Number of roots referenced by my leaves, for rank sf->ranks[i] */
  }
  ierr = PetscCommBuildTwoSided(comm,1,MPIU_INT,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,rlengths+sf->ndranks,&niranks,&iranks,(void**)&ilengths);CHKERRQ(ierr);
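
  /* A hedged illustration (made-up numbers, not code) of what the call above yields: if this
     rank sends lengths {4,7} to ranks {1,3}, then PetscCommBuildTwoSided() discovers the
     reverse direction, e.g. niranks = 2 with iranks = {0,2} and ilengths = {5,2}, meaning
     ranks 0 and 2 will send this rank 5 and 2 root references respectively. */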

  /* Sort iranks. See the use of VecScatterGetRemoteOrdered_Private() in MatGetBrowsOfAoCols_MPIAIJ() for why.
     We could sort ranks there at the price of allocating extra working arrays. Presumably, niranks is
     small and the sorting is cheap.
   */
  ierr = PetscSortMPIIntWithIntArray(niranks,iranks,ilengths);CHKERRQ(ierr);

  /* Partition into distinguished and non-distinguished incoming ranks */
  bas->ndiranks = sf->ndranks;
  bas->niranks = bas->ndiranks + niranks;
  ierr = PetscMalloc2(bas->niranks,&bas->iranks,bas->niranks+1,&bas->ioffset);CHKERRQ(ierr);
  bas->ioffset[0] = 0;
  for (i=0; i<bas->ndiranks; i++) {
    bas->iranks[i] = sf->ranks[i];
    bas->ioffset[i+1] = bas->ioffset[i] + rlengths[i];
  }
  if (bas->ndiranks > 1 || (bas->ndiranks == 1 && bas->iranks[0] != rank)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Broken setup for shared ranks");
  for ( ; i<bas->niranks; i++) {
    bas->iranks[i] = iranks[i-bas->ndiranks];
    bas->ioffset[i+1] = bas->ioffset[i] + ilengths[i-bas->ndiranks];
  }
  bas->itotal = bas->ioffset[i];
  ierr = PetscFree(rlengths);CHKERRQ(ierr);
  ierr = PetscFree(iranks);CHKERRQ(ierr);
  ierr = PetscFree(ilengths);CHKERRQ(ierr);

  /* Send leaf identities to roots */
  ierr = PetscMalloc1(bas->itotal,&bas->irootloc);CHKERRQ(ierr);
  ierr = PetscMalloc2(bas->niranks-bas->ndiranks,&rootreqs,sf->nranks-sf->ndranks,&leafreqs);CHKERRQ(ierr);
  for (i=bas->ndiranks; i<bas->niranks; i++) {
    ierr = MPI_Irecv(bas->irootloc+bas->ioffset[i],bas->ioffset[i+1]-bas->ioffset[i],MPIU_INT,bas->iranks[i],tag,comm,&rootreqs[i-bas->ndiranks]);CHKERRQ(ierr);
  }
  for (i=0; i<sf->nranks; i++) {
    PetscMPIInt npoints;
    ierr = PetscMPIIntCast(sf->roffset[i+1] - sf->roffset[i],&npoints);CHKERRQ(ierr);
    if (i < sf->ndranks) {
      if (sf->ranks[i] != rank) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot interpret distinguished leaf rank");
      if (bas->iranks[0] != rank) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot interpret distinguished root rank");
      if (npoints != bas->ioffset[1]-bas->ioffset[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Distinguished rank exchange has mismatched lengths");
      ierr = PetscArraycpy(bas->irootloc+bas->ioffset[0],sf->rremote+sf->roffset[i],npoints);CHKERRQ(ierr);
      continue;
    }
    ierr = MPI_Isend(sf->rremote+sf->roffset[i],npoints,MPIU_INT,sf->ranks[i],tag,comm,&leafreqs[i-sf->ndranks]);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(bas->niranks-bas->ndiranks,rootreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = MPI_Waitall(sf->nranks-sf->ndranks,leafreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = PetscFree2(rootreqs,leafreqs);CHKERRQ(ierr);

  sf->selfleafdups    = PETSC_TRUE; /* The conservative assumption is that there are data races */
  sf->remoteleafdups  = PETSC_TRUE;
  bas->selfrootdups   = PETSC_TRUE;
  bas->remoterootdups = PETSC_TRUE;

  /* Setup packing optimization for roots and leaves */
  ierr = PetscSFPackSetupOptimizations_Basic(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFSetFromOptions_Basic(PetscOptionItems *PetscOptionsObject,PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"PetscSF Basic options");CHKERRQ(ierr);
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFReset_Basic(PetscSF sf)
{
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  if (bas->inuse) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Outstanding operation has not been completed");
  ierr = PetscFree2(bas->iranks,bas->ioffset);CHKERRQ(ierr);
  ierr = PetscFree(bas->irootloc);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  if (bas->irootloc_d) {cudaError_t err = cudaFree(bas->irootloc_d);CHKERRCUDA(err);bas->irootloc_d=NULL;}
#endif
  ierr = PetscSFPackDestroyAvailable(&bas->avail);CHKERRQ(ierr);
  ierr = PetscSFPackDestroyOptimizations_Basic(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFDestroy_Basic(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscSFReset(sf);CHKERRQ(ierr);
  ierr = PetscFree(sf->data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFView_Basic(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode ierr;
  PetscBool      iascii;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {ierr = PetscViewerASCIIPrintf(viewer,"  sort=%s\n",sf->rankorder ? "rank-order" : "unordered");CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFBcastAndOpBegin_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
{
  PetscErrorCode    ierr;
  PetscSFPack       link;
  const PetscInt    *rootloc = NULL;
  MPI_Request       *rootreqs,*leafreqs;

  PetscFunctionBegin;
  ierr = PetscSFPackGet_Basic(sf,unit,rootmtype,rootdata,leafmtype,leafdata,PETSCSF_ROOT2LEAF_BCAST,&link);CHKERRQ(ierr);
  ierr = PetscSFGetRootIndicesWithMemType_Basic(sf,rootmtype,&rootloc);CHKERRQ(ierr);

  ierr = PetscSFPackGetReqs_Basic(sf,link,PETSCSF_ROOT2LEAF_BCAST,&rootreqs,&leafreqs);CHKERRQ(ierr);
  /* Post Irecv. Note distinguished ranks receive data via shared memory (i.e., not via MPI) */
  ierr = MPI_Startall_irecv(link->leafbuflen,unit,link->nleafreqs,leafreqs);CHKERRQ(ierr);

  /* Do Isend */
  ierr = PetscSFPackRootData(sf,link,rootloc,rootdata,PETSC_TRUE);CHKERRQ(ierr);
  ierr = MPI_Startall_isend(link->rootbuflen,unit,link->nrootreqs,rootreqs);CHKERRQ(ierr);

  /* Do self-to-self communication via memcpy only when rootdata and leafdata are in different memory */
  if (rootmtype != leafmtype) {ierr = PetscMemcpyWithMemType(leafmtype,rootmtype,link->selfbuf[leafmtype],link->selfbuf[rootmtype],link->selfbuflen*link->unitbytes);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFBcastAndOpEnd_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
{
  PetscErrorCode    ierr;
  PetscSFPack       link;
  const PetscInt    *leafloc = NULL;

  PetscFunctionBegin;
  ierr = PetscSFPackGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link);CHKERRQ(ierr);
  ierr = PetscSFPackWaitall(link,PETSCSF_ROOT2LEAF_BCAST);CHKERRQ(ierr);
  ierr = PetscSFGetLeafIndicesWithMemType_Basic(sf,leafmtype,&leafloc);CHKERRQ(ierr);
  ierr = PetscSFUnpackAndOpLeafData(sf,link,leafloc,leafdata,op,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFPackReclaim(sf,&link);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

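/* A hedged usage sketch (not from this file) of how a caller drives the Begin/End pair above,
   through the public wrappers of this era; sf, rootdata and leafdata are placeholders owned by
   the caller, and both arrays must stay valid between Begin and End:

     ierr = PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);  // post receives, pack and send roots
     // ... the caller may overlap unrelated computation here ...
     ierr = PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);    // wait, then unpack into leaves
 */
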
/* leaf -> root with reduction */
static PetscErrorCode PetscSFReduceBegin_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
{
  PetscErrorCode    ierr;
  PetscSFPack       link;
  const PetscInt    *leafloc = NULL;
  MPI_Request       *rootreqs = NULL,*leafreqs = NULL; /* NULL just to silence compiler warnings about uninitialized values */

  PetscFunctionBegin;
  ierr = PetscSFGetLeafIndicesWithMemType_Basic(sf,leafmtype,&leafloc);CHKERRQ(ierr);

  ierr = PetscSFPackGet_Basic(sf,unit,rootmtype,rootdata,leafmtype,leafdata,PETSCSF_LEAF2ROOT_REDUCE,&link);CHKERRQ(ierr);
  ierr = PetscSFPackGetReqs_Basic(sf,link,PETSCSF_LEAF2ROOT_REDUCE,&rootreqs,&leafreqs);CHKERRQ(ierr);
  /* Eagerly post root receives for non-distinguished ranks */
  ierr = MPI_Startall_irecv(link->rootbuflen,unit,link->nrootreqs,rootreqs);CHKERRQ(ierr);

  /* Pack and send leaf data */
  ierr = PetscSFPackLeafData(sf,link,leafloc,leafdata,PETSC_TRUE);CHKERRQ(ierr);
  ierr = MPI_Startall_isend(link->leafbuflen,unit,link->nleafreqs,leafreqs);CHKERRQ(ierr);

  if (rootmtype != leafmtype) {ierr = PetscMemcpyWithMemType(rootmtype,leafmtype,link->selfbuf[rootmtype],link->selfbuf[leafmtype],link->selfbuflen*link->unitbytes);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFReduceEnd_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
{
  PetscErrorCode    ierr;
  PetscSFPack       link;
  const PetscInt    *rootloc = NULL;

  PetscFunctionBegin;
  ierr = PetscSFGetRootIndicesWithMemType_Basic(sf,rootmtype,&rootloc);CHKERRQ(ierr);
  ierr = PetscSFPackGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link);CHKERRQ(ierr);
  ierr = PetscSFPackWaitall(link,PETSCSF_LEAF2ROOT_REDUCE);CHKERRQ(ierr);
  ierr = PetscSFUnpackAndOpRootData(sf,link,rootloc,rootdata,op,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFPackReclaim(sf,&link);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

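/* A hedged usage sketch (not from this file) of the reduce pair through the public wrappers;
   placeholder names again, with MPIU_SUM as an example reduction op:

     ierr = PetscSFReduceBegin(sf,MPIU_INT,leafdata,rootdata,MPIU_SUM);CHKERRQ(ierr);  // pack and send leaves
     ierr = PetscSFReduceEnd(sf,MPIU_INT,leafdata,rootdata,MPIU_SUM);CHKERRQ(ierr);    // wait, then op received data into roots
 */
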
PETSC_INTERN PetscErrorCode PetscSFFetchAndOpBegin_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,void *rootdata,PetscMemType leafmtype,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscSFReduceBegin(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFFetchAndOpEnd_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,void *rootdata,PetscMemType leafmtype,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode    ierr;
  PetscSFPack       link;
  const PetscInt    *rootloc = NULL,*leafloc = NULL;
  MPI_Request       *rootreqs,*leafreqs;

  PetscFunctionBegin;
  ierr = PetscSFGetRootIndicesWithMemType_Basic(sf,rootmtype,&rootloc);CHKERRQ(ierr);
  ierr = PetscSFGetLeafIndicesWithMemType_Basic(sf,leafmtype,&leafloc);CHKERRQ(ierr);
  ierr = PetscSFPackGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link);CHKERRQ(ierr);
  /* This implementation could be changed to unpack as receives arrive, at the cost of non-determinism */
  ierr = PetscSFPackWaitall(link,PETSCSF_LEAF2ROOT_REDUCE);CHKERRQ(ierr);
  ierr = PetscSFPackGetReqs_Basic(sf,link,PETSCSF_ROOT2LEAF_BCAST,&rootreqs,&leafreqs);CHKERRQ(ierr);

  /* Post leaf receives */
  ierr = MPI_Startall_irecv(link->leafbuflen,unit,link->nleafreqs,leafreqs);CHKERRQ(ierr);

  /* Process local fetch-and-op, post root sends */
  ierr = PetscSFFetchAndOpRootData(sf,link,rootloc,rootdata,op,PETSC_TRUE);CHKERRQ(ierr);
  ierr = MPI_Startall_isend(link->rootbuflen,unit,link->nrootreqs,rootreqs);CHKERRQ(ierr);
  if (rootmtype != leafmtype) {ierr = PetscMemcpyWithMemType(leafmtype,rootmtype,link->selfbuf[leafmtype],link->selfbuf[rootmtype],link->selfbuflen*link->unitbytes);CHKERRQ(ierr);}

  /* Unpack and insert fetched data into leaves */
  ierr = PetscSFPackWaitall(link,PETSCSF_ROOT2LEAF_BCAST);CHKERRQ(ierr);
  ierr = PetscSFUnpackAndOpLeafData(sf,link,leafloc,leafupdate,MPIU_REPLACE,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFPackReclaim(sf,&link);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

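/* A hedged usage sketch (not from this file) of fetch-and-op through the public wrappers;
   placeholder names, and our understanding is that after End, leafupdate holds the state of
   each leaf's root just before this rank's contribution was applied:

     ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,rootdata,leafdata,leafupdate,MPIU_SUM);CHKERRQ(ierr);
     ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,rootdata,leafdata,leafupdate,MPIU_SUM);CHKERRQ(ierr);
 */
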
PETSC_INTERN PetscErrorCode PetscSFGetLeafRanks_Basic(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (niranks)  *niranks  = bas->niranks;
  if (iranks)   *iranks   = bas->iranks;
  if (ioffset)  *ioffset  = bas->ioffset;
  if (irootloc) *irootloc = bas->irootloc;
  PetscFunctionReturn(0);
}

/* An optimized PetscSFCreateEmbeddedSF. We aggressively reuse the established communication on sf:
   one bcast on sf suffices, and no further communication is needed to build the embedded sf. Note
   that selected[] is assumed to be sorted before this routine is called.
 */
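/* A hedged usage sketch with placeholder values, assuming the public wrapper of this revision:

     PetscSF        esf;
     const PetscInt selected[] = {0,2,5}; // sorted root indices to keep
     ierr = PetscSFCreateEmbeddedSF(sf,3,selected,&esf);CHKERRQ(ierr);
     // ... use esf like any other PetscSF ...
     ierr = PetscSFDestroy(&esf);CHKERRQ(ierr);
 */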
PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedSF_Basic(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
{
  PetscSF           esf;
  PetscInt          esf_nranks,esf_ndranks,*esf_roffset,*esf_rmine,*esf_rremote,count;
  PetscInt          i,j,k,p,q,nroots,*rootdata,*leafdata,connected_leaves,*new_ilocal,nranks,ndranks,niranks,ndiranks,minleaf,maxleaf,maxlocal;
  PetscMPIInt       *esf_ranks;
  const PetscMPIInt *ranks,*iranks;
  const PetscInt    *roffset,*rmine,*rremote,*ioffset,*irootloc,*buffer;
  PetscBool         connected;
  PetscSFPack       link;
  PetscSFNode       *new_iremote;
  PetscSF_Basic     *bas;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),&esf);CHKERRQ(ierr);
  ierr = PetscSFSetType(esf,PETSCSFBASIC);CHKERRQ(ierr); /* This optimized routine can only create a basic sf */

  /* Find out which leaves are still connected to roots in the embedded sf */
  ierr = PetscSFGetGraph(sf,&nroots,NULL,NULL,NULL);CHKERRQ(ierr);
  ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
  /* We abuse the term leafdata here: its size is usually the number of leaf data items, but here it is the number of leaves (always >= the number of leaf data items) */
  maxlocal = (minleaf > maxleaf)? 0 : maxleaf-minleaf+1; /* maxleaf=-1 and minleaf=0 when nleaves=0 */
  ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafdata);CHKERRQ(ierr);
  /* Tag selected roots */
  for (i=0; i<nselected; ++i) rootdata[selected[i]] = 1;

  /* Bcast from roots to leaves to tag connected leaves. We reuse the established bcast communication in
     sf but do not do unpacking (from leaf buffer to leafdata). The raw data in leaf buffer is what we are
     interested in since it tells which leaves are connected to which ranks.
   */
  ierr = PetscSFBcastAndOpBegin_Basic(sf,MPIU_INT,PETSC_MEMTYPE_HOST,rootdata,PETSC_MEMTYPE_HOST,leafdata-minleaf,MPIU_REPLACE);CHKERRQ(ierr); /* Need to give leafdata but we won't use it */
  ierr = PetscSFPackGetInUse(sf,MPIU_INT,rootdata,leafdata-minleaf,PETSC_OWN_POINTER,&link);CHKERRQ(ierr);
  ierr = PetscSFPackWaitall(link,PETSCSF_ROOT2LEAF_BCAST);CHKERRQ(ierr);
  ierr = PetscSFGetLeafInfo_Basic(sf,&nranks,&ndranks,&ranks,&roffset,&rmine,&rremote);CHKERRQ(ierr); /* Get send info */
  esf_nranks = esf_ndranks = connected_leaves = 0;
  for (i=0; i<nranks; i++) {
    connected = PETSC_FALSE; /* Is the current process still connected to this remote root rank? */
    buffer    = i < ndranks? (PetscInt*)link->selfbuf[PETSC_MEMTYPE_HOST] : (PetscInt*)link->leafbuf[PETSC_MEMTYPE_HOST] + (roffset[i] - roffset[ndranks]);
    count     = roffset[i+1] - roffset[i];
    for (j=0; j<count; j++) {if (buffer[j]) {connected_leaves++; connected = PETSC_TRUE;}}
    if (connected) {esf_nranks++; if (i < ndranks) esf_ndranks++;}
  }

  /* Set graph of esf and also set up its outgoing communication (i.e., send info), which is usually done by PetscSFSetUpRanks */
  ierr = PetscMalloc1(connected_leaves,&new_ilocal);CHKERRQ(ierr);
  ierr = PetscMalloc1(connected_leaves,&new_iremote);CHKERRQ(ierr);
  ierr = PetscMalloc4(esf_nranks,&esf_ranks,esf_nranks+1,&esf_roffset,connected_leaves,&esf_rmine,connected_leaves,&esf_rremote);CHKERRQ(ierr);
  p    = 0; /* Counter for connected root ranks */
  q    = 0; /* Counter for connected leaves */
  esf_roffset[0] = 0;
  for (i=0; i<nranks; i++) { /* Scan leaf data again to fill esf arrays */
    buffer    = i < ndranks? (PetscInt*)link->selfbuf[PETSC_MEMTYPE_HOST] : (PetscInt*)link->leafbuf[PETSC_MEMTYPE_HOST] + (roffset[i] - roffset[ndranks]);
    connected = PETSC_FALSE;
    for (j=roffset[i],k=0; j<roffset[i+1]; j++,k++) {
      if (buffer[k]) {
        esf_rmine[q]         = new_ilocal[q] = rmine[j];
        esf_rremote[q]       = rremote[j];
        new_iremote[q].index = rremote[j];
        new_iremote[q].rank  = ranks[i];
        connected            = PETSC_TRUE;
        q++;
      }
    }
    if (connected) {
      esf_ranks[p]     = ranks[i];
      esf_roffset[p+1] = q;
      p++;
    }
  }

  ierr = PetscSFPackReclaim(sf,&link);CHKERRQ(ierr);

  /* SetGraph internally resets the SF, so we only set its fields after the call */
  ierr         = PetscSFSetGraph(esf,nroots,connected_leaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  esf->nranks  = esf_nranks;
  esf->ndranks = esf_ndranks;
  esf->ranks   = esf_ranks;
  esf->roffset = esf_roffset;
  esf->rmine   = esf_rmine;
  esf->rremote = esf_rremote;

  /* Set up the incoming communication (i.e., recv info) stored in esf->data, which is usually done by PetscSFSetUp_Basic */
  bas  = (PetscSF_Basic*)esf->data;
  ierr = PetscSFGetRootInfo_Basic(sf,&niranks,&ndiranks,&iranks,&ioffset,&irootloc);CHKERRQ(ierr); /* Get recv info */
  /* Embedded sf always has simpler communication than the original one. We might allocate longer arrays than needed here. But we
     expect these arrays are usually short, so we do not care. The benefit is we can fill these arrays by just parsing irootloc once.
   */
  ierr = PetscMalloc2(niranks,&bas->iranks,niranks+1,&bas->ioffset);CHKERRQ(ierr);
  ierr = PetscMalloc1(ioffset[niranks],&bas->irootloc);CHKERRQ(ierr);
  bas->niranks = bas->ndiranks = bas->ioffset[0] = 0;
  p = 0; /* Counter for connected leaf ranks */
  q = 0; /* Counter for connected roots */
  for (i=0; i<niranks; i++) {
    connected = PETSC_FALSE; /* Is the current process still connected to this remote leaf rank? */
    for (j=ioffset[i]; j<ioffset[i+1]; j++) {
      PetscInt loc;
      ierr = PetscFindInt(irootloc[j],nselected,selected,&loc);CHKERRQ(ierr);
      if (loc >= 0) { /* Found in selected[], so this root is connected */
        bas->irootloc[q++] = irootloc[j];
        connected = PETSC_TRUE;
      }
    }
    if (connected) {
      bas->niranks++;
      if (i<ndiranks) bas->ndiranks++; /* Note that order of ranks (including distinguished ranks) is kept */
      bas->iranks[p]    = iranks[i];
      bas->ioffset[p+1] = q;
      p++;
    }
  }
  bas->itotal = q;

  /* Setup packing optimizations */
  ierr = PetscSFPackSetupOptimizations_Basic(esf);CHKERRQ(ierr);
  esf->setupcalled = PETSC_TRUE; /* We have done setup ourselves! */

  ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
  *newsf = esf;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedLeafSF_Basic(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
{
  PetscSF           esf;
  PetscInt          i,j,k,p,q,nroots,*rootdata,*leafdata,*new_ilocal,niranks,ndiranks,minleaf,maxleaf,maxlocal;
  const PetscInt    *ilocal,*ioffset,*irootloc,*buffer;
  const PetscMPIInt *iranks;
  PetscSFPack       link;
  PetscSFNode       *new_iremote;
  const PetscSFNode *iremote;
  PetscSF_Basic     *bas;
  MPI_Group         group;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),&esf);CHKERRQ(ierr);
  ierr = PetscSFSetType(esf,PETSCSFBASIC);CHKERRQ(ierr); /* This optimized routine can only create a basic sf */

  /* Set the graph of esf, which is easy for CreateEmbeddedLeafSF */
  ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
  ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
  ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
  for (i=0; i<nselected; i++) {
    const PetscInt l     = selected[i];
    new_ilocal[i]        = ilocal ? ilocal[l] : l;
    new_iremote[i].rank  = iremote[l].rank;
    new_iremote[i].index = iremote[l].index;
  }

  /* Tag selected leaves before PetscSFSetGraph, since new_ilocal may become NULL after we pass it with PETSC_OWN_POINTER below */
  maxlocal = (minleaf > maxleaf)? 0 : maxleaf-minleaf+1; /* maxleaf=-1 and minleaf=0 when nleaves=0 */
  ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafdata);CHKERRQ(ierr);
  for (i=0; i<nselected; i++) leafdata[new_ilocal[i]-minleaf] = 1; /* -minleaf to adjust indices according to minleaf */

  ierr = PetscSFSetGraph(esf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);

  /* Set up the outgoing communication (i.e., send info). We cannot reuse rmine etc. of sf since there is no way to
     map rmine[i] (ilocal of leaves) back to selected[j] (leaf indices).
   */
  ierr = MPI_Comm_group(PETSC_COMM_SELF,&group);CHKERRQ(ierr);
  ierr = PetscSFSetUpRanks(esf,group);CHKERRQ(ierr);
  ierr = MPI_Group_free(&group);CHKERRQ(ierr);

  /* Set up the incoming communication (i.e., recv info) */
  ierr = PetscSFGetRootInfo_Basic(sf,&niranks,&ndiranks,&iranks,&ioffset,&irootloc);CHKERRQ(ierr);
  bas  = (PetscSF_Basic*)esf->data;
  ierr = PetscMalloc2(niranks,&bas->iranks,niranks+1,&bas->ioffset);CHKERRQ(ierr);
  ierr = PetscMalloc1(ioffset[niranks],&bas->irootloc);CHKERRQ(ierr);

  /* Pass info about selected leaves to root buffer */
  ierr = PetscSFReduceBegin_Basic(sf,MPIU_INT,PETSC_MEMTYPE_HOST,leafdata-minleaf,PETSC_MEMTYPE_HOST,rootdata,MPIU_REPLACE);CHKERRQ(ierr); /* -minleaf to re-adjust start address of leafdata */
  ierr = PetscSFPackGetInUse(sf,MPIU_INT,rootdata,leafdata-minleaf,PETSC_OWN_POINTER,&link);CHKERRQ(ierr);
  ierr = PetscSFPackWaitall(link,PETSCSF_LEAF2ROOT_REDUCE);CHKERRQ(ierr);

  bas->niranks = bas->ndiranks = bas->ioffset[0] = 0;
  p = 0; /* Counter for connected leaf ranks */
  q = 0; /* Counter for connected roots */
  for (i=0; i<niranks; i++) {
    PetscBool connected = PETSC_FALSE; /* Is the current process still connected to this remote leaf rank? */
    buffer = i < ndiranks? (PetscInt*)link->selfbuf[PETSC_MEMTYPE_HOST] : (PetscInt*)link->rootbuf[PETSC_MEMTYPE_HOST] + (ioffset[i] - ioffset[ndiranks]);
    for (j=ioffset[i],k=0; j<ioffset[i+1]; j++,k++) {
      if (buffer[k]) {bas->irootloc[q++] = irootloc[j]; connected = PETSC_TRUE;}
    }
    if (connected) {
      bas->niranks++;
      if (i<ndiranks) bas->ndiranks++;
      bas->iranks[p]    = iranks[i];
      bas->ioffset[p+1] = q;
      p++;
    }
  }
  bas->itotal = q;
  ierr = PetscSFPackReclaim(sf,&link);CHKERRQ(ierr);

  /* Setup packing optimizations */
  ierr = PetscSFPackSetupOptimizations_Basic(esf);CHKERRQ(ierr);
  esf->setupcalled = PETSC_TRUE; /* We have done setup ourselves! */

  ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
  *newsf = esf;
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode PetscSFCreate_Basic(PetscSF sf)
{
  PetscSF_Basic  *dat;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  sf->ops->SetUp                = PetscSFSetUp_Basic;
  sf->ops->SetFromOptions       = PetscSFSetFromOptions_Basic;
  sf->ops->Reset                = PetscSFReset_Basic;
  sf->ops->Destroy              = PetscSFDestroy_Basic;
  sf->ops->View                 = PetscSFView_Basic;
  sf->ops->BcastAndOpBegin      = PetscSFBcastAndOpBegin_Basic;
  sf->ops->BcastAndOpEnd        = PetscSFBcastAndOpEnd_Basic;
  sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
  sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
  sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
  sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
  sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
  sf->ops->CreateEmbeddedSF     = PetscSFCreateEmbeddedSF_Basic;
  sf->ops->CreateEmbeddedLeafSF = PetscSFCreateEmbeddedLeafSF_Basic;

  ierr = PetscNewLog(sf,&dat);CHKERRQ(ierr);
  sf->data = (void*)dat;
  PetscFunctionReturn(0);
}