
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

/*===================================================================================*/
/*              Internal routines for PetscSFPack                              */
/*===================================================================================*/

/* Return root and leaf MPI requests for communication in the given direction. Since we use
   persistent requests, initialize them first if they have not been initialized yet.
*/
static PetscErrorCode PetscSFPackGetReqs_Basic(PetscSF sf,PetscSFPack link,PetscSFDirection direction,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt *rootoffset,*leafoffset;
  PetscMPIInt    n;
  MPI_Comm       comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype   unit = link->unit;
  PetscMemType   rootmtype,leafmtype;

  PetscFunctionBegin;
  if (use_gpu_aware_mpi) {
    rootmtype = link->rootmtype;
    leafmtype = link->leafmtype;
  } else {
    rootmtype = PETSC_MEMTYPE_HOST;
    leafmtype = PETSC_MEMTYPE_HOST;
  }

  if (rootreqs && !link->rootreqsinited[direction][rootmtype]) {
    ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
    if (direction == PETSCSF_LEAF2ROOT_REDUCE) {
      for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
        MPI_Aint disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
        ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
        ierr = MPI_Recv_init(link->rootbuf[rootmtype]+disp,n,unit,bas->iranks[i],link->tag,comm,&link->rootreqs[direction][rootmtype][j]);CHKERRQ(ierr);
      }
    } else if (direction == PETSCSF_ROOT2LEAF_BCAST) {
      for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
        MPI_Aint disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
        ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
        ierr = MPI_Send_init(link->rootbuf[rootmtype]+disp,n,unit,bas->iranks[i],link->tag,comm,&link->rootreqs[direction][rootmtype][j]);CHKERRQ(ierr);
      }
    } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Out-of-range PetscSFDirection = %d\n",(int)direction);
    link->rootreqsinited[direction][rootmtype] = PETSC_TRUE;
  }

  if (leafreqs && !link->leafreqsinited[direction][leafmtype]) {
    ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
    if (direction == PETSCSF_LEAF2ROOT_REDUCE) {
      for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
        MPI_Aint disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
        ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
        ierr = MPI_Send_init(link->leafbuf[leafmtype]+disp,n,unit,sf->ranks[i],link->tag,comm,&link->leafreqs[direction][leafmtype][j]);CHKERRQ(ierr);
      }
    } else if (direction == PETSCSF_ROOT2LEAF_BCAST) {
      for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
        MPI_Aint disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
        ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
        ierr = MPI_Recv_init(link->leafbuf[leafmtype]+disp,n,unit,sf->ranks[i],link->tag,comm,&link->leafreqs[direction][leafmtype][j]);CHKERRQ(ierr);
      }
    } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Out-of-range PetscSFDirection = %d\n",(int)direction);
    link->leafreqsinited[direction][leafmtype] = PETSC_TRUE;
  }

  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype];
  PetscFunctionReturn(0);
}
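
/* A minimal sketch (illustration only, not executed anywhere) of how the persistent requests returned by
   PetscSFPackGetReqs_Basic() are driven during a root-to-leaf bcast:

     MPI_Request *rootreqs,*leafreqs;
     ierr = PetscSFPackGetReqs_Basic(sf,link,PETSCSF_ROOT2LEAF_BCAST,&rootreqs,&leafreqs);CHKERRQ(ierr);
     ierr = MPI_Startall(link->nleafreqs,leafreqs);CHKERRQ(ierr);   <-- start the Recv_init'ed requests
     (pack rootdata into link->rootbuf)
     ierr = MPI_Startall(link->nrootreqs,rootreqs);CHKERRQ(ierr);   <-- start the Send_init'ed requests
     (later, PetscSFPackWaitall() waits on both sets)

   PetscSFBcastAndOpBegin_Basic() below follows this pattern, via the MPI_Startall_irecv/isend wrappers. */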

/* Common part shared by SFBasic and SFNeighbor, which both communicate on sparse graphs */
PETSC_INTERN PetscErrorCode PetscSFPackGet_Basic_Common(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,PetscInt nrootreqs,PetscInt nleafreqs,PetscSFPack *mylink)
{
  PetscErrorCode    ierr;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,nreqs,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt    *rootoffset,*leafoffset;
  PetscSFPack       *p,link;
  PetscBool         match;

  PetscFunctionBegin;
  ierr = PetscSFPackSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Look for types in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
  ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFPackSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  /* Allocate root, leaf, self buffers, and MPI requests */
  link->rootbuflen = rootoffset[nrootranks]-rootoffset[ndrootranks];
  link->leafbuflen = leafoffset[nleafranks]-leafoffset[ndleafranks];
  link->selfbuflen = rootoffset[ndrootranks];
  link->nrootreqs  = nrootreqs;
  link->nleafreqs  = nleafreqs;
  nreqs            = (nrootreqs+nleafreqs)*4; /* Quadruple the requests since there are two communication directions and two memory types */
  ierr             = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      link->rootreqs[i][j] = link->reqs + nrootreqs*(2*i+j);
      link->leafreqs[i][j] = link->reqs + nrootreqs*4 + nleafreqs*(2*i+j);
    }
  }
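
  /* Resulting layout of link->reqs, with i = direction (0 or 1) and j = memtype (0 or 1):

       rootreqs[i][j] = reqs + nrootreqs*(2*i+j)                i.e. four root slices first,
       leafreqs[i][j] = reqs + 4*nrootreqs + nleafreqs*(2*i+j)  i.e. four leaf slices after them

     so reqs = [root d0m0 | d0m1 | d1m0 | d1m1 | leaf d0m0 | d0m1 | d1m0 | d1m1], which is why
     nreqs above is (nrootreqs+nleafreqs)*4. */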

found:
  link->rootmtype = rootmtype;
  link->leafmtype = leafmtype;
#if defined(PETSC_HAVE_CUDA)
  ierr = PetscSFPackSetUp_Device(sf,link,unit);CHKERRQ(ierr);
  if (!use_gpu_aware_mpi) {
    /* When not using GPU-aware MPI, we always need buffers on the host. If root/leafdata is on the device, we copy it to/from
       these host buffers for MPI. We only need buffers for remote neighbors, since self-to-self communication is not done via MPI.
     */
    if (!link->rootbuf[PETSC_MEMTYPE_HOST]) {
      if (rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_pinned_buf) {
        ierr = PetscMallocPinnedMemory(link->rootbuflen*link->unitbytes,(void**)&link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      } else {
        ierr = PetscMallocWithMemType(PETSC_MEMTYPE_HOST,link->rootbuflen*link->unitbytes,(void**)&link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      }
    }
    if (!link->leafbuf[PETSC_MEMTYPE_HOST]) {
      if (leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_pinned_buf) {
        ierr = PetscMallocPinnedMemory(link->leafbuflen*link->unitbytes,(void**)&link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      } else {
        ierr = PetscMallocWithMemType(PETSC_MEMTYPE_HOST,link->leafbuflen*link->unitbytes,(void**)&link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      }
    }
  }
#endif
  if (!link->rootbuf[rootmtype]) {ierr = PetscMallocWithMemType(rootmtype,link->rootbuflen*link->unitbytes,(void**)&link->rootbuf[rootmtype]);CHKERRQ(ierr);}
  if (!link->leafbuf[leafmtype]) {ierr = PetscMallocWithMemType(leafmtype,link->leafbuflen*link->unitbytes,(void**)&link->leafbuf[leafmtype]);CHKERRQ(ierr);}
  if (!link->selfbuf[rootmtype]) {ierr = PetscMallocWithMemType(rootmtype,link->selfbuflen*link->unitbytes,(void**)&link->selfbuf[rootmtype]);CHKERRQ(ierr);}
  if (rootmtype != leafmtype && !link->selfbuf[leafmtype]) {ierr = PetscMallocWithMemType(leafmtype,link->selfbuflen*link->unitbytes,(void**)&link->selfbuf[leafmtype]);CHKERRQ(ierr);}
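  /* Note that when rootmtype != leafmtype there are two self buffers; the local (self-to-self) part of the
     communication is then a plain memcpy between them, see the PetscMemcpyWithMemType() calls in the Begin
     routines below. */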
  link->rootdata = rootdata;
  link->leafdata = leafdata;
  link->next     = bas->inuse;
  bas->inuse     = link;
  *mylink        = link;
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFPackGet_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,PetscSFDirection direction,PetscSFPack *mylink)
{
  PetscErrorCode    ierr;
  PetscInt          nrootranks,ndrootranks,nleafranks,ndleafranks;

  PetscFunctionBegin;
  ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,NULL,NULL);CHKERRQ(ierr);
  ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,NULL,NULL,NULL);CHKERRQ(ierr);
  ierr = PetscSFPackGet_Basic_Common(sf,unit,rootmtype,rootdata,leafmtype,leafdata,nrootranks-ndrootranks,nleafranks-ndleafranks,mylink);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*===================================================================================*/
/*              SF public interface implementations                                  */
/*===================================================================================*/
PETSC_INTERN PetscErrorCode PetscSFSetUp_Basic(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       *rlengths,*ilengths,i;
  PetscMPIInt    rank,niranks,*iranks,tag;
  MPI_Comm       comm;
  MPI_Group      group;
  MPI_Request    *rootreqs,*leafreqs;

  PetscFunctionBegin;
  ierr = MPI_Comm_group(PETSC_COMM_SELF,&group);CHKERRQ(ierr);
  ierr = PetscSFSetUpRanks(sf,group);CHKERRQ(ierr);
  ierr = MPI_Group_free(&group);CHKERRQ(ierr);
  ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)sf,&tag);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  /*
   * Inform roots about how many leaves and from which ranks
   */
  ierr = PetscMalloc1(sf->nranks,&rlengths);CHKERRQ(ierr);
  /* Determine the number of incoming ranks, which ranks they are, and the length of data from each */
  for (i=0; i<sf->nranks; i++) {
    rlengths[i] = sf->roffset[i+1] - sf->roffset[i]; /* Number of roots referenced by my leaves; for rank sf->ranks[i] */
  }
  ierr = PetscCommBuildTwoSided(comm,1,MPIU_INT,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,rlengths+sf->ndranks,&niranks,&iranks,(void**)&ilengths);CHKERRQ(ierr);

  /* Sort iranks. See the use of VecScatterGetRemoteOrdered_Private() in MatGetBrowsOfAoCols_MPIAIJ() for why.
     We could sort the ranks there instead, at the price of allocating extra working arrays. Presumably, niranks
     is small and the sorting is cheap.
   */
  ierr = PetscSortMPIIntWithIntArray(niranks,iranks,ilengths);CHKERRQ(ierr);

  /* Partition into distinguished and non-distinguished incoming ranks */
  bas->ndiranks = sf->ndranks;
  bas->niranks = bas->ndiranks + niranks;
  ierr = PetscMalloc2(bas->niranks,&bas->iranks,bas->niranks+1,&bas->ioffset);CHKERRQ(ierr);
  bas->ioffset[0] = 0;
  for (i=0; i<bas->ndiranks; i++) {
    bas->iranks[i] = sf->ranks[i];
    bas->ioffset[i+1] = bas->ioffset[i] + rlengths[i];
  }
  if (bas->ndiranks > 1 || (bas->ndiranks == 1 && bas->iranks[0] != rank)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Broken setup for shared ranks");
  for ( ; i<bas->niranks; i++) {
    bas->iranks[i] = iranks[i-bas->ndiranks];
    bas->ioffset[i+1] = bas->ioffset[i] + ilengths[i-bas->ndiranks];
  }
  bas->itotal = bas->ioffset[i];
  ierr = PetscFree(rlengths);CHKERRQ(ierr);
  ierr = PetscFree(iranks);CHKERRQ(ierr);
  ierr = PetscFree(ilengths);CHKERRQ(ierr);

  /* Send leaf identities to roots */
  ierr = PetscMalloc1(bas->itotal,&bas->irootloc);CHKERRQ(ierr);
  ierr = PetscMalloc2(bas->niranks-bas->ndiranks,&rootreqs,sf->nranks-sf->ndranks,&leafreqs);CHKERRQ(ierr);
  for (i=bas->ndiranks; i<bas->niranks; i++) {
    ierr = MPI_Irecv(bas->irootloc+bas->ioffset[i],bas->ioffset[i+1]-bas->ioffset[i],MPIU_INT,bas->iranks[i],tag,comm,&rootreqs[i-bas->ndiranks]);CHKERRQ(ierr);
  }
  for (i=0; i<sf->nranks; i++) {
    PetscMPIInt npoints;
    ierr = PetscMPIIntCast(sf->roffset[i+1] - sf->roffset[i],&npoints);CHKERRQ(ierr);
    if (i < sf->ndranks) {
      if (sf->ranks[i] != rank) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot interpret distinguished leaf rank");
      if (bas->iranks[0] != rank) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot interpret distinguished root rank");
      if (npoints != bas->ioffset[1]-bas->ioffset[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Distinguished rank exchange has mismatched lengths");
      ierr = PetscArraycpy(bas->irootloc+bas->ioffset[0],sf->rremote+sf->roffset[i],npoints);CHKERRQ(ierr);
      continue;
    }
    ierr = MPI_Isend(sf->rremote+sf->roffset[i],npoints,MPIU_INT,sf->ranks[i],tag,comm,&leafreqs[i-sf->ndranks]);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(bas->niranks-bas->ndiranks,rootreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = MPI_Waitall(sf->nranks-sf->ndranks,leafreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = PetscFree2(rootreqs,leafreqs);CHKERRQ(ierr);

  sf->selfleafdups    = PETSC_TRUE; /* The conservative assumption is that there are data races */
  sf->remoteleafdups  = PETSC_TRUE;
  bas->selfrootdups   = PETSC_TRUE;
  bas->remoterootdups = PETSC_TRUE;

  /* Setup packing optimization for roots and leaves */
  ierr = PetscSFPackSetupOptimizations_Basic(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFSetFromOptions_Basic(PetscOptionItems *PetscOptionsObject,PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"PetscSF Basic options");CHKERRQ(ierr);
  ierr = PetscOptionsBool("-sf_use_pinned_buffer","Use pinned (nonpageable) memory for send/recv buffers on host","PetscSFSetFromOptions",sf->use_pinned_buf,&sf->use_pinned_buf,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFReset_Basic(PetscSF sf)
{
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  if (bas->inuse) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Outstanding operation has not been completed");
  ierr = PetscFree2(bas->iranks,bas->ioffset);CHKERRQ(ierr);
  ierr = PetscFree(bas->irootloc);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  if (bas->irootloc_d) {cudaError_t err = cudaFree(bas->irootloc_d);CHKERRCUDA(err);bas->irootloc_d=NULL;}
#endif
  ierr = PetscSFPackDestroyAvailable(sf,&bas->avail);CHKERRQ(ierr);
  ierr = PetscSFPackDestroyOptimizations_Basic(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFDestroy_Basic(PetscSF sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscSFReset_Basic(sf);CHKERRQ(ierr);
  ierr = PetscFree(sf->data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFView_Basic(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode ierr;
  PetscBool      iascii;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii) {ierr = PetscViewerASCIIPrintf(viewer,"  sort=%s\n",sf->rankorder ? "rank-order" : "unordered");CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFBcastAndOpBegin_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
{
  PetscErrorCode    ierr;
  PetscSFPack       link;
  const PetscInt    *rootloc = NULL;
  MPI_Request       *rootreqs = NULL,*leafreqs = NULL;

  PetscFunctionBegin;
  ierr = PetscSFPackGet_Basic(sf,unit,rootmtype,rootdata,leafmtype,leafdata,PETSCSF_ROOT2LEAF_BCAST,&link);CHKERRQ(ierr);
  ierr = PetscSFGetRootIndicesWithMemType_Basic(sf,rootmtype,&rootloc);CHKERRQ(ierr);

  ierr = PetscSFPackGetReqs_Basic(sf,link,PETSCSF_ROOT2LEAF_BCAST,&rootreqs,&leafreqs);CHKERRQ(ierr);
  /* Post Irecv. Note distinguished ranks receive data via shared memory (i.e., not via MPI) */
  ierr = MPI_Startall_irecv(link->leafbuflen,unit,link->nleafreqs,leafreqs);CHKERRQ(ierr);

  /* Do Isend */
  ierr = PetscSFPackRootData(sf,link,rootloc,rootdata,PETSC_TRUE);CHKERRQ(ierr);
  ierr = MPI_Startall_isend(link->rootbuflen,unit,link->nrootreqs,rootreqs);CHKERRQ(ierr);

  /* Do self-to-self communication via memcpy only when rootdata and leafdata are in different types of memory */
  if (rootmtype != leafmtype) {ierr = PetscMemcpyWithMemType(leafmtype,rootmtype,link->selfbuf[leafmtype],link->selfbuf[rootmtype],link->selfbuflen*link->unitbytes);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFBcastAndOpEnd_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
{
  PetscErrorCode    ierr;
  PetscSFPack       link;
  const PetscInt    *leafloc = NULL;

  PetscFunctionBegin;
  ierr = PetscSFPackGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link);CHKERRQ(ierr);
  ierr = PetscSFPackWaitall(link,PETSCSF_ROOT2LEAF_BCAST);CHKERRQ(ierr);
  ierr = PetscSFGetLeafIndicesWithMemType_Basic(sf,leafmtype,&leafloc);CHKERRQ(ierr);
  ierr = PetscSFUnpackAndOpLeafData(sf,link,leafloc,leafdata,op,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFPackReclaim(sf,&link);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* leaf -> root with reduction */
static PetscErrorCode PetscSFReduceBegin_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
{
  PetscErrorCode    ierr;
  PetscSFPack       link;
  const PetscInt    *leafloc = NULL;
  MPI_Request       *rootreqs = NULL,*leafreqs = NULL; /* NULL-initialized to silence compiler warnings about uninitialized values */

  PetscFunctionBegin;
  ierr = PetscSFGetLeafIndicesWithMemType_Basic(sf,leafmtype,&leafloc);CHKERRQ(ierr);

  ierr = PetscSFPackGet_Basic(sf,unit,rootmtype,rootdata,leafmtype,leafdata,PETSCSF_LEAF2ROOT_REDUCE,&link);CHKERRQ(ierr);
  ierr = PetscSFPackGetReqs_Basic(sf,link,PETSCSF_LEAF2ROOT_REDUCE,&rootreqs,&leafreqs);CHKERRQ(ierr);
  /* Eagerly post root receives for non-distinguished ranks */
  ierr = MPI_Startall_irecv(link->rootbuflen,unit,link->nrootreqs,rootreqs);CHKERRQ(ierr);

  /* Pack and send leaf data */
  ierr = PetscSFPackLeafData(sf,link,leafloc,leafdata,PETSC_TRUE);CHKERRQ(ierr);
  ierr = MPI_Startall_isend(link->leafbuflen,unit,link->nleafreqs,leafreqs);CHKERRQ(ierr);

  if (rootmtype != leafmtype) {ierr = PetscMemcpyWithMemType(rootmtype,leafmtype,link->selfbuf[rootmtype],link->selfbuf[leafmtype],link->selfbuflen*link->unitbytes);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFReduceEnd_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
{
  PetscErrorCode    ierr;
  PetscSFPack       link;
  const PetscInt    *rootloc = NULL;

  PetscFunctionBegin;
  ierr = PetscSFGetRootIndicesWithMemType_Basic(sf,rootmtype,&rootloc);CHKERRQ(ierr);
  ierr = PetscSFPackGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link);CHKERRQ(ierr);
  ierr = PetscSFPackWaitall(link,PETSCSF_LEAF2ROOT_REDUCE);CHKERRQ(ierr);
  ierr = PetscSFUnpackAndOpRootData(sf,link,rootloc,rootdata,op,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFPackReclaim(sf,&link);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

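/* Fetch-and-op is split over the two phases: Begin does a leaf-to-root reduce; End waits for it to finish,
   then sends the pre-op root values back root-to-leaf and unpacks them into leafupdate. */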
PETSC_INTERN PetscErrorCode PetscSFFetchAndOpBegin_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,void *rootdata,PetscMemType leafmtype,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscSFReduceBegin(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFFetchAndOpEnd_Basic(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,void *rootdata,PetscMemType leafmtype,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode    ierr;
  PetscSFPack       link;
  const PetscInt    *rootloc = NULL,*leafloc = NULL;
  MPI_Request       *rootreqs = NULL,*leafreqs = NULL;

  PetscFunctionBegin;
  ierr = PetscSFGetRootIndicesWithMemType_Basic(sf,rootmtype,&rootloc);CHKERRQ(ierr);
  ierr = PetscSFGetLeafIndicesWithMemType_Basic(sf,leafmtype,&leafloc);CHKERRQ(ierr);
  ierr = PetscSFPackGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link);CHKERRQ(ierr);
  /* This implementation could be changed to unpack as receives arrive, at the cost of non-determinism */
  ierr = PetscSFPackWaitall(link,PETSCSF_LEAF2ROOT_REDUCE);CHKERRQ(ierr);
  ierr = PetscSFPackGetReqs_Basic(sf,link,PETSCSF_ROOT2LEAF_BCAST,&rootreqs,&leafreqs);CHKERRQ(ierr);

  /* Post leaf receives */
  ierr = MPI_Startall_irecv(link->leafbuflen,unit,link->nleafreqs,leafreqs);CHKERRQ(ierr);

  /* Process local fetch-and-op, post root sends */
  ierr = PetscSFFetchAndOpRootData(sf,link,rootloc,rootdata,op,PETSC_TRUE);CHKERRQ(ierr);
  ierr = MPI_Startall_isend(link->rootbuflen,unit,link->nrootreqs,rootreqs);CHKERRQ(ierr);
  if (rootmtype != leafmtype) {ierr = PetscMemcpyWithMemType(leafmtype,rootmtype,link->selfbuf[leafmtype],link->selfbuf[rootmtype],link->selfbuflen*link->unitbytes);CHKERRQ(ierr);}

  /* Unpack and insert fetched data into leaves */
  ierr = PetscSFPackWaitall(link,PETSCSF_ROOT2LEAF_BCAST);CHKERRQ(ierr);
  ierr = PetscSFUnpackAndOpLeafData(sf,link,leafloc,leafupdate,MPIU_REPLACE,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFPackReclaim(sf,&link);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFGetLeafRanks_Basic(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (niranks)  *niranks  = bas->niranks;
  if (iranks)   *iranks   = bas->iranks;
  if (ioffset)  *ioffset  = bas->ioffset;
  if (irootloc) *irootloc = bas->irootloc;
  PetscFunctionReturn(0);
}

/* An optimized PetscSFCreateEmbeddedSF. We aggressively reuse the communication already established on sf:
   one bcast on sf is all that is needed to build the embedded sf, with no further communication. Note that
   selected[] must be sorted before this routine is called.
 */
PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedSF_Basic(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
{
  PetscSF           esf;
  PetscInt          esf_nranks,esf_ndranks,*esf_roffset,*esf_rmine,*esf_rremote,count;
  PetscInt          i,j,k,p,q,nroots,*rootdata,*leafdata,connected_leaves,*new_ilocal,nranks,ndranks,niranks,ndiranks,minleaf,maxleaf,maxlocal;
  PetscMPIInt       *esf_ranks;
  const PetscMPIInt *ranks,*iranks;
  const PetscInt    *roffset,*rmine,*rremote,*ioffset,*irootloc,*buffer;
  PetscBool         connected;
  PetscSFPack       link;
  PetscSFNode       *new_iremote;
  PetscSF_Basic     *bas;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),&esf);CHKERRQ(ierr);
  ierr = PetscSFSetType(esf,PETSCSFBASIC);CHKERRQ(ierr); /* This optimized routine can only create a basic sf */

  /* Find out which leaves are still connected to roots in the embedded sf */
  ierr = PetscSFGetGraph(sf,&nroots,NULL,NULL,NULL);CHKERRQ(ierr);
  ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
  /* We abuse the term leafdata here: its size is usually the number of leaf data items, but here it is the number of leaves (always >= the number of leaf data items) */
  maxlocal = (minleaf > maxleaf)? 0 : maxleaf-minleaf+1; /* maxleaf=-1 and minleaf=0 when nleaves=0 */
  ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafdata);CHKERRQ(ierr);
  /* Tag selected roots */
  for (i=0; i<nselected; ++i) rootdata[selected[i]] = 1;

  /* Bcast from roots to leaves to tag connected leaves. We reuse the bcast communication already established in
     sf but skip the unpacking (from the leaf buffer to leafdata): the raw data in the leaf buffer is what we are
     interested in, since it tells us which leaves are connected to which ranks.
   */
  ierr = PetscSFBcastAndOpBegin_Basic(sf,MPIU_INT,PETSC_MEMTYPE_HOST,rootdata,PETSC_MEMTYPE_HOST,leafdata-minleaf,MPIU_REPLACE);CHKERRQ(ierr); /* Need to give leafdata but we won't use it */
  ierr = PetscSFPackGetInUse(sf,MPIU_INT,rootdata,leafdata-minleaf,PETSC_OWN_POINTER,&link);CHKERRQ(ierr);
  ierr = PetscSFPackWaitall(link,PETSCSF_ROOT2LEAF_BCAST);CHKERRQ(ierr);
  ierr = PetscSFGetLeafInfo_Basic(sf,&nranks,&ndranks,&ranks,&roffset,&rmine,&rremote);CHKERRQ(ierr); /* Get send info */
  esf_nranks = esf_ndranks = connected_leaves = 0;
  for (i=0; i<nranks; i++) {
    connected = PETSC_FALSE; /* Is the current process still connected to this remote root rank? */
    buffer    = i < ndranks? (PetscInt*)link->selfbuf[PETSC_MEMTYPE_HOST] : (PetscInt*)link->leafbuf[PETSC_MEMTYPE_HOST] + (roffset[i] - roffset[ndranks]);
    count     = roffset[i+1] - roffset[i];
    for (j=0; j<count; j++) {if (buffer[j]) {connected_leaves++; connected = PETSC_TRUE;}}
    if (connected) {esf_nranks++; if (i < ndranks) esf_ndranks++;}
  }

  /* Set graph of esf and also set up its outgoing communication (i.e., send info), which is usually done by PetscSFSetUpRanks */
  ierr = PetscMalloc1(connected_leaves,&new_ilocal);CHKERRQ(ierr);
  ierr = PetscMalloc1(connected_leaves,&new_iremote);CHKERRQ(ierr);
  ierr = PetscMalloc4(esf_nranks,&esf_ranks,esf_nranks+1,&esf_roffset,connected_leaves,&esf_rmine,connected_leaves,&esf_rremote);CHKERRQ(ierr);
  p    = 0; /* Counter for connected root ranks */
  q    = 0; /* Counter for connected leaves */
  esf_roffset[0] = 0;
  for (i=0; i<nranks; i++) { /* Scan leaf data again to fill esf arrays */
    buffer    = i < ndranks? (PetscInt*)link->selfbuf[PETSC_MEMTYPE_HOST] : (PetscInt*)link->leafbuf[PETSC_MEMTYPE_HOST] + (roffset[i] - roffset[ndranks]);
    connected = PETSC_FALSE;
    for (j=roffset[i],k=0; j<roffset[i+1]; j++,k++) {
      if (buffer[k]) {
        esf_rmine[q]         = new_ilocal[q] = rmine[j];
        esf_rremote[q]       = rremote[j];
        new_iremote[q].index = rremote[j];
        new_iremote[q].rank  = ranks[i];
        connected            = PETSC_TRUE;
        q++;
      }
    }
    if (connected) {
      esf_ranks[p]     = ranks[i];
      esf_roffset[p+1] = q;
      p++;
    }
  }
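
  /* A small, hypothetical example of the scan above (assuming ndranks = 0): with nranks = 2,
     roffset = {0,2,5} and a leaf buffer of {1,0,0,1,1}, rank 0 keeps its first leaf and rank 1
     its last two, giving connected_leaves = 3 and esf_roffset = {0,1,3}. */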

  ierr = PetscSFPackReclaim(sf,&link);CHKERRQ(ierr);

  /* SetGraph internally resets the SF, so we only set its fields after the call */
  ierr         = PetscSFSetGraph(esf,nroots,connected_leaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  esf->nranks  = esf_nranks;
  esf->ndranks = esf_ndranks;
  esf->ranks   = esf_ranks;
  esf->roffset = esf_roffset;
  esf->rmine   = esf_rmine;
  esf->rremote = esf_rremote;

  /* Set up the incoming communication (i.e., recv info) stored in esf->data, which is usually done by PetscSFSetUp_Basic */
  bas  = (PetscSF_Basic*)esf->data;
  ierr = PetscSFGetRootInfo_Basic(sf,&niranks,&ndiranks,&iranks,&ioffset,&irootloc);CHKERRQ(ierr); /* Get recv info */
  /* The embedded sf always has simpler communication than the original one, so we might allocate longer arrays
     than needed here. We expect these arrays to be short, so the waste is acceptable; the benefit is that we can
     fill them by parsing irootloc just once.
   */
  ierr = PetscMalloc2(niranks,&bas->iranks,niranks+1,&bas->ioffset);CHKERRQ(ierr);
  ierr = PetscMalloc1(ioffset[niranks],&bas->irootloc);CHKERRQ(ierr);
  bas->niranks = bas->ndiranks = bas->ioffset[0] = 0;
  p = 0; /* Counter for connected leaf ranks */
  q = 0; /* Counter for connected roots */
  for (i=0; i<niranks; i++) {
    connected = PETSC_FALSE; /* Is the current process still connected to this remote leaf rank? */
    for (j=ioffset[i]; j<ioffset[i+1]; j++) {
      PetscInt loc;
      ierr = PetscFindInt(irootloc[j],nselected,selected,&loc);CHKERRQ(ierr);
      if (loc >= 0) { /* Found in selected, so this root is still connected */
        bas->irootloc[q++] = irootloc[j];
        connected = PETSC_TRUE;
      }
    }
    if (connected) {
      bas->niranks++;
      if (i<ndiranks) bas->ndiranks++; /* Note that order of ranks (including distinguished ranks) is kept */
      bas->iranks[p]    = iranks[i];
      bas->ioffset[p+1] = q;
      p++;
    }
  }
  bas->itotal = q;

  /* Setup packing optimizations */
  ierr = PetscSFPackSetupOptimizations_Basic(esf);CHKERRQ(ierr);
  esf->setupcalled = PETSC_TRUE; /* We have done setup ourselves! */

  ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
  *newsf = esf;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedLeafSF_Basic(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
{
  PetscSF           esf;
  PetscInt          i,j,k,p,q,nroots,*rootdata,*leafdata,*new_ilocal,niranks,ndiranks,minleaf,maxleaf,maxlocal;
  const PetscInt    *ilocal,*ioffset,*irootloc,*buffer;
  const PetscMPIInt *iranks;
  PetscSFPack       link;
  PetscSFNode       *new_iremote;
  const PetscSFNode *iremote;
  PetscSF_Basic     *bas;
  MPI_Group         group;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),&esf);CHKERRQ(ierr);
  ierr = PetscSFSetType(esf,PETSCSFBASIC);CHKERRQ(ierr); /* This optimized routine can only create a basic sf */

  /* Set the graph of esf, which is easy for CreateEmbeddedLeafSF */
  ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
  ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
  ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
  for (i=0; i<nselected; i++) {
    const PetscInt l     = selected[i];
    new_ilocal[i]        = ilocal ? ilocal[l] : l;
    new_iremote[i].rank  = iremote[l].rank;
    new_iremote[i].index = iremote[l].index;
  }

  /* Tag selected leaves before PetscSFSetGraph, since new_ilocal may no longer be usable after we hand it off with PETSC_OWN_POINTER below */
  maxlocal = (minleaf > maxleaf)? 0 : maxleaf-minleaf+1; /* maxleaf=-1 and minleaf=0 when nleaves=0 */
  ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafdata);CHKERRQ(ierr);
  for (i=0; i<nselected; i++) leafdata[new_ilocal[i]-minleaf] = 1; /* -minleaf to adjust indices according to minleaf */

  ierr = PetscSFSetGraph(esf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);

  /* Set up the outgoing communication (i.e., send info). We cannot reuse rmine etc. from sf since there is no way
     to map rmine[i] (ilocal of leaves) back to selected[j] (leaf indices).
   */
  ierr = MPI_Comm_group(PETSC_COMM_SELF,&group);CHKERRQ(ierr);
  ierr = PetscSFSetUpRanks(esf,group);CHKERRQ(ierr);
  ierr = MPI_Group_free(&group);CHKERRQ(ierr);

  /* Set up the incoming communication (i.e., recv info) */
  ierr = PetscSFGetRootInfo_Basic(sf,&niranks,&ndiranks,&iranks,&ioffset,&irootloc);CHKERRQ(ierr);
  bas  = (PetscSF_Basic*)esf->data;
  ierr = PetscMalloc2(niranks,&bas->iranks,niranks+1,&bas->ioffset);CHKERRQ(ierr);
  ierr = PetscMalloc1(ioffset[niranks],&bas->irootloc);CHKERRQ(ierr);

  /* Pass info about selected leaves to root buffer */
  ierr = PetscSFReduceBegin_Basic(sf,MPIU_INT,PETSC_MEMTYPE_HOST,leafdata-minleaf,PETSC_MEMTYPE_HOST,rootdata,MPIU_REPLACE);CHKERRQ(ierr); /* -minleaf to re-adjust start address of leafdata */
  ierr = PetscSFPackGetInUse(sf,MPIU_INT,rootdata,leafdata-minleaf,PETSC_OWN_POINTER,&link);CHKERRQ(ierr);
  ierr = PetscSFPackWaitall(link,PETSCSF_LEAF2ROOT_REDUCE);CHKERRQ(ierr);

  bas->niranks = bas->ndiranks = bas->ioffset[0] = 0;
  p = 0; /* Counter for connected leaf ranks */
  q = 0; /* Counter for connected roots */
  for (i=0; i<niranks; i++) {
    PetscBool connected = PETSC_FALSE; /* Is the current process still connected to this remote leaf rank? */
    buffer = i < ndiranks? (PetscInt*)link->selfbuf[PETSC_MEMTYPE_HOST] : (PetscInt*)link->rootbuf[PETSC_MEMTYPE_HOST] + (ioffset[i] - ioffset[ndiranks]);
    for (j=ioffset[i],k=0; j<ioffset[i+1]; j++,k++) {
      if (buffer[k]) {bas->irootloc[q++] = irootloc[j]; connected = PETSC_TRUE;}
    }
    if (connected) {
      bas->niranks++;
      if (i<ndiranks) bas->ndiranks++;
      bas->iranks[p]    = iranks[i];
      bas->ioffset[p+1] = q;
      p++;
    }
  }
  bas->itotal = q;
  ierr = PetscSFPackReclaim(sf,&link);CHKERRQ(ierr);

  /* Setup packing optimizations */
  ierr = PetscSFPackSetupOptimizations_Basic(esf);CHKERRQ(ierr);
  esf->setupcalled = PETSC_TRUE; /* We have done setup ourselves! */

  ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
  *newsf = esf;
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode PetscSFCreate_Basic(PetscSF sf)
{
  PetscSF_Basic  *dat;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  sf->ops->SetUp                = PetscSFSetUp_Basic;
  sf->ops->SetFromOptions       = PetscSFSetFromOptions_Basic;
  sf->ops->Reset                = PetscSFReset_Basic;
  sf->ops->Destroy              = PetscSFDestroy_Basic;
  sf->ops->View                 = PetscSFView_Basic;
  sf->ops->BcastAndOpBegin      = PetscSFBcastAndOpBegin_Basic;
  sf->ops->BcastAndOpEnd        = PetscSFBcastAndOpEnd_Basic;
  sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
  sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
  sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
  sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
  sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
  sf->ops->CreateEmbeddedSF     = PetscSFCreateEmbeddedSF_Basic;
  sf->ops->CreateEmbeddedLeafSF = PetscSFCreateEmbeddedLeafSF_Basic;

  ierr = PetscNewLog(sf,&dat);CHKERRQ(ierr);
  sf->data = (void*)dat;
  PetscFunctionReturn(0);
}
665