#if !defined(__SFBASIC_H)
#define __SFBASIC_H

#include <../src/vec/is/sf/impls/basic/sfpack.h>

typedef enum {PETSCSF_LEAF2ROOT_REDUCE=0, PETSCSF_ROOT2LEAF_BCAST=1} PetscSFDirection;

#define SFBASICHEADER \
  PetscMPIInt      niranks;         /* Number of incoming ranks (ranks accessing my roots) */                                       \
  PetscMPIInt      ndiranks;        /* Number of incoming ranks (ranks accessing my roots) in distinguished set */                  \
  PetscMPIInt      *iranks;         /* Array of ranks that reference my roots */                                                    \
  PetscInt         itotal;          /* Total number of graph edges referencing my roots */                                          \
  PetscInt         *ioffset;        /* Array of length niranks+1 holding offset in irootloc[] for each rank */                      \
  PetscInt         *irootloc;       /* Incoming roots referenced by ranks starting at ioffset[rank] */                              \
  PetscInt         *irootloc_d;     /* irootloc in device memory when needed */                                                     \
  PetscSFPackOpt   rootpackopt;     /* Optimization plans to (un)pack roots based on patterns in irootloc[]. NULL for no plans */   \
  PetscSFPackOpt   selfrootpackopt; /* Optimization plans to (un)pack roots connected to local leaves */                            \
  PetscSFPack      avail;           /* One or more entries per MPI Datatype, lazily constructed */                                  \
  PetscSFPack      inuse;           /* Buffers being used for transactions that have not yet completed */                           \
  PetscBool        selfrootdups;    /* Indices of roots in irootloc[0,ioffset[ndiranks]) have dups, implying threads working ... */ \
                                    /* ... on these roots in parallel may have a data race. */                                      \
  PetscBool        remoterootdups   /* Indices of roots in irootloc[ioffset[ndiranks],ioffset[niranks]) have dups */

typedef struct {
  SFBASICHEADER;
} PetscSF_Basic;

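/* An illustrative layout with hypothetical numbers (not taken from any real SF): suppose two
   ranks reference my roots and one of them is in the distinguished (self) set. Then we might have

     niranks  = 2;  ndiranks = 1;  itotal = 5;
     iranks   = {0, 3};            <- rank 0 is self, rank 3 is remote
     ioffset  = {0, 2, 5};         <- CSR-style offsets into irootloc[]
     irootloc = {1, 4, 0, 2, 2};   <- local indices of my referenced roots

   Entries for rank iranks[r] live in irootloc[ioffset[r],ioffset[r+1]) and itotal = ioffset[niranks].
   The duplicated index 2 in the remote part would set remoterootdups = PETSC_TRUE. */
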
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootInfo_Basic(PetscSF sf,PetscInt *nrootranks,PetscInt *ndrootranks,const PetscMPIInt **rootranks,const PetscInt **rootoffset,const PetscInt **rootloc)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (nrootranks)  *nrootranks  = bas->niranks;
  if (ndrootranks) *ndrootranks = bas->ndiranks;
  if (rootranks)   *rootranks   = bas->iranks;
  if (rootoffset)  *rootoffset  = bas->ioffset;
  if (rootloc)     *rootloc     = bas->irootloc;
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafInfo_Basic(PetscSF sf,PetscInt *nleafranks,PetscInt *ndleafranks,const PetscMPIInt **leafranks,const PetscInt **leafoffset,const PetscInt **leafloc,const PetscInt **leafrremote)
{
  PetscFunctionBegin;
  if (nleafranks)  *nleafranks  = sf->nranks;
  if (ndleafranks) *ndleafranks = sf->ndranks;
  if (leafranks)   *leafranks   = sf->ranks;
  if (leafoffset)  *leafoffset  = sf->roffset;
  if (leafloc)     *leafloc     = sf->rmine;
  if (leafrremote) *leafrremote = sf->rremote;
  PetscFunctionReturn(0);
}

/* Get root locations on either the host or the device */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootIndicesWithMemType_Basic(PetscSF sf,PetscMemType mtype, const PetscInt **rootloc)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
  PetscFunctionBegin;
  if (rootloc) {
    if (mtype == PETSC_MEMTYPE_HOST) *rootloc = bas->irootloc;
#if defined(PETSC_HAVE_CUDA)
    else if (mtype == PETSC_MEMTYPE_DEVICE) {
      if (!bas->irootloc_d) { /* Lazily build the device mirror of irootloc[] on first use */
        cudaError_t err;
        size_t      size = bas->ioffset[bas->niranks]*sizeof(PetscInt);
        err = cudaMalloc((void **)&bas->irootloc_d,size);CHKERRCUDA(err);
        err = cudaMemcpy(bas->irootloc_d,bas->irootloc,size,cudaMemcpyHostToDevice);CHKERRCUDA(err);
      }
      *rootloc = bas->irootloc_d;
    }
#endif
    else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d when getting rootloc",(int)mtype);
  }
  PetscFunctionReturn(0);
}

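/* A minimal usage sketch (hypothetical caller) of the lazy device mirror above: the CUDA copy of
   irootloc[] is allocated and filled on first request, cached in bas->irootloc_d, and reused later.

     const PetscInt *rootloc;
     ierr = PetscSFGetRootIndicesWithMemType_Basic(sf,PETSC_MEMTYPE_DEVICE,&rootloc);CHKERRQ(ierr);
     // rootloc now points to device memory and can be handed to a CUDA packing kernel
 */
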
/* Get leaf locations on either the host (CPU) or the device (GPU) */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafIndicesWithMemType_Basic(PetscSF sf,PetscMemType mtype, const PetscInt **leafloc)
{
  PetscFunctionBegin;
  if (leafloc) {
    if (mtype == PETSC_MEMTYPE_HOST) *leafloc = sf->rmine;
#if defined(PETSC_HAVE_CUDA)
    else if (mtype == PETSC_MEMTYPE_DEVICE) {
      if (!sf->rmine_d) { /* Lazily build the device mirror of rmine[] on first use */
        cudaError_t err;
        size_t      size = sf->roffset[sf->nranks]*sizeof(PetscInt);
        err = cudaMalloc((void **)&sf->rmine_d,size);CHKERRCUDA(err);
        err = cudaMemcpy(sf->rmine_d,sf->rmine,size,cudaMemcpyHostToDevice);CHKERRCUDA(err);
      }
      *leafloc = sf->rmine_d;
    }
#endif
    else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d when getting leafloc",(int)mtype);
  }
  PetscFunctionReturn(0);
}

/* A convenience struct to provide info to the following (un)packing routines so that we can treat packing to self and to remote ranks in one loop. */
typedef struct _n_PackInfo {
  PetscInt       count;  /* Number of entries to pack, unpack etc. */
  const PetscInt *idx;   /* Indices of the entries. NULL means the contiguous indices [0,count) */
  PetscSFPackOpt opt;    /* Pack optimizations */
  char           *buf;   /* The contiguous buffer where we pack to or unpack from */
  PetscBool      atomic; /* Whether the unpack routine needs to use atomics */
} PackInfo;

/* Utility routine to pack selected entries of rootdata into the root buffer.
  Input Arguments:
  + sf       - The SF this packing works on.
  . link     - The PetscSFPack, which gives the memtype of the roots and also provides the root buffer.
  . rootloc  - Indices of the roots, only meaningful if the root space is sparse.
  . rootdata - Where to read the roots.
  - sparse   - Is the root space sparse (for SFBasic, SFNeighbor) or dense (for SFAllgatherv etc.)?
 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,const void *rootdata,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;
  PetscErrorCode (*Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*);
  PackInfo       p[2];

  PetscFunctionBegin;
  if (sparse) {
    /* For SFBasic and SFNeighbor, whose root space is sparse and which have separate buffers for self and remote. */
    p[0].count  = bas->ioffset[bas->ndiranks];    p[1].count  = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks];
    p[0].idx    = rootloc;                        p[1].idx    = rootloc+bas->ioffset[bas->ndiranks];
    p[0].opt    = bas->selfrootpackopt;           p[1].opt    = bas->rootpackopt;
    p[0].buf    = link->selfbuf[link->rootmtype]; p[1].buf    = link->rootbuf[link->rootmtype];
    p[0].atomic = PETSC_FALSE;                    p[1].atomic = PETSC_FALSE;
  } else {
    /* For SFAllgatherv etc. collectives, which have a dense root space and do not differentiate self/remote communication. */
    p[0].count  = sf->nroots;
    p[0].idx    = NULL;
    p[0].opt    = NULL;
    p[0].buf    = link->rootbuf[link->rootmtype];
    p[0].atomic = PETSC_FALSE;
    p[1].count  = 0;
  }

  ierr = PetscSFPackGetPack(link,link->rootmtype,&Pack);CHKERRQ(ierr);
  /* Only do packing when count != 0 so that we can avoid invoking CUDA kernels on the GPU. */
  for (i=0; i<2; i++) {if (p[i].count) {ierr = (*Pack)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);}}
#if defined(PETSC_HAVE_CUDA)
  /* We do not know whether MPI synchronizes the device on entry, so assume it does not. We then must sync the packing kernel to make rootbuf & selfbuf ready for MPI and the self-to-self copy. */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

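/* A hedged usage sketch (not an actual call site; those live in the SF implementation .c files):
   a broadcast-begin routine would pack roots and then start communication, roughly as in

     ierr = PetscSFPackRootData(sf,link,rootloc,rootdata,PETSC_TRUE);CHKERRQ(ierr); // fill selfbuf & rootbuf
     // ... post MPI sends from link->rootbuf and unpack link->selfbuf into local leaves ...
 */
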
/* Utility routine to pack selected entries of leafdata into the leaf buffer */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackLeafData(PetscSF sf,PetscSFPack link,const PetscInt *leafloc,const void *leafdata,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscErrorCode (*Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*);
  PackInfo       p[2];

  PetscFunctionBegin;
  if (sparse) {
    p[0].count  = sf->roffset[sf->ndranks];       p[1].count  = sf->roffset[sf->nranks]-sf->roffset[sf->ndranks];
    p[0].idx    = leafloc;                        p[1].idx    = leafloc+sf->roffset[sf->ndranks];
    p[0].opt    = sf->selfleafpackopt;            p[1].opt    = sf->leafpackopt;
    p[0].buf    = link->selfbuf[link->leafmtype]; p[1].buf    = link->leafbuf[link->leafmtype];
    p[0].atomic = PETSC_FALSE;                    p[1].atomic = PETSC_FALSE;
  } else {
    p[0].count  = sf->nleaves;
    p[0].idx    = NULL;
    p[0].opt    = NULL;
    p[0].buf    = link->leafbuf[link->leafmtype];
    p[0].atomic = PETSC_FALSE;
    p[1].count  = 0;
  }

  ierr = PetscSFPackGetPack(link,link->leafmtype,&Pack);CHKERRQ(ierr);
  for (i=0; i<2; i++) {if (p[i].count) {ierr = (*Pack)(p[i].count,p[i].idx,link,p[i].opt,leafdata,p[i].buf);CHKERRQ(ierr);}}
#if defined(PETSC_HAVE_CUDA)
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

/* Utility routine to unpack data from the root buffer and Op it into selected entries of rootdata */
PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,void *rootdata,MPI_Op op,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
  PackInfo       p[2];

  PetscFunctionBegin;
  if (sparse) {
    p[0].count  = bas->ioffset[bas->ndiranks];    p[1].count  = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks];
    p[0].idx    = rootloc;                        p[1].idx    = rootloc+bas->ioffset[bas->ndiranks];
    p[0].opt    = bas->selfrootpackopt;           p[1].opt    = bas->rootpackopt;
    p[0].buf    = link->selfbuf[link->rootmtype]; p[1].buf    = link->rootbuf[link->rootmtype];
    p[0].atomic = bas->selfrootdups;              p[1].atomic = bas->remoterootdups;
  } else {
    p[0].count  = sf->nroots;
    p[0].idx    = NULL;
    p[0].opt    = NULL;
    p[0].buf    = link->rootbuf[link->rootmtype];
    p[0].atomic = PETSC_FALSE;
    p[1].count  = 0;
  }

  for (i=0; i<2; i++) {
    if (p[i].count) {
      ierr = PetscSFPackGetUnpackAndOp(link,link->rootmtype,op,p[i].atomic,&UnpackAndOp);CHKERRQ(ierr);
      if (UnpackAndOp) {ierr = (*UnpackAndOp)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
      else {
        PetscInt    j;
        PetscMPIInt n;
        if (p[i].idx) {
          /* Note that if done on a GPU, this can be very slow due to the huge number of kernel calls. The op is likely user-defined. We must
             use link->unit (instead of link->basicunit) as the datatype and 1 (instead of link->bs) as the count in MPI_Reduce_local.
           */
          for (j=0; j<p[i].count; j++) {ierr = MPI_Reduce_local(p[i].buf+j*link->unitbytes,(char *)rootdata+p[i].idx[j]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
        } else {
          ierr = PetscMPIIntCast(p[i].count,&n);CHKERRQ(ierr);
          ierr = MPI_Reduce_local(p[i].buf,rootdata,n,link->unit,op);CHKERRQ(ierr);
        }
      }
#else
      else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
    }
  }
#if defined(PETSC_HAVE_CUDA)
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

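/* For reference, the fallback above relies on MPI_Reduce_local(inbuf,inoutbuf,count,datatype,op),
   which computes inoutbuf = inbuf op inoutbuf element-wise. A tiny example with op = MPI_SUM,
   count = 1 and hypothetical values:

     before: p[i].buf = {3}, rootdata[idx[j]] = 5
     after :                 rootdata[idx[j]] = 8
 */
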
/* Utility routine to unpack data from the leaf buffer and Op it into selected entries of leafdata */
PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpLeafData(PetscSF sf,PetscSFPack link,const PetscInt *leafloc,void *leafdata,MPI_Op op,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
  PackInfo       p[2];

  PetscFunctionBegin;
  if (sparse) {
    p[0].count  = sf->roffset[sf->ndranks];       p[1].count  = sf->roffset[sf->nranks]-sf->roffset[sf->ndranks];
    p[0].idx    = leafloc;                        p[1].idx    = leafloc+sf->roffset[sf->ndranks];
    p[0].opt    = sf->selfleafpackopt;            p[1].opt    = sf->leafpackopt;
    p[0].buf    = link->selfbuf[link->leafmtype]; p[1].buf    = link->leafbuf[link->leafmtype];
    p[0].atomic = sf->selfleafdups;               p[1].atomic = sf->remoteleafdups;
  } else {
    p[0].count  = sf->nleaves;
    p[0].idx    = NULL;
    p[0].opt    = NULL;
    p[0].buf    = link->leafbuf[link->leafmtype];
    p[0].atomic = PETSC_FALSE;
    p[1].count  = 0;
  }

  for (i=0; i<2; i++) {
    if (p[i].count) {
      ierr = PetscSFPackGetUnpackAndOp(link,link->leafmtype,op,p[i].atomic,&UnpackAndOp);CHKERRQ(ierr);
      if (UnpackAndOp) {ierr = (*UnpackAndOp)(p[i].count,p[i].idx,link,p[i].opt,leafdata,p[i].buf);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
      else {
        PetscInt    j;
        PetscMPIInt n;
        if (p[i].idx) {
          for (j=0; j<p[i].count; j++) {ierr = MPI_Reduce_local(p[i].buf+j*link->unitbytes,(char *)leafdata+p[i].idx[j]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
        } else {
          ierr = PetscMPIIntCast(p[i].count,&n);CHKERRQ(ierr);
          ierr = MPI_Reduce_local(p[i].buf,leafdata,n,link->unit,op);CHKERRQ(ierr);
        }
      }
#else
      else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
    }
  }
#if defined(PETSC_HAVE_CUDA)
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

/* Utility routine to fetch and Op selected entries of rootdata */
PETSC_STATIC_INLINE PetscErrorCode PetscSFFetchAndOpRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,void *rootdata,MPI_Op op,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*);
  PackInfo       p[2];

  PetscFunctionBegin;
  if (sparse) {
    /* For SFBasic and SFNeighbor, whose root space is sparse and which have separate buffers for self and remote. */
    p[0].count  = bas->ioffset[bas->ndiranks];    p[1].count  = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks];
    p[0].idx    = rootloc;                        p[1].idx    = rootloc+bas->ioffset[bas->ndiranks];
    p[0].opt    = bas->selfrootpackopt;           p[1].opt    = bas->rootpackopt;
    p[0].buf    = link->selfbuf[link->rootmtype]; p[1].buf    = link->rootbuf[link->rootmtype];
    p[0].atomic = bas->selfrootdups;              p[1].atomic = bas->remoterootdups;
  } else {
    p[0].count  = sf->nroots;
    p[0].idx    = NULL;
    p[0].opt    = NULL;
    p[0].buf    = link->rootbuf[link->rootmtype];
    p[0].atomic = PETSC_FALSE;
    p[1].count  = 0;
  }

  for (i=0; i<2; i++) {
    if (p[i].count) {
      ierr = PetscSFPackGetFetchAndOp(link,link->rootmtype,op,p[i].atomic,&FetchAndOp);CHKERRQ(ierr);
      ierr = (*FetchAndOp)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);
    }
  }
#if defined(PETSC_HAVE_CUDA)
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

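/* A sketch of the fetch-and-op semantics with op = MPI_SUM on one entry (hypothetical values):
   the kernel stores the old root value in the buffer while applying the op to the root, i.e.

     before: rootdata[idx[j]] = 5, buf = {3}
     after : rootdata[idx[j]] = 8, buf = {5}   <- buf now holds the old root value

   which is what PetscSFFetchAndOpBegin/End need to produce leafupdate. */
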
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackSetupOptimizations_Basic(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  ierr = PetscSFPackOptCreate(sf->ndranks,               sf->roffset,               sf->rmine,    &sf->selfleafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptCreate(sf->nranks-sf->ndranks,    sf->roffset+sf->ndranks,   sf->rmine,    &sf->leafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptCreate(bas->ndiranks,             bas->ioffset,              bas->irootloc,&bas->selfrootpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptCreate(bas->niranks-bas->ndiranks,bas->ioffset+bas->ndiranks,bas->irootloc,&bas->rootpackopt);CHKERRQ(ierr);

#if defined(PETSC_HAVE_CUDA)
  /* Check for duplicates in rmine[] and irootloc[] so that CUDA packing kernels can use cheaper regular operations
     instead of atomics to unpack data on leaves/roots when they know there is no data race.
   */
  ierr = PetscCheckDupsInt(sf->roffset[sf->ndranks],                              sf->rmine,                                &sf->selfleafdups);CHKERRQ(ierr);
  ierr = PetscCheckDupsInt(sf->roffset[sf->nranks]-sf->roffset[sf->ndranks],      sf->rmine+sf->roffset[sf->ndranks],       &sf->remoteleafdups);CHKERRQ(ierr);
  ierr = PetscCheckDupsInt(bas->ioffset[bas->ndiranks],                           bas->irootloc,                            &bas->selfrootdups);CHKERRQ(ierr);
  ierr = PetscCheckDupsInt(bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks],bas->irootloc+bas->ioffset[bas->ndiranks],&bas->remoterootdups);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

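/* A toy example (hypothetical indices) of why the dup checks matter: if the remote part of
   irootloc[] were {0, 2, 2}, two threads unpacking in parallel could both update root 2, so
   remoterootdups would be PETSC_TRUE and the unpack kernels would fall back to atomics. */
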
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackDestroyOptimizations_Basic(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  ierr = PetscSFPackOptDestroy(&sf->leafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptDestroy(&sf->selfleafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptDestroy(&bas->rootpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptDestroy(&bas->selfrootpackopt);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  /* Reset to the safe default: assume there are dups so that atomics are used. */
  sf->selfleafdups    = PETSC_TRUE;
  sf->remoteleafdups  = PETSC_TRUE;
  bas->selfrootdups   = PETSC_TRUE;
  bas->remoterootdups = PETSC_TRUE;
#endif
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFPackWaitall_Basic(PetscSFPack link,PetscSFDirection direction)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MPI_Waitall(link->nrootreqs,link->rootreqs[direction][link->rootmtype],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = MPI_Waitall(link->nleafreqs,link->leafreqs[direction][link->leafmtype],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFSetUp_Basic(PetscSF);
PETSC_INTERN PetscErrorCode PetscSFView_Basic(PetscSF,PetscViewer);
PETSC_INTERN PetscErrorCode PetscSFReset_Basic(PetscSF);
PETSC_INTERN PetscErrorCode PetscSFDestroy_Basic(PetscSF);
PETSC_INTERN PetscErrorCode PetscSFBcastAndOpEnd_Basic  (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,      void*,      MPI_Op);
PETSC_INTERN PetscErrorCode PetscSFReduceEnd_Basic      (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,      void*,      MPI_Op);
PETSC_INTERN PetscErrorCode PetscSFFetchAndOpBegin_Basic(PetscSF,MPI_Datatype,PetscMemType,      void*,PetscMemType,const void*,void*,MPI_Op);
PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*);
PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedLeafSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*);
PETSC_INTERN PetscErrorCode PetscSFGetLeafRanks_Basic(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**);
PETSC_INTERN PetscErrorCode PetscSFPackGet_Basic_Common(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,PetscInt,PetscInt,PetscSFPack*);
#endif