xref: /petsc/src/vec/is/sf/impls/basic/sfbasic.h (revision b7c0d12a26d4b426015009c8f0a095fd1bca8609)
#if !defined(__SFBASIC_H)
#define __SFBASIC_H

#include <../src/vec/is/sf/impls/basic/sfpack.h>

#define SFBASICHEADER \
  PetscMPIInt      niranks;         /* Number of incoming ranks (ranks accessing my roots) */                                       \
  PetscMPIInt      ndiranks;        /* Number of incoming ranks (ranks accessing my roots) in the distinguished set */              \
  PetscMPIInt      *iranks;         /* Array of ranks that reference my roots */                                                    \
  PetscInt         itotal;          /* Total number of graph edges referencing my roots */                                          \
  PetscInt         *ioffset;        /* Array of length niranks+1 holding offsets into irootloc[] for each rank */                   \
  PetscInt         *irootloc;       /* Incoming roots referenced by ranks, starting at ioffset[rank] */                             \
  PetscInt         *irootloc_d;     /* irootloc in device memory when needed */                                                     \
  PetscSFPackOpt   rootpackopt;     /* Optimization plans to (un)pack roots based on patterns in irootloc[]. NULL for no plans */   \
  PetscSFPackOpt   selfrootpackopt; /* Optimization plans to (un)pack roots connected to local leaves */                            \
  PetscSFPack      avail;           /* One or more entries per MPI Datatype, lazily constructed */                                  \
  PetscSFPack      inuse;           /* Buffers being used for transactions that have not yet completed */                           \
  PetscBool        selfrootdups;    /* Indices of roots in irootloc[0,ioffset[ndiranks]) have dups, implying threads working ... */ \
                                    /* ... on these roots in parallel may have a data race. */                                      \
  PetscBool        remoterootdups   /* Indices of roots in irootloc[ioffset[ndiranks],ioffset[niranks]) have dups */

typedef struct {
  SFBASICHEADER;
} PetscSF_Basic;
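
/* The (ioffset,irootloc) pair above forms a CSR-like layout: rank iranks[r] accesses
   the roots irootloc[ioffset[r]..ioffset[r+1]). A minimal traversal sketch, compiled
   out since it only illustrates the layout; it assumes a fully set-up PetscSF_Basic *bas. */
#if 0
  {
    PetscMPIInt r;
    PetscInt    j,root;
    for (r=0; r<bas->niranks; r++) {        /* ranks r < bas->ndiranks form the distinguished (self) set */
      for (j=bas->ioffset[r]; j<bas->ioffset[r+1]; j++) {
        root = bas->irootloc[j];            /* a root referenced by rank bas->iranks[r] */
      }
    }
  }
#endif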

PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootInfo_Basic(PetscSF sf,PetscInt *nrootranks,PetscInt *ndrootranks,const PetscMPIInt **rootranks,const PetscInt **rootoffset,const PetscInt **rootloc)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (nrootranks)  *nrootranks  = bas->niranks;
  if (ndrootranks) *ndrootranks = bas->ndiranks;
  if (rootranks)   *rootranks   = bas->iranks;
  if (rootoffset)  *rootoffset  = bas->ioffset;
  if (rootloc)     *rootloc     = bas->irootloc;
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafInfo_Basic(PetscSF sf,PetscInt *nleafranks,PetscInt *ndleafranks,const PetscMPIInt **leafranks,const PetscInt **leafoffset,const PetscInt **leafloc,const PetscInt **leafrremote)
{
  PetscFunctionBegin;
  if (nleafranks)  *nleafranks  = sf->nranks;
  if (ndleafranks) *ndleafranks = sf->ndranks;
  if (leafranks)   *leafranks   = sf->ranks;
  if (leafoffset)  *leafoffset  = sf->roffset;
  if (leafloc)     *leafloc     = sf->rmine;
  if (leafrremote) *leafrremote = sf->rremote;
  PetscFunctionReturn(0);
}
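
/* A minimal usage sketch of the query routines above, compiled out; it assumes an
   already set-up SFBasic-derived sf. Any subset of the output arguments may be NULL. */
#if 0
  {
    PetscInt          nrootranks,ndrootranks;
    const PetscMPIInt *rootranks;
    const PetscInt    *rootoffset,*rootloc;
    ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,&rootranks,&rootoffset,&rootloc);CHKERRQ(ierr);
    /* ranks [0,ndrootranks) form the distinguished (self) set; [ndrootranks,nrootranks) are remote */
  }
#endif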

/* Get root locations either on Host or Device */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootIndicesWithMemType_Basic(PetscSF sf,PetscMemType mtype,const PetscInt **rootloc)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
  PetscFunctionBegin;
  if (rootloc) {
    if (mtype == PETSC_MEMTYPE_HOST) *rootloc = bas->irootloc;
#if defined(PETSC_HAVE_CUDA)
    else if (mtype == PETSC_MEMTYPE_DEVICE) {
      if (!bas->irootloc_d) { /* Lazily mirror irootloc[] to device memory on first request */
        cudaError_t err;
        size_t      size = bas->ioffset[bas->niranks]*sizeof(PetscInt);
        err = cudaMalloc((void **)&bas->irootloc_d,size);CHKERRCUDA(err);
        err = cudaMemcpy(bas->irootloc_d,bas->irootloc,size,cudaMemcpyHostToDevice);CHKERRCUDA(err);
      }
      *rootloc = bas->irootloc_d;
    }
#endif
    else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType in getting rootloc %d",(int)mtype);
  }
  PetscFunctionReturn(0);
}

/* Get leaf locations either on Host (CPU) or Device (GPU) */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafIndicesWithMemType_Basic(PetscSF sf,PetscMemType mtype,const PetscInt **leafloc)
{
  PetscFunctionBegin;
  if (leafloc) {
    if (mtype == PETSC_MEMTYPE_HOST) *leafloc = sf->rmine;
#if defined(PETSC_HAVE_CUDA)
    else if (mtype == PETSC_MEMTYPE_DEVICE) {
      if (!sf->rmine_d) { /* Lazily mirror rmine[] to device memory on first request */
        cudaError_t err;
        size_t      size = sf->roffset[sf->nranks]*sizeof(PetscInt);
        err = cudaMalloc((void **)&sf->rmine_d,size);CHKERRCUDA(err);
        err = cudaMemcpy(sf->rmine_d,sf->rmine,size,cudaMemcpyHostToDevice);CHKERRCUDA(err);
      }
      *leafloc = sf->rmine_d;
    }
#endif
    else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType in getting leafloc %d",(int)mtype);
  }
  PetscFunctionReturn(0);
}
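
/* A minimal usage sketch, compiled out: fetch leaf indices resident on the device. The
   first call lazily allocates and fills the device mirror (CUDA build and a set-up sf assumed). */
#if 0
  {
    const PetscInt *leafloc_d = NULL;
    ierr = PetscSFGetLeafIndicesWithMemType_Basic(sf,PETSC_MEMTYPE_DEVICE,&leafloc_d);CHKERRQ(ierr);
    /* leafloc_d now points to device memory holding a copy of sf->rmine */
  }
#endif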

/* A convenience struct that feeds the following (un)packing routines, so that packing to self and packing to remote ranks can be treated in one loop. */
typedef struct _n_PackInfo {
  PetscInt       count;  /* Number of entries to pack, unpack etc. */
  const PetscInt *idx;   /* Indices of the entries. NULL for contiguous indices [0,count) */
  PetscSFPackOpt opt;    /* Pack optimizations */
  char           *buf;   /* The contiguous buffer where we pack to or unpack from */
  PetscBool      atomic; /* Whether the unpack routine needs to use atomics */
} PackInfo;

/* Utility routine to pack selected entries of rootdata into the root buffer.
  Input Arguments:
  + sf       - The SF this packing works on.
  . link     - The PetscSFPack, which gives the memtype of the roots and also provides the root buffer.
  . rootloc  - Indices of the roots, only meaningful if the root space is sparse
  . rootdata - Where to read the roots.
  - sparse   - Is the root space sparse (for SFBasic, SFNeighbor) or dense (for SFAllgatherv etc.)
 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,const void *rootdata,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;
  PetscErrorCode (*Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*);
  PackInfo       p[2];

  PetscFunctionBegin;
  if (sparse) {
    /* For SFBasic and SFNeighbor, which have a sparse root space and separate buffers for self and remote. */
    p[0].count  = bas->ioffset[bas->ndiranks];    p[1].count  = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks];
    p[0].idx    = rootloc;                        p[1].idx    = rootloc+bas->ioffset[bas->ndiranks];
    p[0].opt    = bas->selfrootpackopt;           p[1].opt    = bas->rootpackopt;
    p[0].buf    = link->selfbuf[link->rootmtype]; p[1].buf    = link->rootbuf[link->rootmtype];
    p[0].atomic = PETSC_FALSE;                    p[1].atomic = PETSC_FALSE;
  } else {
    /* For SFAllgatherv etc. collectives, which have a dense root space and do not differentiate self/remote communication. */
    p[0].count  = 0; /* Set to 0 to skip self, in other words, do not single out self communication */
    p[1].count  = sf->nroots;
    p[1].idx    = NULL;
    p[1].opt    = NULL;
    p[1].buf    = link->rootbuf[link->rootmtype];
    p[1].atomic = PETSC_FALSE;
  }

  ierr = PetscSFPackGetPack(link,link->rootmtype,&Pack);CHKERRQ(ierr);
  /* Only pack when count != 0 so that we avoid launching empty CUDA kernels */
  for (i=0; i<2; i++) {if (p[i].count) {ierr = (*Pack)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);}}

#if defined(PETSC_HAVE_CUDA)
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) { /* We only care about p[1], which is for remote and involves MPI */
    /* Without use_gpu_aware_mpi, we need to copy the rootbuf on the device to the rootbuf on the host. The cudaStreamSynchronize()
       makes sure the rootbuf is ready before MPI communication starts.
     */
    cudaError_t err;
    if (!use_gpu_aware_mpi) {err = cudaMemcpyAsync(link->rootbuf[PETSC_MEMTYPE_HOST],link->rootbuf[PETSC_MEMTYPE_DEVICE],link->rootbuflen*link->unitbytes,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(err);}
    err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);
  }
#endif
  PetscFunctionReturn(0);
}
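
/* A compiled-out sketch of how a communication-begin routine might drive the packer above.
   The surrounding setup (obtaining link and rootdata) is elided, and the driver shape is
   hypothetical, not the actual call sites. */
#if 0
  {
    const PetscInt *rootloc = NULL;
    ierr = PetscSFGetRootIndicesWithMemType_Basic(sf,link->rootmtype,&rootloc);CHKERRQ(ierr);
    ierr = PetscSFPackRootData(sf,link,rootloc,rootdata,PETSC_TRUE);CHKERRQ(ierr); /* sparse root space */
    /* ... then start the MPI sends/receives recorded on link ... */
  }
#endif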

/* Utility routine to pack selected entries of leafdata into the leaf buffer */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackLeafData(PetscSF sf,PetscSFPack link,const PetscInt *leafloc,const void *leafdata,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscErrorCode (*Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*);
  PackInfo       p[2];

  PetscFunctionBegin;
  if (sparse) {
    p[0].count  = sf->roffset[sf->ndranks];       p[1].count  = sf->roffset[sf->nranks]-sf->roffset[sf->ndranks];
    p[0].idx    = leafloc;                        p[1].idx    = leafloc+sf->roffset[sf->ndranks];
    p[0].opt    = sf->selfleafpackopt;            p[1].opt    = sf->leafpackopt;
    p[0].buf    = link->selfbuf[link->leafmtype]; p[1].buf    = link->leafbuf[link->leafmtype];
    p[0].atomic = PETSC_FALSE;                    p[1].atomic = PETSC_FALSE;
  } else {
    p[0].count  = 0;
    p[1].count  = sf->nleaves;
    p[1].idx    = NULL;
    p[1].opt    = NULL;
    p[1].buf    = link->leafbuf[link->leafmtype];
    p[1].atomic = PETSC_FALSE;
  }

  ierr = PetscSFPackGetPack(link,link->leafmtype,&Pack);CHKERRQ(ierr);
  for (i=0; i<2; i++) {if (p[i].count) {ierr = (*Pack)(p[i].count,p[i].idx,link,p[i].opt,leafdata,p[i].buf);CHKERRQ(ierr);}}
#if defined(PETSC_HAVE_CUDA)
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {
    cudaError_t err;
    if (!use_gpu_aware_mpi) {err = cudaMemcpyAsync(link->leafbuf[PETSC_MEMTYPE_HOST],link->leafbuf[PETSC_MEMTYPE_DEVICE],link->leafbuflen*link->unitbytes,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(err);}
    err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);
  }
#endif
  PetscFunctionReturn(0);
}

/* Utility routine to unpack data from the root buffer and Op it into selected entries of rootdata */
PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,void *rootdata,MPI_Op op,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
  PackInfo       p[2];

  PetscFunctionBegin;
  if (sparse) {
    p[0].count  = bas->ioffset[bas->ndiranks];    p[1].count  = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks];
    p[0].idx    = rootloc;                        p[1].idx    = rootloc+bas->ioffset[bas->ndiranks];
    p[0].opt    = bas->selfrootpackopt;           p[1].opt    = bas->rootpackopt;
    p[0].buf    = link->selfbuf[link->rootmtype]; p[1].buf    = link->rootbuf[link->rootmtype];
    p[0].atomic = bas->selfrootdups;              p[1].atomic = bas->remoterootdups;
  } else {
    p[0].count  = 0;
    p[1].count  = sf->nroots;
    p[1].idx    = NULL;
    p[1].opt    = NULL;
    p[1].buf    = link->rootbuf[link->rootmtype];
    p[1].atomic = PETSC_FALSE;
  }

#if defined(PETSC_HAVE_CUDA)
  if (!use_gpu_aware_mpi && link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {
    cudaError_t err = cudaMemcpyAsync(link->rootbuf[PETSC_MEMTYPE_DEVICE],link->rootbuf[PETSC_MEMTYPE_HOST],link->rootbuflen*link->unitbytes,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err);
  }
#endif

  for (i=0; i<2; i++) {
    if (p[i].count) {
      ierr = PetscSFPackGetUnpackAndOp(link,link->rootmtype,op,p[i].atomic,&UnpackAndOp);CHKERRQ(ierr);
      if (UnpackAndOp) {ierr = (*UnpackAndOp)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
      else {
        PetscInt    j;
        PetscMPIInt n;
        if (p[i].idx) {
          /* Note that if done on the GPU, this can be very slow due to the huge number of kernel launches. The op is likely user-defined. We must
             use link->unit (instead of link->basicunit) as the datatype and 1 (instead of link->bs) as the count in MPI_Reduce_local.
           */
          for (j=0; j<p[i].count; j++) {ierr = MPI_Reduce_local(p[i].buf+j*link->unitbytes,(char *)rootdata+p[i].idx[j]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
        } else {
          ierr = PetscMPIIntCast(p[i].count,&n);CHKERRQ(ierr);
          ierr = MPI_Reduce_local(p[i].buf,rootdata,n,link->unit,op);CHKERRQ(ierr);
        }
      }
#else
      else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
    }
  }
#if defined(PETSC_HAVE_CUDA)
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}
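
/* For reference, MPI_Reduce_local(inbuf,inoutbuf,count,datatype,op) computes
   inoutbuf = inbuf op inoutbuf elementwise, entirely within the calling process.
   A compiled-out standalone sketch with hypothetical values: */
#if 0
  {
    double in[2]  = {1.0,2.0};
    double out[2] = {10.0,20.0};
    ierr = MPI_Reduce_local(in,out,2,MPI_DOUBLE,MPI_SUM);CHKERRQ(ierr);
    /* out is now {11.0,22.0} */
  }
#endif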

/* Utility routine to unpack data from the leaf buffer and Op it into selected entries of leafdata */
PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpLeafData(PetscSF sf,PetscSFPack link,const PetscInt *leafloc,void *leafdata,MPI_Op op,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
  PackInfo       p[2];

  PetscFunctionBegin;
  if (sparse) {
    p[0].count  = sf->roffset[sf->ndranks];       p[1].count  = sf->roffset[sf->nranks]-sf->roffset[sf->ndranks];
    p[0].idx    = leafloc;                        p[1].idx    = leafloc+sf->roffset[sf->ndranks];
    p[0].opt    = sf->selfleafpackopt;            p[1].opt    = sf->leafpackopt;
    p[0].buf    = link->selfbuf[link->leafmtype]; p[1].buf    = link->leafbuf[link->leafmtype];
    p[0].atomic = sf->selfleafdups;               p[1].atomic = sf->remoteleafdups;
  } else {
    p[0].count  = 0;
    p[1].count  = sf->nleaves;
    p[1].idx    = NULL;
    p[1].opt    = NULL;
    p[1].buf    = link->leafbuf[link->leafmtype];
    p[1].atomic = PETSC_FALSE;
  }

#if defined(PETSC_HAVE_CUDA)
  if (!use_gpu_aware_mpi && link->leafmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {
    cudaError_t err = cudaMemcpyAsync(link->leafbuf[PETSC_MEMTYPE_DEVICE],link->leafbuf[PETSC_MEMTYPE_HOST],link->leafbuflen*link->unitbytes,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err);
  }
#endif

  for (i=0; i<2; i++) {
    if (p[i].count) {
      ierr = PetscSFPackGetUnpackAndOp(link,link->leafmtype,op,p[i].atomic,&UnpackAndOp);CHKERRQ(ierr);
      if (UnpackAndOp) {ierr = (*UnpackAndOp)(p[i].count,p[i].idx,link,p[i].opt,leafdata,p[i].buf);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
      else {
        PetscInt    j;
        PetscMPIInt n;
        if (p[i].idx) {
          for (j=0; j<p[i].count; j++) {ierr = MPI_Reduce_local(p[i].buf+j*link->unitbytes,(char *)leafdata+p[i].idx[j]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
        } else {
          ierr = PetscMPIIntCast(p[i].count,&n);CHKERRQ(ierr);
          ierr = MPI_Reduce_local(p[i].buf,leafdata,n,link->unit,op);CHKERRQ(ierr);
        }
      }
#else
      else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
    }
  }
#if defined(PETSC_HAVE_CUDA)
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

/* Utility routine to fetch and Op selected entries of rootdata */
PETSC_STATIC_INLINE PetscErrorCode PetscSFFetchAndOpRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,void *rootdata,MPI_Op op,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*);
  PackInfo       p[2];

  PetscFunctionBegin;
  if (sparse) {
    /* For SFBasic and SFNeighbor, which have a sparse root space and separate buffers for self and remote. */
    p[0].count  = bas->ioffset[bas->ndiranks];    p[1].count  = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks];
    p[0].idx    = rootloc;                        p[1].idx    = rootloc+bas->ioffset[bas->ndiranks];
    p[0].opt    = bas->selfrootpackopt;           p[1].opt    = bas->rootpackopt;
    p[0].buf    = link->selfbuf[link->rootmtype]; p[1].buf    = link->rootbuf[link->rootmtype];
    p[0].atomic = bas->selfrootdups;              p[1].atomic = bas->remoterootdups;
  } else {
    p[0].count  = 0;
    p[1].count  = sf->nroots;
    p[1].idx    = NULL;
    p[1].opt    = NULL;
    p[1].buf    = link->rootbuf[link->rootmtype];
    p[1].atomic = PETSC_FALSE;
  }

#if defined(PETSC_HAVE_CUDA)
  if (!use_gpu_aware_mpi && link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {
    cudaError_t err = cudaMemcpyAsync(link->rootbuf[PETSC_MEMTYPE_DEVICE],link->rootbuf[PETSC_MEMTYPE_HOST],link->rootbuflen*link->unitbytes,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err);
  }
#endif

  for (i=0; i<2; i++) {
    if (p[i].count) {
      ierr = PetscSFPackGetFetchAndOp(link,link->rootmtype,op,p[i].atomic,&FetchAndOp);CHKERRQ(ierr);
      ierr = (*FetchAndOp)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);
    }
  }
#if defined(PETSC_HAVE_CUDA)
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFPackSetupOptimizations_Basic(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  ierr = PetscSFPackOptCreate(sf->ndranks,               sf->roffset,               sf->rmine,    &sf->selfleafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptCreate(sf->nranks-sf->ndranks,    sf->roffset+sf->ndranks,   sf->rmine,    &sf->leafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptCreate(bas->ndiranks,             bas->ioffset,              bas->irootloc,&bas->selfrootpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptCreate(bas->niranks-bas->ndiranks,bas->ioffset+bas->ndiranks,bas->irootloc,&bas->rootpackopt);CHKERRQ(ierr);

#if defined(PETSC_HAVE_CUDA)
  /* Check for duplicates in rmine[]/irootloc[] so that CUDA packing kernels can use cheaper regular operations
     instead of atomics to unpack data on leaves/roots, when they know there is no data race.
   */
  ierr = PetscCheckDupsInt(sf->roffset[sf->ndranks],                              sf->rmine,                                &sf->selfleafdups);CHKERRQ(ierr);
  ierr = PetscCheckDupsInt(sf->roffset[sf->nranks]-sf->roffset[sf->ndranks],      sf->rmine+sf->roffset[sf->ndranks],       &sf->remoteleafdups);CHKERRQ(ierr);
  ierr = PetscCheckDupsInt(bas->ioffset[bas->ndiranks],                           bas->irootloc,                            &bas->selfrootdups);CHKERRQ(ierr);
  ierr = PetscCheckDupsInt(bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks],bas->irootloc+bas->ioffset[bas->ndiranks],&bas->remoterootdups);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
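
/* Why duplicates force atomics: if an index array contains the same location twice, a
   parallel unpack may have two threads updating that location at once. A compiled-out
   sketch with hypothetical indices: */
#if 0
  {
    PetscInt irootloc[3] = {5,7,5}; /* root 5 appears twice -> dups detected -> atomic = PETSC_TRUE */
    /* A parallel (e.g. CUDA) unpack of the form
         rootdata[irootloc[j]] += buf[j];   for j = 0,1,2 concurrently
       races on rootdata[5]; with dups detected, an atomic add is used instead. */
  }
#endif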

PETSC_STATIC_INLINE PetscErrorCode PetscSFPackDestroyOptimizations_Basic(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  ierr = PetscSFPackOptDestroy(&sf->leafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptDestroy(&sf->selfleafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptDestroy(&bas->rootpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptDestroy(&bas->selfrootpackopt);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  sf->selfleafdups    = PETSC_TRUE;
  sf->remoteleafdups  = PETSC_TRUE;
  bas->selfrootdups   = PETSC_TRUE; /* The default assumes there are dups so that atomics are used. */
  bas->remoterootdups = PETSC_TRUE;
#endif
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFSetUp_Basic(PetscSF);
PETSC_INTERN PetscErrorCode PetscSFView_Basic(PetscSF,PetscViewer);
PETSC_INTERN PetscErrorCode PetscSFReset_Basic(PetscSF);
PETSC_INTERN PetscErrorCode PetscSFDestroy_Basic(PetscSF);
PETSC_INTERN PetscErrorCode PetscSFBcastAndOpEnd_Basic  (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,      void*,      MPI_Op);
PETSC_INTERN PetscErrorCode PetscSFReduceEnd_Basic      (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,      void*,      MPI_Op);
PETSC_INTERN PetscErrorCode PetscSFFetchAndOpBegin_Basic(PetscSF,MPI_Datatype,PetscMemType,      void*,PetscMemType,const void*,void*,MPI_Op);
PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*);
PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedLeafSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*);
PETSC_INTERN PetscErrorCode PetscSFGetLeafRanks_Basic(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**);
PETSC_INTERN PetscErrorCode PetscSFPackGet_Basic_Common(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,PetscInt,PetscInt,PetscSFPack*);
#endif