#if !defined(__SFBASIC_H)
#define __SFBASIC_H

#include <../src/vec/is/sf/impls/basic/sfpack.h>

/* Fields common to SFBasic and SF types derived from it (e.g., SFNeighbor). They describe,
   from the root side, which incoming ranks reference my roots and at which local indices. */
#define SFBASICHEADER \
  PetscMPIInt niranks; /* Number of incoming ranks (ranks accessing my roots) */ \
  PetscMPIInt ndiranks; /* Number of incoming ranks (ranks accessing my roots) in distinguished set */ \
  PetscMPIInt *iranks; /* Array of ranks that reference my roots */ \
  PetscInt itotal; /* Total number of graph edges referencing my roots */ \
  PetscInt *ioffset; /* Array of length niranks+1 holding offset in irootloc[] for each rank */ \
  PetscInt *irootloc; /* Incoming roots referenced by ranks starting at ioffset[rank] */ \
  PetscInt *irootloc_d; /* irootloc in device memory when needed */ \
  PetscSFPackOpt rootpackopt; /* Optimization plans to (un)pack roots based on patterns in irootloc[]. NULL for no plans */ \
  PetscSFPackOpt selfrootpackopt; /* Optimization plans to (un)pack roots connected to local leaves */ \
  PetscSFPack avail; /* One or more entries per MPI Datatype, lazily constructed */ \
  PetscSFPack inuse; /* Buffers being used for transactions that have not yet completed */ \
  PetscBool selfrootdups; /* Indices of roots in irootloc[0,ioffset[ndiranks]) have dups, implying threads working ... */ \
  /* ... on these roots in parallel may have data race. */ \
  PetscBool remoterootdups /* Indices of roots in irootloc[ioffset[ndiranks],ioffset[niranks]) have dups */

typedef struct {
  SFBASICHEADER;
} PetscSF_Basic;

/* Query root-side connectivity info stored in the SFBasic header. Every output argument is
   optional and may be passed as NULL when not needed. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootInfo_Basic(PetscSF sf,PetscInt *nrootranks,PetscInt *ndrootranks,const PetscMPIInt **rootranks,const PetscInt **rootoffset,const PetscInt **rootloc)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (nrootranks)  *nrootranks  = bas->niranks;
  if (ndrootranks) *ndrootranks = bas->ndiranks;
  if (rootranks)   *rootranks   = bas->iranks;
  if (rootoffset)  *rootoffset  = bas->ioffset;
  if (rootloc)     *rootloc     = bas->irootloc;
  PetscFunctionReturn(0);
}

/* Query leaf-side connectivity info, which lives directly on the PetscSF (not the SFBasic data).
   Every output argument is optional and may be passed as NULL when not needed. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafInfo_Basic(PetscSF sf,PetscInt *nleafranks,PetscInt *ndleafranks,const PetscMPIInt **leafranks,const PetscInt **leafoffset,const PetscInt **leafloc,const PetscInt **leafrremote)
{
  PetscFunctionBegin;
  if (nleafranks)  *nleafranks  = sf->nranks;
  if (ndleafranks) *ndleafranks = sf->ndranks;
  if (leafranks)   *leafranks   = sf->ranks;
  if (leafoffset)  *leafoffset  = sf->roffset;
  if (leafloc)     *leafloc     = sf->rmine;
  if (leafrremote) *leafrremote = sf->rremote;
  PetscFunctionReturn(0);
}

/* Get root locations either on Host or Device.
   The device copy of irootloc[] is created lazily on first request and cached in bas->irootloc_d. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootIndicesWithMemType_Basic(PetscSF sf,PetscMemType mtype, const PetscInt **rootloc)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
  PetscFunctionBegin;
  if (rootloc) {
    if (mtype == PETSC_MEMTYPE_HOST) *rootloc = bas->irootloc;
#if defined(PETSC_HAVE_CUDA)
    else if (mtype == PETSC_MEMTYPE_DEVICE) {
      if (!bas->irootloc_d) { /* Lazily mirror host indices to device memory */
        cudaError_t err;
        size_t size = bas->ioffset[bas->niranks]*sizeof(PetscInt);
        err = cudaMalloc((void **)&bas->irootloc_d,size);CHKERRCUDA(err);
        err = cudaMemcpy(bas->irootloc_d,bas->irootloc,size,cudaMemcpyHostToDevice);CHKERRCUDA(err);
      }
      *rootloc = bas->irootloc_d;
    }
#endif
    else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType in getting rootloc %d",(int)mtype);
  }
  PetscFunctionReturn(0);
}

/* Get leaf locations either on Host (CPU) or Device (GPU).
   The device copy of sf->rmine[] is created lazily on first request and cached in sf->rmine_d. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafIndicesWithMemType_Basic(PetscSF sf,PetscMemType mtype, const PetscInt **leafloc)
{
  PetscFunctionBegin;
  if (leafloc) {
    if (mtype == PETSC_MEMTYPE_HOST) *leafloc = sf->rmine;
#if defined(PETSC_HAVE_CUDA)
    else if (mtype == PETSC_MEMTYPE_DEVICE) {
      if (!sf->rmine_d) { /* Lazily mirror host indices to device memory */
        cudaError_t err;
        size_t size = sf->roffset[sf->nranks]*sizeof(PetscInt);
        err = cudaMalloc((void **)&sf->rmine_d,size);CHKERRCUDA(err);
        err = cudaMemcpy(sf->rmine_d,sf->rmine,size,cudaMemcpyHostToDevice);CHKERRCUDA(err);
      }
      *leafloc = sf->rmine_d;
    }
#endif
    else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType in getting leafloc %d",(int)mtype);
  }
  PetscFunctionReturn(0);
}

/* A convenience struct to provide info to the following (un)packing routines so that we can treat packings to self and to remote in one loop. */
typedef struct _n_PackInfo {
  PetscInt       count;  /* Number of entries to pack, unpack etc. */
  const PetscInt *idx;   /* Indices of the entries. NULL for contiguous indices [0,count) */
  PetscSFPackOpt opt;    /* Pack optimizations */
  char           *buf;   /* The contiguous buffer where we pack to or unpack from */
  PetscBool      atomic; /* Whether the unpack routine needs to use atomics */
} PackInfo;

/* Utility routine to pack selected entries of rootdata into root buffer.
  Input Arguments:
  + sf       - The SF this packing works on.
  . link     - The PetscSFPack, which gives the memtype of the roots and also provides root buffer.
  . rootloc  - Indices of the roots, only meaningful if the root space is sparse
  . rootdata - Where to read the roots.
  - sparse   - Is the root space sparse (for SFBasic, SFNeighbor) or dense (for SFAllgatherv etc)
 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,const void *rootdata,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;
  PetscErrorCode (*Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*);
  PackInfo       p[2]; /* p[0]: roots connected to local leaves (self); p[1]: roots for remote ranks */

  PetscFunctionBegin;
  if (sparse) {
    /* For SFBasic and SFNeighbor, whose root space is sparse and have separate buffers for self and remote. */
    p[0].count  = link->selfbuflen;                 p[1].count  = link->rootbuflen;
    p[0].idx    = rootloc;                          p[1].idx    = rootloc+bas->ioffset[bas->ndiranks];
    p[0].opt    = bas->selfrootpackopt;             p[1].opt    = bas->rootpackopt;
    p[0].buf    = link->selfbuf[link->rootmtype];   p[1].buf    = link->rootbuf[link->rootmtype];
    p[0].atomic = PETSC_FALSE;                      p[1].atomic = PETSC_FALSE;
  } else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"SFAllgather etc should directly use rootdata instead of packing it");

  ierr = PetscSFPackGetPack(link,link->rootmtype,&Pack);CHKERRQ(ierr);
  /* Only do packing when count != 0 so that we can avoid invoking empty CUDA kernels */
  for (i=0; i<2; i++) {if (p[i].count) {ierr = (*Pack)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);}}

#if defined(PETSC_HAVE_CUDA)
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) { /* We only care p[1], which is for remote and involves MPI */
    /* Without use_gpu_aware_mpi, we need to copy rootbuf on device to rootbuf on host. The cudaStreamSynchronize()
       is to make sure rootbuf is ready before MPI communication starts.
     */
    cudaError_t err;
    if (!use_gpu_aware_mpi) {err = cudaMemcpyAsync(link->rootbuf[PETSC_MEMTYPE_HOST],link->rootbuf[PETSC_MEMTYPE_DEVICE],link->rootbuflen*link->unitbytes,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(err);}
    err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);
  }
#endif
  PetscFunctionReturn(0);
}

/* Utility routine to pack selected entries of leafdata into leaf buffer.
   Mirrors PetscSFPackRootData() but reads leaf-side info from the PetscSF itself. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackLeafData(PetscSF sf,PetscSFPack link,const PetscInt *leafloc,const void *leafdata,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscErrorCode (*Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*);
  PackInfo       p[2]; /* p[0]: leaves connected to local roots (self); p[1]: leaves for remote ranks */

  PetscFunctionBegin;
  if (sparse) {
    p[0].count  = link->selfbuflen;                 p[1].count  = link->leafbuflen;
    p[0].idx    = leafloc;                          p[1].idx    = leafloc+sf->roffset[sf->ndranks];
    p[0].opt    = sf->selfleafpackopt;              p[1].opt    = sf->leafpackopt;
    p[0].buf    = link->selfbuf[link->leafmtype];   p[1].buf    = link->leafbuf[link->leafmtype];
    p[0].atomic = PETSC_FALSE;                      p[1].atomic = PETSC_FALSE;
  } else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_PLIB,"SFAllgather etc should directly use leafdata instead of packing it");

  ierr = PetscSFPackGetPack(link,link->leafmtype,&Pack);CHKERRQ(ierr);
  /* Only do packing when count != 0 so that we can avoid invoking empty CUDA kernels */
  for (i=0; i<2; i++) {if (p[i].count) {ierr = (*Pack)(p[i].count,p[i].idx,link,p[i].opt,leafdata,p[i].buf);CHKERRQ(ierr);}}
#if defined(PETSC_HAVE_CUDA)
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && p[1].count) { /* Stage the remote buffer on host when MPI is not GPU-aware */
    cudaError_t err;
    if (!use_gpu_aware_mpi) {err = cudaMemcpyAsync(link->leafbuf[PETSC_MEMTYPE_HOST],link->leafbuf[PETSC_MEMTYPE_DEVICE],link->leafbuflen*link->unitbytes,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(err);}
    err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);
  }
#endif
  PetscFunctionReturn(0);
}

/* Utility routine to unpack data from root buffer and Op it into selected entries of rootdata */
PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,void *rootdata,MPI_Op op,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
  PackInfo       p[2]; /* p[0]: self; p[1]: remote */

  PetscFunctionBegin;
#if defined(PETSC_HAVE_CUDA)
  if (!use_gpu_aware_mpi && link->rootmtype == PETSC_MEMTYPE_DEVICE) { /* Copy roots from host to device when needed */
    cudaError_t err;
    if (!link->rootbuf[PETSC_MEMTYPE_DEVICE]) {ierr = PetscMallocWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuflen*link->unitbytes,(void**)&link->rootbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);}
    err = cudaMemcpyAsync(link->rootbuf[PETSC_MEMTYPE_DEVICE],link->rootbuf[PETSC_MEMTYPE_HOST],link->rootbuflen*link->unitbytes,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err);
  }
#endif

  if (sparse) {
    p[0].count  = link->selfbuflen;                 p[1].count  = link->rootbuflen;
    p[0].idx    = rootloc;                          p[1].idx    = rootloc+bas->ioffset[bas->ndiranks];
    p[0].opt    = bas->selfrootpackopt;             p[1].opt    = bas->rootpackopt;
    p[0].buf    = link->selfbuf[link->rootmtype];   p[1].buf    = link->rootbuf[link->rootmtype];
    p[0].atomic = bas->selfrootdups;                p[1].atomic = bas->remoterootdups;
  } else {
    /* Dense root space: a single contiguous unpack of all roots; p[0] is unused */
    p[0].count  = 0;
    p[1].count  = sf->nroots;
    p[1].idx    = NULL;
    p[1].opt    = NULL;
    p[1].buf    = link->rootbuf[link->rootmtype]; /* Might have just been allocated by PetscMallocWithMemType() above */
    p[1].atomic = PETSC_FALSE;
  }

  for (i=0; i<2; i++) {
    if (p[i].count) {
      ierr = PetscSFPackGetUnpackAndOp(link,link->rootmtype,op,p[i].atomic,&UnpackAndOp);CHKERRQ(ierr);
      if (UnpackAndOp) {ierr = (*UnpackAndOp)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
      else { /* No built-in unpack routine for this op; fall back to MPI_Reduce_local */
        PetscInt    j;
        PetscMPIInt n;
        if (p[i].idx) {
          /* Note if done on GPU, this can be very slow due to the huge number of kernel calls. The op is likely user-defined. We must
             use link->unit (instead of link->basicunit) as the datatype and 1 (instead of link->bs) as the count in MPI_Reduce_local.
           */
          for (j=0; j<p[i].count; j++) {ierr = MPI_Reduce_local(p[i].buf+j*link->unitbytes,(char *)rootdata+p[i].idx[j]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
        } else {
          ierr = PetscMPIIntCast(p[i].count,&n);CHKERRQ(ierr);
          ierr = MPI_Reduce_local(p[i].buf,rootdata,n,link->unit,op);CHKERRQ(ierr);
        }
      }
#else
      else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
    }
  }
#if defined(PETSC_HAVE_CUDA)
  /* Make sure rootdata is ready to use by SF client */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

/* Utility routine to unpack data from leaf buffer and Op it into selected entries of leafdata.
   Mirrors PetscSFUnpackAndOpRootData() but works on the leaf side. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpLeafData(PetscSF sf,PetscSFPack link,const PetscInt *leafloc,void *leafdata,MPI_Op op,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
  PackInfo       p[2]; /* p[0]: self; p[1]: remote */

  PetscFunctionBegin;
#if defined(PETSC_HAVE_CUDA)
  if (!use_gpu_aware_mpi && link->leafmtype == PETSC_MEMTYPE_DEVICE) { /* Copy leaves from host to device when needed */
    cudaError_t err;
    if (!link->leafbuf[PETSC_MEMTYPE_DEVICE]) {ierr = PetscMallocWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuflen*link->unitbytes,(void**)&link->leafbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);}
    err = cudaMemcpyAsync(link->leafbuf[PETSC_MEMTYPE_DEVICE],link->leafbuf[PETSC_MEMTYPE_HOST],link->leafbuflen*link->unitbytes,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err);
  }
#endif

  if (sparse) {
    p[0].count  = link->selfbuflen;                 p[1].count  = link->leafbuflen;
    p[0].idx    = leafloc;                          p[1].idx    = leafloc+sf->roffset[sf->ndranks];
    p[0].opt    = sf->selfleafpackopt;              p[1].opt    = sf->leafpackopt;
    p[0].buf    = link->selfbuf[link->leafmtype];   p[1].buf    = link->leafbuf[link->leafmtype];
    p[0].atomic = sf->selfleafdups;                 p[1].atomic = sf->remoteleafdups;
  } else {
    /* Dense leaf space: a single contiguous unpack of all leaves; p[0] is unused */
    p[0].count  = 0;
    p[1].count  = sf->nleaves;
    p[1].idx    = NULL;
    p[1].opt    = NULL;
    p[1].buf    = link->leafbuf[link->leafmtype];
    p[1].atomic = PETSC_FALSE;
  }

  for (i=0; i<2; i++) {
    if (p[i].count) {
      ierr = PetscSFPackGetUnpackAndOp(link,link->leafmtype,op,p[i].atomic,&UnpackAndOp);CHKERRQ(ierr);
      if (UnpackAndOp) {ierr = (*UnpackAndOp)(p[i].count,p[i].idx,link,p[i].opt,leafdata,p[i].buf);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
      else { /* No built-in unpack routine for this op; fall back to MPI_Reduce_local */
        PetscInt    j;
        PetscMPIInt n;
        if (p[i].idx) {
          for (j=0; j<p[i].count; j++) {ierr = MPI_Reduce_local(p[i].buf+j*link->unitbytes,(char *)leafdata+p[i].idx[j]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
        } else {
          ierr = PetscMPIIntCast(p[i].count,&n);CHKERRQ(ierr);
          ierr = MPI_Reduce_local(p[i].buf,leafdata,n,link->unit,op);CHKERRQ(ierr);
        }
      }
#else
      else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
    }
  }
#if defined(PETSC_HAVE_CUDA)
  /* Make sure leafdata is ready to use by SF client */
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

/* Utility routine to fetch and Op selected entries of rootdata */
PETSC_STATIC_INLINE PetscErrorCode PetscSFFetchAndOpRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,void *rootdata,MPI_Op op,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*);
  PackInfo       p[2]; /* p[0]: self; p[1]: remote */

  PetscFunctionBegin;
#if defined(PETSC_HAVE_CUDA)
  if (!use_gpu_aware_mpi && link->rootmtype == PETSC_MEMTYPE_DEVICE) { /* Copy roots from host to device when needed */
    cudaError_t err;
    if (!link->rootbuf[PETSC_MEMTYPE_DEVICE]) {ierr = PetscMallocWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuflen*link->unitbytes,(void**)&link->rootbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);}
    err = cudaMemcpyAsync(link->rootbuf[PETSC_MEMTYPE_DEVICE],link->rootbuf[PETSC_MEMTYPE_HOST],link->rootbuflen*link->unitbytes,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err);
  }
#endif

  if (sparse) {
    /* For SFBasic and SFNeighbor, whose root space is sparse and have separate buffers for self and remote. */
    p[0].count  = link->selfbuflen;                 p[1].count  = link->rootbuflen;
    p[0].idx    = rootloc;                          p[1].idx    = rootloc+bas->ioffset[bas->ndiranks];
    p[0].opt    = bas->selfrootpackopt;             p[1].opt    = bas->rootpackopt;
    p[0].buf    = link->selfbuf[link->rootmtype];   p[1].buf    = link->rootbuf[link->rootmtype];
    p[0].atomic = bas->selfrootdups;                p[1].atomic = bas->remoterootdups;
  } else {
    /* Dense root space: a single contiguous fetch-and-op of all roots; p[0] is unused */
    p[0].count  = 0;
    p[1].count  = sf->nroots;
    p[1].idx    = NULL;
    p[1].opt    = NULL;
    p[1].buf    = link->rootbuf[link->rootmtype];
    p[1].atomic = PETSC_FALSE;
  }

  for (i=0; i<2; i++) {
    if (p[i].count) {
      ierr = PetscSFPackGetFetchAndOp(link,link->rootmtype,op,p[i].atomic,&FetchAndOp);CHKERRQ(ierr);
      ierr = (*FetchAndOp)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);
    }
  }
#if defined(PETSC_HAVE_CUDA)
  /* Make sure rootdata is ready to use by SF client */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

/* Build (un)packing optimization plans for both the self and remote portions of the leaf
   and root index lists, and (with CUDA) record whether each portion contains duplicate indices. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackSetupOptimizations_Basic(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  ierr = PetscSFPackOptCreate(sf->ndranks,            sf->roffset,             sf->rmine,    &sf->selfleafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptCreate(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine,    &sf->leafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptCreate(bas->ndiranks,          bas->ioffset,            bas->irootloc,&bas->selfrootpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptCreate(bas->niranks-bas->ndiranks,bas->ioffset+bas->ndiranks,bas->irootloc,&bas->rootpackopt);CHKERRQ(ierr);

#if defined(PETSC_HAVE_CUDA)
  /* Check duplicates in irootloc[] so CUDA packing kernels can use cheaper regular operations
     instead of atomics to unpack data on leaves/roots, when they know there is no data race.
   */
  ierr = PetscCheckDupsInt(sf->roffset[sf->ndranks],                       sf->rmine,                              &sf->selfleafdups);CHKERRQ(ierr);
  ierr = PetscCheckDupsInt(sf->roffset[sf->nranks]-sf->roffset[sf->ndranks],sf->rmine+sf->roffset[sf->ndranks],    &sf->remoteleafdups);CHKERRQ(ierr);
  ierr = PetscCheckDupsInt(bas->ioffset[bas->ndiranks],                    bas->irootloc,                          &bas->selfrootdups);CHKERRQ(ierr);
  ierr = PetscCheckDupsInt(bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks],bas->irootloc+bas->ioffset[bas->ndiranks],&bas->remoterootdups);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/* Destroy the (un)packing optimization plans and reset the duplicate flags to their
   conservative defaults. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackDestroyOptimizations_Basic(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  ierr = PetscSFPackOptDestroy(&sf->leafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptDestroy(&sf->selfleafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptDestroy(&bas->rootpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptDestroy(&bas->selfrootpackopt);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  sf->selfleafdups    = PETSC_TRUE;
  sf->remoteleafdups  = PETSC_TRUE;
  bas->selfrootdups   = PETSC_TRUE; /* The default is assuming there are dups so that atomics are used. */
  bas->remoterootdups = PETSC_TRUE;
#endif
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFSetUp_Basic(PetscSF);
PETSC_INTERN PetscErrorCode PetscSFView_Basic(PetscSF,PetscViewer);
PETSC_INTERN PetscErrorCode PetscSFReset_Basic(PetscSF);
PETSC_INTERN PetscErrorCode PetscSFDestroy_Basic(PetscSF);
PETSC_INTERN PetscErrorCode PetscSFBcastAndOpEnd_Basic (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType, void*, MPI_Op);
PETSC_INTERN PetscErrorCode PetscSFReduceEnd_Basic     (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType, void*, MPI_Op);
PETSC_INTERN PetscErrorCode PetscSFFetchAndOpBegin_Basic(PetscSF,MPI_Datatype,PetscMemType, void*,PetscMemType,const void*,void*,MPI_Op);
PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*);
PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedLeafSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*);
PETSC_INTERN PetscErrorCode PetscSFGetLeafRanks_Basic(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**);
PETSC_INTERN PetscErrorCode PetscSFPackGet_Basic_Common(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,PetscInt,PetscInt,PetscSFPack*);
#endif