/* Private header for the "basic" PetscSF implementation: rank/offset layout of
   incoming roots, and inline (un)packing helpers shared by SFBasic-family types. */
/* NOTE(review): __SFBASIC_H starts with a double underscore, which is reserved
   for the implementation — consider renaming the guard (e.g. PETSC_SFBASIC_H). */
#if !defined(__SFBASIC_H)
#define __SFBASIC_H

#include <../src/vec/is/sf/impls/basic/sfpack.h>

/* Direction of a communication transaction: leaves reducing into roots, or roots broadcasting to leaves */
typedef enum {PETSCSF_LEAF2ROOT_REDUCE=0, PETSCSF_ROOT2LEAF_BCAST=1} PetscSFDirection;

/* Fields shared by SFBasic and derived implementations; embedded at the top of their data structs */
#define SFBASICHEADER \
  PetscMPIInt niranks; /* Number of incoming ranks (ranks accessing my roots) */ \
  PetscMPIInt ndiranks; /* Number of incoming ranks (ranks accessing my roots) in distinguished set */ \
  PetscMPIInt *iranks; /* Array of ranks that reference my roots */ \
  PetscInt itotal; /* Total number of graph edges referencing my roots */ \
  PetscInt *ioffset; /* Array of length niranks+1 holding offset in irootloc[] for each rank */ \
  PetscInt *irootloc; /* Incoming roots referenced by ranks starting at ioffset[rank] */ \
  PetscInt *irootloc_d; /* irootloc in device memory when needed */ \
  PetscSFPackOpt rootpackopt; /* Optimization plans to (un)pack roots based on patterns in irootloc[]. NULL for no plans */ \
  PetscSFPackOpt selfrootpackopt; /* Optimization plans to (un)pack roots connected to local leaves */ \
  PetscSFPack avail; /* One or more entries per MPI Datatype, lazily constructed */ \
  PetscSFPack inuse; /* Buffers being used for transactions that have not yet completed */ \
  PetscBool selfrootdups; /* Indices of roots in irootloc[0,ioffset[ndiranks]) have dups, implying threads working ... */ \
  /* ... on these roots in parallel may have data race. */ \
  PetscBool remoterootdups /* Indices of roots in irootloc[ioffset[ndiranks],ioffset[niranks]) have dups */

typedef struct {
  SFBASICHEADER;
} PetscSF_Basic;

/* Query the root-side layout: counts of (distinguished) incoming ranks, the ranks themselves,
   per-rank offsets into irootloc[], and the root indices. Any output pointer may be NULL to skip it. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootInfo_Basic(PetscSF sf,PetscInt *nrootranks,PetscInt *ndrootranks,const PetscMPIInt **rootranks,const PetscInt **rootoffset,const PetscInt **rootloc)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (nrootranks) *nrootranks = bas->niranks;
  if (ndrootranks) *ndrootranks = bas->ndiranks;
  if (rootranks) *rootranks = bas->iranks;
  if (rootoffset) *rootoffset = bas->ioffset;
  if (rootloc) *rootloc = bas->irootloc;
  PetscFunctionReturn(0);
}

/* Query the leaf-side layout stored on the SF itself (ranks, offsets, local leaf indices,
   and remote root indices). Any output pointer may be NULL to skip it. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafInfo_Basic(PetscSF sf,PetscInt *nleafranks,PetscInt *ndleafranks,const PetscMPIInt **leafranks,const PetscInt **leafoffset,const PetscInt **leafloc,const PetscInt **leafrremote)
{
  PetscFunctionBegin;
  if (nleafranks) *nleafranks = sf->nranks;
  if (ndleafranks) *ndleafranks = sf->ndranks;
  if (leafranks) *leafranks = sf->ranks;
  if (leafoffset) *leafoffset = sf->roffset;
  if (leafloc) *leafloc = sf->rmine;
  if (leafrremote) *leafrremote = sf->rremote;
  PetscFunctionReturn(0);
}

/* Get root locations either on Host or Device.
   On first device request, lazily allocates irootloc_d and copies irootloc to it (host-to-device). */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootIndicesWithMemType_Basic(PetscSF sf,PetscMemType mtype, const PetscInt **rootloc)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
  PetscFunctionBegin;
  if (rootloc) {
    if (mtype == PETSC_MEMTYPE_HOST) *rootloc = bas->irootloc;
#if defined(PETSC_HAVE_CUDA)
    else if (mtype == PETSC_MEMTYPE_DEVICE) {
      if (!bas->irootloc_d) { /* lazily build the device mirror of irootloc[] */
        cudaError_t err;
        size_t      size = bas->ioffset[bas->niranks]*sizeof(PetscInt);
        err = cudaMalloc((void **)&bas->irootloc_d,size);CHKERRCUDA(err);
        err = cudaMemcpy(bas->irootloc_d,bas->irootloc,size,cudaMemcpyHostToDevice);CHKERRCUDA(err);
      }
      *rootloc = bas->irootloc_d;
    }
#endif
    else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType in getting rootloc %d",(int)mtype);
  }
  PetscFunctionReturn(0);
}

/* Get leaf locations either on Host (CPU) or Device (GPU).
   On first device request, lazily allocates rmine_d and copies rmine to it (host-to-device). */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafIndicesWithMemType_Basic(PetscSF sf,PetscMemType mtype, const PetscInt **leafloc)
{
  PetscFunctionBegin;
  if (leafloc) {
    if (mtype == PETSC_MEMTYPE_HOST) *leafloc = sf->rmine;
#if defined(PETSC_HAVE_CUDA)
    else if (mtype == PETSC_MEMTYPE_DEVICE) {
      if (!sf->rmine_d) { /* lazily build the device mirror of rmine[] */
        cudaError_t err;
        size_t      size = sf->roffset[sf->nranks]*sizeof(PetscInt);
        err = cudaMalloc((void **)&sf->rmine_d,size);CHKERRCUDA(err);
        err = cudaMemcpy(sf->rmine_d,sf->rmine,size,cudaMemcpyHostToDevice);CHKERRCUDA(err);
      }
      *leafloc = sf->rmine_d;
    }
#endif
    else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType in getting leafloc %d",(int)mtype);
  }
  PetscFunctionReturn(0);
}

/* A convenience struct to provide info to the following (un)packing routines so that we can treat packings to self and to remote in one loop. */
typedef struct _n_PackInfo {
  PetscInt       count;  /* Number of entries to pack, unpack etc. */
  const PetscInt *idx;   /* Indices of the entries. NULL for contiguous indices [0,count) */
  PetscSFPackOpt opt;    /* Pack optimizations */
  char           *buf;   /* The contiguous buffer where we pack to or unpack from */
  PetscBool      atomic; /* Whether the unpack routine needs to use atomics */
} PackInfo;

/* Utility routine to pack selected entries of rootdata into root buffer.
   Input Arguments:
+  sf - The SF this packing works on.
.  link - The PetscSFPack, which gives the memtype of the roots and also provides root buffer.
.  rootloc - Indices of the roots, only meaningful if the root space is sparse
.  rootdata - Where to read the roots.
-  sparse - Is the root space sparse (for SFBasic, SFNeighbor) or dense (for SFAllgatherv etc)
 */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,const void *rootdata,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;
  PetscErrorCode (*Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*);
  PackInfo       p[2]; /* p[0] is packing to self (local leaves), p[1] is packing to remote */

  PetscFunctionBegin;
  if (sparse) {
    /* For SFBasic and SFNeighbor, whose root space is sparse and have separate buffers for self and remote. */
    p[0].count = bas->ioffset[bas->ndiranks]; p[1].count = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks];
    p[0].idx = rootloc; p[1].idx = rootloc+bas->ioffset[bas->ndiranks];
    p[0].opt = bas->selfrootpackopt; p[1].opt = bas->rootpackopt;
    p[0].buf = link->selfbuf[link->rootmtype]; p[1].buf = link->rootbuf[link->rootmtype];
    p[0].atomic = PETSC_FALSE; p[1].atomic = PETSC_FALSE; /* packing reads rootdata; no write race */
  } else {
    /* For SFAllgatherv etc collectives, which have a dense root space and do not differentiate self/remote communication. */
    p[0].count = 0; /* Setting to 0 to skip self, in other words, do not single out self communication */
    p[1].count = sf->nroots;
    p[1].idx = NULL;
    p[1].opt = NULL;
    p[1].buf = link->rootbuf[link->rootmtype];
    p[1].atomic = PETSC_FALSE;
  }

  ierr = PetscSFPackGetPack(link,link->rootmtype,&Pack);CHKERRQ(ierr);
  /* Only do packing when count != 0 so that we can avoid invoking empty CUDA kernels */
  for (i=0; i<2; i++) {if (p[i].count) {ierr = (*Pack)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);}}

#if defined(PETSC_HAVE_CUDA)
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) { /* We only care p[1], which is for remote and involves MPI */
    /* Without use_gpu_aware_mpi, we need to copy rootbuf on device to rootbuf on host. The cudaStreamSynchronize()
       is to make sure rootbuf is ready before MPI communication starts.
     */
    cudaError_t err;
    if (!use_gpu_aware_mpi) {err = cudaMemcpyAsync(link->rootbuf[PETSC_MEMTYPE_HOST],link->rootbuf[PETSC_MEMTYPE_DEVICE],link->rootbuflen*link->unitbytes,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(err);}
    err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);
  }
#endif
  PetscFunctionReturn(0);
}

/* Utility routine to pack selected entries of leafdata into leaf buffer */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackLeafData(PetscSF sf,PetscSFPack link,const PetscInt *leafloc,const void *leafdata,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscErrorCode (*Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*);
  PackInfo       p[2]; /* p[0] is packing to self, p[1] is packing to remote */

  PetscFunctionBegin;
  if (sparse) {
    p[0].count = sf->roffset[sf->ndranks]; p[1].count = sf->roffset[sf->nranks]-sf->roffset[sf->ndranks];
    p[0].idx = leafloc; p[1].idx = leafloc+sf->roffset[sf->ndranks];
    p[0].opt = sf->selfleafpackopt; p[1].opt = sf->leafpackopt;
    p[0].buf = link->selfbuf[link->leafmtype]; p[1].buf = link->leafbuf[link->leafmtype];
    p[0].atomic = PETSC_FALSE; p[1].atomic = PETSC_FALSE;
  } else {
    p[0].count = 0; /* skip the self entry; dense leaf space does not single out self communication */
    p[1].count = sf->nleaves;
    p[1].idx = NULL;
    p[1].opt = NULL;
    p[1].buf = link->leafbuf[link->leafmtype];
    p[1].atomic = PETSC_FALSE;
  }

  ierr = PetscSFPackGetPack(link,link->leafmtype,&Pack);CHKERRQ(ierr);
  /* Only invoke the pack routine when count != 0 to avoid empty CUDA kernels */
  for (i=0; i<2; i++) {if (p[i].count) {ierr = (*Pack)(p[i].count,p[i].idx,link,p[i].opt,leafdata,p[i].buf);CHKERRQ(ierr);}}
#if defined(PETSC_HAVE_CUDA)
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {
    /* Stage device leafbuf to host when MPI is not GPU-aware, then sync so the buffer is ready for MPI */
    cudaError_t err;
    if (!use_gpu_aware_mpi) {err = cudaMemcpyAsync(link->leafbuf[PETSC_MEMTYPE_HOST],link->leafbuf[PETSC_MEMTYPE_DEVICE],link->leafbuflen*link->unitbytes,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(err);}
    err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);
  }
#endif
  PetscFunctionReturn(0);
}

/* Utility routine to unpack data from root buffer and Op it into selected entries of rootdata */
PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,void *rootdata,MPI_Op op,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
  PackInfo       p[2]; /* p[0] is unpacking from self, p[1] is unpacking from remote */

  PetscFunctionBegin;
  if (sparse) {
    p[0].count = bas->ioffset[bas->ndiranks]; p[1].count = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks];
    p[0].idx = rootloc; p[1].idx = rootloc+bas->ioffset[bas->ndiranks];
    p[0].opt = bas->selfrootpackopt; p[1].opt = bas->rootpackopt;
    p[0].buf = link->selfbuf[link->rootmtype]; p[1].buf = link->rootbuf[link->rootmtype];
    p[0].atomic = bas->selfrootdups; p[1].atomic = bas->remoterootdups; /* duplicate indices imply a write race, hence atomics */
  } else {
    p[0].count = 0;
    p[1].count = sf->nroots;
    p[1].idx = NULL;
    p[1].opt = NULL;
    p[1].buf = link->rootbuf[link->rootmtype];
    p[1].atomic = PETSC_FALSE;
  }

#if defined(PETSC_HAVE_CUDA)
  /* Without GPU-aware MPI, data arrived in the host rootbuf; stage it back to the device before unpacking there */
  if (!use_gpu_aware_mpi && link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {
    cudaError_t err = cudaMemcpyAsync(link->rootbuf[PETSC_MEMTYPE_DEVICE],link->rootbuf[PETSC_MEMTYPE_HOST],link->rootbuflen*link->unitbytes,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err);
  }
#endif

  for (i=0; i<2; i++) {
    if (p[i].count) {
      ierr = PetscSFPackGetUnpackAndOp(link,link->rootmtype,op,p[i].atomic,&UnpackAndOp);CHKERRQ(ierr);
      if (UnpackAndOp) {ierr = (*UnpackAndOp)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
      else { /* no specialized kernel for this op (likely user-defined); fall back to MPI_Reduce_local */
        PetscInt    j;
        PetscMPIInt n;
        if (p[i].idx) {
          /* Note if done on GPU, this can be very slow due to the huge number of kernel calls. The op is likely user-defined. We must
             use link->unit (instead of link->basicunit) as the datatype and 1 (instead of link->bs) as the count in MPI_Reduce_local.
           */
          for (j=0; j<p[i].count; j++) {ierr = MPI_Reduce_local(p[i].buf+j*link->unitbytes,(char *)rootdata+p[i].idx[j]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
        } else {
          ierr = PetscMPIIntCast(p[i].count,&n);CHKERRQ(ierr);
          ierr = MPI_Reduce_local(p[i].buf,rootdata,n,link->unit,op);CHKERRQ(ierr);
        }
      }
#else
      else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
    }
  }
#if defined(PETSC_HAVE_CUDA)
  /* Make sure the async copy/unpack on the stream has completed before returning */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

/* Utility routine to unpack data from leaf buffer and Op it into selected entries of leafdata */
PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpLeafData(PetscSF sf,PetscSFPack link,const PetscInt *leafloc,void *leafdata,MPI_Op op,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
  PackInfo       p[2]; /* p[0] is unpacking from self, p[1] is unpacking from remote */

  PetscFunctionBegin;
  if (sparse) {
    p[0].count = sf->roffset[sf->ndranks]; p[1].count = sf->roffset[sf->nranks]-sf->roffset[sf->ndranks];
    p[0].idx = leafloc; p[1].idx = leafloc+sf->roffset[sf->ndranks];
    p[0].opt = sf->selfleafpackopt; p[1].opt = sf->leafpackopt;
    p[0].buf = link->selfbuf[link->leafmtype]; p[1].buf = link->leafbuf[link->leafmtype];
    p[0].atomic = sf->selfleafdups; p[1].atomic = sf->remoteleafdups; /* duplicate indices imply a write race, hence atomics */
  } else {
    p[0].count = 0;
    p[1].count = sf->nleaves;
    p[1].idx = NULL;
    p[1].opt = NULL;
    p[1].buf = link->leafbuf[link->leafmtype];
    p[1].atomic = PETSC_FALSE;
  }

#if defined(PETSC_HAVE_CUDA)
  /* Without GPU-aware MPI, data arrived in the host leafbuf; stage it back to the device before unpacking there */
  if (!use_gpu_aware_mpi && link->leafmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {
    cudaError_t err = cudaMemcpyAsync(link->leafbuf[PETSC_MEMTYPE_DEVICE],link->leafbuf[PETSC_MEMTYPE_HOST],link->leafbuflen*link->unitbytes,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err);
  }
#endif

  for (i=0; i<2; i++) {
    if (p[i].count) {
      ierr = PetscSFPackGetUnpackAndOp(link,link->leafmtype,op,p[i].atomic,&UnpackAndOp);CHKERRQ(ierr);
      if (UnpackAndOp) {ierr = (*UnpackAndOp)(p[i].count,p[i].idx,link,p[i].opt,leafdata,p[i].buf);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
      else { /* no specialized kernel for this op; fall back to MPI_Reduce_local, one call per entry when indexed */
        PetscInt    j;
        PetscMPIInt n;
        if (p[i].idx) {
          for (j=0; j<p[i].count; j++) {ierr = MPI_Reduce_local(p[i].buf+j*link->unitbytes,(char *)leafdata+p[i].idx[j]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
        } else {
          ierr = PetscMPIIntCast(p[i].count,&n);CHKERRQ(ierr);
          ierr = MPI_Reduce_local(p[i].buf,leafdata,n,link->unit,op);CHKERRQ(ierr);
        }
      }
#else
      else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
    }
  }
#if defined(PETSC_HAVE_CUDA)
  /* Make sure the async copy/unpack on the stream has completed before returning */
  if (link->leafmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

/* Utility routine to fetch and Op selected entries of rootdata */
PETSC_STATIC_INLINE PetscErrorCode PetscSFFetchAndOpRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,void *rootdata,MPI_Op op,PetscBool sparse)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*);
  PackInfo       p[2]; /* p[0] is for self, p[1] is for remote */

  PetscFunctionBegin;
  if (sparse) {
    /* For SFBasic and SFNeighbor, whose root space is sparse and have separate buffers for self and remote. */
    p[0].count = bas->ioffset[bas->ndiranks]; p[1].count = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks];
    p[0].idx = rootloc; p[1].idx = rootloc+bas->ioffset[bas->ndiranks];
    p[0].opt = bas->selfrootpackopt; p[1].opt = bas->rootpackopt;
    p[0].buf = link->selfbuf[link->rootmtype]; p[1].buf = link->rootbuf[link->rootmtype];
    p[0].atomic = bas->selfrootdups; p[1].atomic = bas->remoterootdups;
  } else {
    p[0].count = 0;
    p[1].count = sf->nroots;
    p[1].idx = NULL;
    p[1].opt = NULL;
    p[1].buf = link->rootbuf[link->rootmtype];
    p[1].atomic = PETSC_FALSE;
  }

#if defined(PETSC_HAVE_CUDA)
  /* Without GPU-aware MPI, data arrived in the host rootbuf; stage it back to the device first */
  if (!use_gpu_aware_mpi && link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {
    cudaError_t err = cudaMemcpyAsync(link->rootbuf[PETSC_MEMTYPE_DEVICE],link->rootbuf[PETSC_MEMTYPE_HOST],link->rootbuflen*link->unitbytes,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err);
  }
#endif

  for (i=0; i<2; i++) {
    if (p[i].count) {
      ierr = PetscSFPackGetFetchAndOp(link,link->rootmtype,op,p[i].atomic,&FetchAndOp);CHKERRQ(ierr);
      ierr = (*FetchAndOp)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);
    }
  }
#if defined(PETSC_HAVE_CUDA)
  /* Make sure the async copy/fetch on the stream has completed before returning */
  if (link->rootmtype == PETSC_MEMTYPE_DEVICE && p[1].count) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
#endif
  PetscFunctionReturn(0);
}

/* Build (un)packing optimization plans for both self and remote, leaf and root sides,
   and (with CUDA) record whether index arrays contain duplicates so kernels can avoid atomics. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackSetupOptimizations_Basic(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  ierr = PetscSFPackOptCreate(sf->ndranks, sf->roffset, sf->rmine, &sf->selfleafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptCreate(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptCreate(bas->ndiranks, bas->ioffset, bas->irootloc,&bas->selfrootpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptCreate(bas->niranks-bas->ndiranks,bas->ioffset+bas->ndiranks,bas->irootloc,&bas->rootpackopt);CHKERRQ(ierr);

#if defined(PETSC_HAVE_CUDA)
  /* Check duplicates in rmine[] and irootloc[] so CUDA packing kernels can use cheaper regular operations
     instead of atomics to unpack data on leaves/roots, when they know there is no data race.
   */
  ierr = PetscCheckDupsInt(sf->roffset[sf->ndranks], sf->rmine, &sf->selfleafdups);CHKERRQ(ierr);
  ierr = PetscCheckDupsInt(sf->roffset[sf->nranks]-sf->roffset[sf->ndranks], sf->rmine+sf->roffset[sf->ndranks], &sf->remoteleafdups);CHKERRQ(ierr);
  ierr = PetscCheckDupsInt(bas->ioffset[bas->ndiranks], bas->irootloc, &bas->selfrootdups);CHKERRQ(ierr);
  ierr = PetscCheckDupsInt(bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks],bas->irootloc+bas->ioffset[bas->ndiranks],&bas->remoterootdups);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/* Destroy the packing optimization plans and reset the dup flags to their conservative defaults */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackDestroyOptimizations_Basic(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  ierr = PetscSFPackOptDestroy(&sf->leafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptDestroy(&sf->selfleafpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptDestroy(&bas->rootpackopt);CHKERRQ(ierr);
  ierr = PetscSFPackOptDestroy(&bas->selfrootpackopt);CHKERRQ(ierr);
#if defined(PETSC_HAVE_CUDA)
  sf->selfleafdups    = PETSC_TRUE;
  sf->remoteleafdups  = PETSC_TRUE;
  bas->selfrootdups   = PETSC_TRUE; /* The default is assuming there are dups so that atomics are used. */
  bas->remoterootdups = PETSC_TRUE;
#endif
  PetscFunctionReturn(0);
}

/* Wait for all outstanding root and leaf MPI requests of the given direction to complete.
   Without GPU-aware MPI the requests are always posted on host buffers, hence the memtype override. */
PETSC_STATIC_INLINE PetscErrorCode PetscSFPackWaitall_Basic(PetscSFPack link,PetscSFDirection direction)
{
  PetscErrorCode ierr;
  PetscMemType   rootmtype,leafmtype;

  PetscFunctionBegin;
  if (use_gpu_aware_mpi) {
    rootmtype = link->rootmtype;
    leafmtype = link->leafmtype;
  } else {
    rootmtype = PETSC_MEMTYPE_HOST;
    leafmtype = PETSC_MEMTYPE_HOST;
  }
  ierr = MPI_Waitall(link->nrootreqs,link->rootreqs[direction][rootmtype],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = MPI_Waitall(link->nleafreqs,link->leafreqs[direction][leafmtype],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode PetscSFSetUp_Basic(PetscSF);
PETSC_INTERN PetscErrorCode PetscSFView_Basic(PetscSF,PetscViewer);
PETSC_INTERN PetscErrorCode PetscSFReset_Basic(PetscSF);
PETSC_INTERN PetscErrorCode PetscSFDestroy_Basic(PetscSF);
PETSC_INTERN PetscErrorCode PetscSFBcastAndOpEnd_Basic (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType, void*, MPI_Op);
PETSC_INTERN PetscErrorCode PetscSFReduceEnd_Basic (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType, void*, MPI_Op);
PETSC_INTERN PetscErrorCode PetscSFFetchAndOpBegin_Basic(PetscSF,MPI_Datatype,PetscMemType, void*,PetscMemType,const void*,void*,MPI_Op);
PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*);
PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedLeafSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*);
PETSC_INTERN PetscErrorCode PetscSFGetLeafRanks_Basic(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**);
PETSC_INTERN PetscErrorCode PetscSFPackGet_Basic_Common(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,PetscInt,PetscInt,PetscSFPack*);
#endif