#pragma once

#include <petscvec.h>
#include <petscsf.h>
#include <petsc/private/deviceimpl.h>
#include <petsc/private/petscimpl.h>

PETSC_EXTERN PetscLogEvent PETSCSF_SetGraph;
PETSC_EXTERN PetscLogEvent PETSCSF_SetUp;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_ReduceBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_ReduceEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_EmbedSF;
PETSC_EXTERN PetscLogEvent PETSCSF_DistSect;
PETSC_EXTERN PetscLogEvent PETSCSF_SectSF;
PETSC_EXTERN PetscLogEvent PETSCSF_RemoteOff;
PETSC_EXTERN PetscLogEvent PETSCSF_Pack;
PETSC_EXTERN PetscLogEvent PETSCSF_Unpack;

struct _PetscSFOps {
  PetscErrorCode (*Reset)(PetscSF);
  PetscErrorCode (*Destroy)(PetscSF);
  PetscErrorCode (*SetUp)(PetscSF);
  PetscErrorCode (*SetFromOptions)(PetscSF, PetscOptionItems);
  PetscErrorCode (*View)(PetscSF, PetscViewer);
  PetscErrorCode (*Duplicate)(PetscSF, PetscSFDuplicateOption, PetscSF);
  PetscErrorCode (*BcastBegin)(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, void *, MPI_Op);
  PetscErrorCode (*BcastEnd)(PetscSF, MPI_Datatype, const void *, void *, MPI_Op);
  PetscErrorCode (*ReduceBegin)(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, void *, MPI_Op);
  PetscErrorCode (*ReduceEnd)(PetscSF, MPI_Datatype, const void *, void *, MPI_Op);
  PetscErrorCode (*FetchAndOpBegin)(PetscSF, MPI_Datatype, PetscMemType, void *, PetscMemType, const void *, void *, MPI_Op);
  PetscErrorCode (*FetchAndOpEnd)(PetscSF, MPI_Datatype, void *, const void *, void *, MPI_Op);
  PetscErrorCode (*BcastToZero)(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, void *); /* For internal use only */
  PetscErrorCode (*GetRootRanks)(PetscSF, PetscMPIInt *, const PetscMPIInt **, const PetscInt **, const PetscInt **, const PetscInt **);
  PetscErrorCode (*GetLeafRanks)(PetscSF, PetscMPIInt *, const PetscMPIInt **, const PetscInt **, const PetscInt **);
  PetscErrorCode (*CreateLocalSF)(PetscSF, PetscSF *);
  PetscErrorCode (*GetGraph)(PetscSF, PetscInt *, PetscInt *, const PetscInt **, const PetscSFNode **);
  PetscErrorCode (*CreateEmbeddedRootSF)(PetscSF, PetscInt, const PetscInt *, PetscSF *);
  PetscErrorCode (*CreateEmbeddedLeafSF)(PetscSF, PetscInt, const PetscInt *, PetscSF *);
  PetscErrorCode (*SetCommunicationOps)(PetscSF, PetscSFLink);

  PetscErrorCode (*Malloc)(PetscMemType, size_t, void **);
  PetscErrorCode (*Free)(PetscMemType, void *);
};

typedef struct _n_PetscSFPackOpt *PetscSFPackOpt;
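/* A minimal sketch, assuming the usual PETSc plugin pattern, of how an SF implementation fills this
   ops table in its type-specific create routine. The type and the MyImpl_* routine names below are
   hypothetical, not part of PETSc; a real implementation also sets sf->data to its private context.

     static PetscErrorCode PetscSFCreate_MyImpl(PetscSF sf)
     {
       PetscFunctionBegin;
       sf->ops->SetUp       = PetscSFSetUp_MyImpl;       // build rank/offset tables
       sf->ops->Reset       = PetscSFReset_MyImpl;       // release communication buffers
       sf->ops->Destroy     = PetscSFDestroy_MyImpl;     // free sf->data
       sf->ops->BcastBegin  = PetscSFBcastBegin_MyImpl;  // start root-to-leaf traffic
       sf->ops->BcastEnd    = PetscSFBcastEnd_MyImpl;    // finish root-to-leaf traffic
       sf->ops->ReduceBegin = PetscSFReduceBegin_MyImpl; // start leaf-to-root traffic
       sf->ops->ReduceEnd   = PetscSFReduceEnd_MyImpl;   // finish leaf-to-root traffic
       PetscFunctionReturn(PETSC_SUCCESS);
     }
*/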
struct _p_PetscSF {
  PETSCHEADER(struct _PetscSFOps);
  struct {                         /* Fields needed to implement VecScatter behavior */
    PetscInt from_n, to_n;         /* Recorded local sizes of the input from/to vectors in VecScatterCreate(). Used subsequently for error checking. */
    PetscBool beginandendtogether; /* Indicates that the scatter begin and end functions are called together; VecScatterEnd() is then treated as a no-op */
    const PetscScalar *xdata;      /* Vector data to read from */
    PetscScalar *ydata;            /* Vector data to write to. The two pointers are recorded in VecScatterBegin(). Memory is not managed by SF. */
    PetscSF lsf;                   /* The local part of the scatter, used in SCATTER_LOCAL. Built on demand. */
    PetscInt bs;                   /* Block size, determined by the IS passed to VecScatterCreate() */
    MPI_Datatype unit;             /* One unit = bs PetscScalars */
    PetscBool logging;             /* Indicates whether vscat logging events are active. If so, skip duplicate SF logging to keep -log_view output clean */
  } vscat;

  /* Fields for generic PetscSF functionality */
  PetscInt nroots;  /* Number of root vertices on current process (candidates for incoming edges) */
  PetscInt nleaves; /* Number of leaf vertices on current process (this process specifies a root for each leaf) */
  PetscInt *mine;   /* Location of leaves in leafdata arrays provided to the communication routines */
  PetscInt *mine_alloc;
  PetscInt minleaf, maxleaf;
  PetscSFNode *remote; /* Remote references to roots for each local leaf */
  PetscSFNode *remote_alloc;
  PetscMPIInt nranks;   /* Number of ranks owning roots connected to my leaves */
  PetscMPIInt ndranks;  /* Number of ranks in the distinguished group holding roots connected to my leaves */
  PetscMPIInt *ranks;   /* List of ranks referenced by "remote" */
  PetscInt *roffset;    /* Array of length nranks+1, offset in rmine/rremote for each rank */
  PetscInt *rmine;      /* Concatenated array holding local indices referencing each remote rank */
  PetscInt *rmine_d[2]; /* A copy of rmine[local/remote] in device memory if needed */
  PetscBool monitor;    /* Monitor the SF communication */

  /* Some results useful in packing, obtained by analyzing rmine[] */
  PetscInt leafbuflen[2];          /* Length (in units) of leaf buffers, in layout of [PETSCSF_LOCAL/REMOTE] */
  PetscBool leafcontig[2];         /* True means indices in rmine[self part] or rmine[remote part] are contiguous, and they start from ... */
  PetscInt leafstart[2];           /* ... leafstart[0] and leafstart[1] respectively */
  PetscSFPackOpt leafpackopt[2];   /* Optimization plans to (un)pack leaves connected to remote roots, based on index patterns in rmine[]. NULL for no optimization */
  PetscSFPackOpt leafpackopt_d[2]; /* Copy of leafpackopt[] on device if needed */
  PetscBool leafdups[2];           /* Do indices in rmine[] for self(0)/remote(1) communication have duplicates? TRUE implies threads working on them in parallel may have a data race. */

  PetscMPIInt nleafreqs;  /* Number of MPI requests for leaves */
  PetscInt *rremote;      /* Concatenated array holding remote indices referenced for each remote rank */
  PetscBool degreeknown;  /* The degree is currently known, so it need not be recomputed */
  PetscInt *degree;       /* Degree of each of my root vertices */
  PetscInt *degreetmp;    /* Temporary local array for computing degree */
  PetscBool rankorder;    /* Sort ranks for gather and scatter operations */
  MPI_Group ingroup;      /* Group of processes connected to my roots */
  MPI_Group outgroup;     /* Group of processes connected to my leaves */
  PetscSF multi;          /* Internal graph used to implement gather and scatter operations */
  PetscSF rankssf;        /* Internal graph used to implement communications with root ranks */
  PetscBool graphset;     /* Flag indicating that the graph has been set, required before calling communication routines */
  PetscBool setupcalled;  /* Type and communication structures have been set up */
  PetscSFPattern pattern; /* Pattern of the graph */
  PetscBool persistent;   /* Does this SF use MPI persistent requests for communication? */
  PetscBool collective;   /* Is this SF collective? Currently only SFBASIC/SFWINDOW are not collective */
  PetscLayout map;        /* Layout of leaves over all processes when building a patterned graph */
  PetscBool unknown_input_stream; /* If true, SF does not know which streams root/leaf data is on. Default is false, since we only use the PETSc default stream */
  PetscBool use_gpu_aware_mpi;    /* If true, SF assumes it can pass GPU pointers to MPI */
  PetscBool use_stream_aware_mpi; /* If true, SF assumes the underlying MPI is CUDA-stream aware and we won't sync streams for send/recv buffers passed to MPI */
  PetscInt maxResidentThreadsPerGPU;
  PetscBool allow_multi_leaves;
  PetscSFBackend backend; /* The device backend (if any) SF will use */
  void *data;             /* Pointer to the implementation */

#if defined(PETSC_HAVE_NVSHMEM)
  PetscBool use_nvshmem;                 /* Try to use NVSHMEM on CUDA devices with this SF when possible */
  PetscBool use_nvshmem_get;             /* If true, use the nvshmem_get-based protocol; otherwise use the nvshmem_put-based protocol */
  PetscBool checked_nvshmem_eligibility; /* Have we checked eligibility of using NVSHMEM on this SF? */
  PetscBool setup_nvshmem;               /* Have we already set up the NVSHMEM-related fields below? These fields are built on demand */
  PetscInt leafbuflen_rmax;              /* Max leafbuflen[REMOTE] over the communicator */
  PetscInt nRemoteRootRanks;             /* nranks - ndranks */
  PetscInt nRemoteRootRanksMax;          /* Max nranks - ndranks over the communicator */

  /* The following two fields look confusing but actually make sense: they are offsets of buffers on the remote side. We're doing one-sided communication! */
  PetscInt *rootsigdisp; /* [nRemoteRootRanks]. For my i-th remote root rank, I will access its rootsigdisp[i]-th root signal */
  PetscInt *rootbufdisp; /* [nRemoteRootRanks]. For my i-th remote root rank, I will access its root buffer at offset rootbufdisp[i], in the <unit> to be set */

  PetscInt *rootbufdisp_d;
  PetscInt *rootsigdisp_d; /* Copy of rootsigdisp[] on device */
  PetscMPIInt *ranks_d;    /* Copy of the remote part of (root) ranks[] on device */
  PetscInt *roffset_d;     /* Copy of the remote part of roffset[] on device */
#endif
#if defined(PETSC_HAVE_MPIX_STREAM)
  MPIX_Stream mpi_stream;
  MPI_Comm stream_comm; /* GPU stream-aware MPI communicator */
#endif
};
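/* Illustrative reading of the packing fields above (the concrete indices are made up for the example):
   if the remote part of rmine[] is {100, 101, 102, 103}, then leafcontig[PETSCSF_REMOTE] = PETSC_TRUE
   and leafstart[PETSCSF_REMOTE] = 100, so (un)packing that part can be a single contiguous copy. If it
   is {100, 102, 103, 105}, leafcontig[PETSCSF_REMOTE] = PETSC_FALSE and a non-NULL
   leafpackopt[PETSCSF_REMOTE] would describe the optimized (un)packing plan instead. */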
PETSC_EXTERN PetscBool PetscSFRegisterAllCalled;
PETSC_EXTERN PetscErrorCode PetscSFRegisterAll(void);

PETSC_INTERN PetscErrorCode PetscSFGetDatatypeSize_Internal(MPI_Comm, MPI_Datatype, MPI_Aint *);

PETSC_INTERN PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF, PetscSF *);
PETSC_INTERN PetscErrorCode PetscSFBcastToZero_Private(PetscSF, MPI_Datatype, const void *, void *) PETSC_ATTRIBUTE_MPI_POINTER_WITH_TYPE(3, 2) PETSC_ATTRIBUTE_MPI_POINTER_WITH_TYPE(4, 2);

PETSC_INTERN PetscErrorCode MPIPetsc_Type_unwrap(MPI_Datatype, MPI_Datatype *, PetscBool *);
PETSC_INTERN PetscErrorCode MPIPetsc_Type_compare(MPI_Datatype, MPI_Datatype, PetscBool *);
PETSC_INTERN PetscErrorCode MPIPetsc_Type_compare_contig(MPI_Datatype, MPI_Datatype, PetscInt *);

PETSC_INTERN PetscErrorCode MPIPetsc_Type_get_envelope(MPI_Datatype, MPIU_Count *, MPIU_Count *, MPIU_Count *, MPIU_Count *, PetscMPIInt *);
PETSC_INTERN PetscErrorCode MPIPetsc_Type_get_contents(MPI_Datatype, MPIU_Count, MPIU_Count, MPIU_Count, MPIU_Count, int *, MPI_Aint *, MPIU_Count *, MPI_Datatype *);

#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
#define MPIU_Ibcast(a, b, c, d, e, req) MPI_Ibcast(a, b, c, d, e, req)
#define MPIU_Ireduce(a, b, c, d, e, f, g, req) MPI_Ireduce(a, b, c, d, e, f, g, req)
#define MPIU_Iscatter(a, b, c, d, e, f, g, h, req) MPI_Iscatter(a, b, c, d, e, f, g, h, req)
#define MPIU_Iscatterv(a, b, c, d, e, f, g, h, i, req) MPI_Iscatterv(a, b, c, d, e, f, g, h, i, req)
#define MPIU_Igather(a, b, c, d, e, f, g, h, req) MPI_Igather(a, b, c, d, e, f, g, h, req)
#define MPIU_Igatherv(a, b, c, d, e, f, g, h, i, req) MPI_Igatherv(a, b, c, d, e, f, g, h, i, req)
#define MPIU_Iallgather(a, b, c, d, e, f, g, req) MPI_Iallgather(a, b, c, d, e, f, g, req)
#define MPIU_Iallgatherv(a, b, c, d, e, f, g, h, req) MPI_Iallgatherv(a, b, c, d, e, f, g, h, req)
#define MPIU_Ialltoall(a, b, c, d, e, f, g, req) MPI_Ialltoall(a, b, c, d, e, f, g, req)
#else
/* Ignore req, the MPI_Request argument, and use MPI blocking collectives. One should initialize req
   to MPI_REQUEST_NULL so that MPI_Wait(req, status) can be called regardless of whether the call was
   blocking or not.
*/
#define MPIU_Ibcast(a, b, c, d, e, req) MPI_Bcast(a, b, c, d, e)
#define MPIU_Ireduce(a, b, c, d, e, f, g, req) MPI_Reduce(a, b, c, d, e, f, g)
#define MPIU_Iscatter(a, b, c, d, e, f, g, h, req) MPI_Scatter(a, b, c, d, e, f, g, h)
#define MPIU_Iscatterv(a, b, c, d, e, f, g, h, i, req) MPI_Scatterv(a, b, c, d, e, f, g, h, i)
#define MPIU_Igather(a, b, c, d, e, f, g, h, req) MPI_Gather(a, b, c, d, e, f, g, h)
#define MPIU_Igatherv(a, b, c, d, e, f, g, h, i, req) MPI_Gatherv(a, b, c, d, e, f, g, h, i)
#define MPIU_Iallgather(a, b, c, d, e, f, g, req) MPI_Allgather(a, b, c, d, e, f, g)
#define MPIU_Iallgatherv(a, b, c, d, e, f, g, h, req) MPI_Allgatherv(a, b, c, d, e, f, g, h)
#define MPIU_Ialltoall(a, b, c, d, e, f, g, req) MPI_Alltoall(a, b, c, d, e, f, g)
#endif
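/* A minimal usage sketch of the convention described above (the buffer, count, and communicator names
   are illustrative, not from this header): initialize the request to MPI_REQUEST_NULL so the same
   completion code works whether MPIU_Ibcast() expanded to MPI_Ibcast() or to the blocking MPI_Bcast()
   fallback, in which case MPI_Wait() on MPI_REQUEST_NULL is a no-op.

     MPI_Request req = MPI_REQUEST_NULL;

     PetscCallMPI(MPIU_Ibcast(buf, count, MPIU_INT, 0, comm, &req));
     // ... overlap independent work here when the nonblocking form is available ...
     PetscCallMPI(MPI_Wait(&req, MPI_STATUS_IGNORE));
*/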
PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode VecScatterGetRemoteCount_Private(VecScatter, PetscBool, PetscInt *, PetscInt *);
PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode VecScatterGetRemote_Private(VecScatter, PetscBool, PetscMPIInt *, const PetscInt **, const PetscInt **, const PetscMPIInt **, PetscInt *);
PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode VecScatterGetRemoteOrdered_Private(VecScatter, PetscBool, PetscMPIInt *, const PetscInt **, const PetscInt **, const PetscMPIInt **, PetscInt *);
PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode VecScatterRestoreRemote_Private(VecScatter, PetscBool, PetscMPIInt *, const PetscInt **, const PetscInt **, const PetscMPIInt **, PetscInt *);
PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode VecScatterRestoreRemoteOrdered_Private(VecScatter, PetscBool, PetscMPIInt *, const PetscInt **, const PetscInt **, const PetscMPIInt **, PetscInt *);

#if defined(PETSC_HAVE_CUDA)
PETSC_INTERN PetscErrorCode PetscSFMalloc_CUDA(PetscMemType, size_t, void **);
PETSC_INTERN PetscErrorCode PetscSFFree_CUDA(PetscMemType, void *);
#endif
#if defined(PETSC_HAVE_HIP)
PETSC_INTERN PetscErrorCode PetscSFMalloc_HIP(PetscMemType, size_t, void **);
PETSC_INTERN PetscErrorCode PetscSFFree_HIP(PetscMemType, void *);
#endif
#if defined(PETSC_HAVE_KOKKOS)
PETSC_INTERN PetscErrorCode PetscSFMalloc_Kokkos(PetscMemType, size_t, void **);
PETSC_INTERN PetscErrorCode PetscSFFree_Kokkos(PetscMemType, void *);
#endif
/* SF only supports CUDA, HIP, and Kokkos devices. Even though VIENNACL is a device type, its device
   pointers are invisible to SF. Through VecGetArray(), we copy data of VECVIENNACL from device to host
   and pass host pointers to SF.
*/
#if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_KOKKOS) || defined(PETSC_HAVE_HIP)
#define PetscSFMalloc(sf, mtype, sz, ptr) ((*(sf)->ops->Malloc)(mtype, sz, ptr))
/* Free memory and set ptr to NULL on success */
#define PetscSFFree(sf, mtype, ptr) ((PetscErrorCode)((ptr) && ((*(sf)->ops->Free)(mtype, ptr) || ((ptr) = NULL, PETSC_SUCCESS))))
#else
/* Pure host code; avoid the extra indirection */
#define PetscSFMalloc(sf, mtype, sz, ptr) PetscMalloc(sz, ptr)
#define PetscSFFree(sf, mtype, ptr) PetscFree(ptr)
#endif
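/* A usage sketch for the two macros above (the buffer name and size are illustrative): PetscSFMalloc()
   takes the address of the pointer, while PetscSFFree() takes the pointer itself and resets it to NULL
   on success, mirroring PetscMalloc()/PetscFree() in the host-only branch.

     PetscScalar *buf = NULL;

     PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_HOST, 16 * sizeof(PetscScalar), (void **)&buf));
     // ... pack into / unpack from buf ...
     PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_HOST, buf)); // buf is NULL afterwards
*/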