xref: /petsc/include/petsc/private/sfimpl.h (revision a02648fdf9ec0d41d7b5ca02cb70ddcfa0e65728)
1 #pragma once
2 
3 #include <petscvec.h>
4 #include <petscsf.h>
5 #include <petsc/private/deviceimpl.h>
6 #include <petsc/private/petscimpl.h>
7 
8 PETSC_EXTERN PetscLogEvent PETSCSF_SetGraph;
9 PETSC_EXTERN PetscLogEvent PETSCSF_SetUp;
10 PETSC_EXTERN PetscLogEvent PETSCSF_BcastBegin;
11 PETSC_EXTERN PetscLogEvent PETSCSF_BcastEnd;
12 PETSC_EXTERN PetscLogEvent PETSCSF_BcastBegin;
13 PETSC_EXTERN PetscLogEvent PETSCSF_BcastEnd;
14 PETSC_EXTERN PetscLogEvent PETSCSF_ReduceBegin;
15 PETSC_EXTERN PetscLogEvent PETSCSF_ReduceEnd;
16 PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpBegin;
17 PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpEnd;
18 PETSC_EXTERN PetscLogEvent PETSCSF_EmbedSF;
19 PETSC_EXTERN PetscLogEvent PETSCSF_DistSect;
20 PETSC_EXTERN PetscLogEvent PETSCSF_SectSF;
21 PETSC_EXTERN PetscLogEvent PETSCSF_RemoteOff;
22 PETSC_EXTERN PetscLogEvent PETSCSF_Pack;
23 PETSC_EXTERN PetscLogEvent PETSCSF_Unpack;
24 
/* Function table (vtable) of a PetscSF implementation (e.g. SFBASIC, SFWINDOW, SFNEIGHBOR).
   Each concrete SF type fills in the entries it supports; unsupported entries stay NULL. */
struct _PetscSFOps {
  PetscErrorCode (*Reset)(PetscSF);                                        /* Release graph-dependent internal state, keep the type */
  PetscErrorCode (*Destroy)(PetscSF);                                      /* Free type-specific data (the "data" pointer) */
  PetscErrorCode (*SetUp)(PetscSF);                                        /* Build communication structures after the graph is set */
  PetscErrorCode (*SetFromOptions)(PetscSF, PetscOptionItems);
  PetscErrorCode (*View)(PetscSF, PetscViewer);
  PetscErrorCode (*Duplicate)(PetscSF, PetscSFDuplicateOption, PetscSF);
  /* Begin/End pairs take the memory type of the root and leaf buffers so device pointers can be handled */
  PetscErrorCode (*BcastBegin)(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, void *, MPI_Op);
  PetscErrorCode (*BcastEnd)(PetscSF, MPI_Datatype, const void *, void *, MPI_Op);
  PetscErrorCode (*ReduceBegin)(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, void *, MPI_Op);
  PetscErrorCode (*ReduceEnd)(PetscSF, MPI_Datatype, const void *, void *, MPI_Op);
  PetscErrorCode (*FetchAndOpBegin)(PetscSF, MPI_Datatype, PetscMemType, void *, PetscMemType, const void *, void *, MPI_Op);
  PetscErrorCode (*FetchAndOpEnd)(PetscSF, MPI_Datatype, void *, const void *, void *, MPI_Op);
  PetscErrorCode (*BcastToZero)(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, void *); /* For internal use only */
  PetscErrorCode (*GetRootRanks)(PetscSF, PetscMPIInt *, const PetscMPIInt **, const PetscInt **, const PetscInt **, const PetscInt **);
  PetscErrorCode (*GetLeafRanks)(PetscSF, PetscMPIInt *, const PetscMPIInt **, const PetscInt **, const PetscInt **);
  PetscErrorCode (*CreateLocalSF)(PetscSF, PetscSF *);
  PetscErrorCode (*GetGraph)(PetscSF, PetscInt *, PetscInt *, const PetscInt **, const PetscSFNode **);
  PetscErrorCode (*CreateEmbeddedRootSF)(PetscSF, PetscInt, const PetscInt *, PetscSF *);
  PetscErrorCode (*CreateEmbeddedLeafSF)(PetscSF, PetscInt, const PetscInt *, PetscSF *);
  PetscErrorCode (*SetCommunicationOps)(PetscSF, PetscSFLink); /* Install pack/unpack & send/recv routines on a link */

  /* Memory-type-aware allocation used for communication buffers (host or device) */
  PetscErrorCode (*Malloc)(PetscMemType, size_t, void **);
  PetscErrorCode (*Free)(PetscMemType, void *);
};
50 
typedef struct _n_PetscSFPackOpt *PetscSFPackOpt; /* Opaque handle to a pack/unpack optimization plan; the struct is defined in the SF implementation sources */
52 
/* The PetscSF object: a star forest describing a communication graph between
   local "leaf" vertices and (possibly remote) "root" vertices. Also doubles as
   the implementation of VecScatter (see the vscat sub-struct). */
struct _p_PetscSF {
  PETSCHEADER(struct _PetscSFOps);
  struct {                                  /* Fields needed to implement VecScatter behavior */
    PetscInt           from_n, to_n;        /* Recorded local sizes of the input from/to vectors in VecScatterCreate(). Used subsequently for error checking. */
    PetscBool          beginandendtogether; /* Indicates that the scatter begin and end  function are called together, VecScatterEnd() is then treated as a nop */
    const PetscScalar *xdata;               /* Vector data to read from */
    PetscScalar       *ydata;               /* Vector data to write to. The two pointers are recorded in VecScatterBegin. Memory is not managed by SF. */
    PetscSF            lsf;                 /* The local part of the scatter, used in SCATTER_LOCAL. Built on demand. */
    PetscInt           bs;                  /* Block size, determined by IS passed to VecScatterCreate */
    MPI_Datatype       unit;                /* one unit = bs PetscScalars */
    PetscBool          logging;             /* Indicate if vscat log events are happening. If yes, avoid duplicated SF logging to have clear -log_view */
  } vscat;

  /* Fields for generic PetscSF functionality */
  PetscInt     nroots;  /* Number of root vertices on current process (candidates for incoming edges) */
  PetscInt     nleaves; /* Number of leaf vertices on current process (this process specifies a root for each leaf) */
  PetscInt    *mine;    /* Location of leaves in leafdata arrays provided to the communication routines */
  PetscInt    *mine_alloc;  /* Underlying allocation backing mine[] when SF owns the memory */
  PetscInt     minleaf, maxleaf; /* Smallest/largest leaf index (range covered by mine[]) */
  PetscSFNode *remote; /* Remote references to roots for each local leaf */
  PetscSFNode *remote_alloc; /* Underlying allocation backing remote[] when SF owns the memory */
  PetscMPIInt  nranks;     /* Number of ranks owning roots connected to my leaves */
  PetscMPIInt  ndranks;    /* Number of ranks in distinguished group holding roots connected to my leaves */
  PetscMPIInt *ranks;      /* List of ranks referenced by "remote" */
  PetscInt    *roffset;    /* Array of length nranks+1, offset in rmine/rremote for each rank */
  PetscInt    *rmine;      /* Concatenated array holding local indices referencing each remote rank */
  PetscInt    *rmine_d[2]; /* A copy of rmine[local/remote] in device memory if needed */
  PetscBool    monitor;    /* monitor the sf communication */

  /* Some results useful in packing by analyzing rmine[] */
  PetscInt       leafbuflen[2];    /* Length (in unit) of leaf buffers, in layout of [PETSCSF_LOCAL/REMOTE] */
  PetscBool      leafcontig[2];    /* True means indices in rmine[self part] or rmine[remote part] are contiguous, and they start from ... */
  PetscInt       leafstart[2];     /* ... leafstart[0] and leafstart[1] respectively */
  PetscSFPackOpt leafpackopt[2];   /* Optimization plans to (un)pack leaves connected to remote roots, based on index patterns in rmine[]. NULL for no optimization */
  PetscSFPackOpt leafpackopt_d[2]; /* Copy of leafpackopt_d[] on device if needed */
  PetscBool      leafdups[2];      /* Indices in rmine[] for self(0)/remote(1) communication have dups respectively? TRUE implies threads working on them in parallel may have data race. */

  PetscMPIInt    nleafreqs;            /* Number of MPI requests for leaves */
  PetscInt      *rremote;              /* Concatenated array holding remote indices referenced for each remote rank */
  PetscBool      degreeknown;          /* The degree is currently known, do not have to recompute */
  PetscInt      *degree;               /* Degree of each of my root vertices */
  PetscInt      *degreetmp;            /* Temporary local array for computing degree */
  PetscBool      rankorder;            /* Sort ranks for gather and scatter operations */
  MPI_Group      ingroup;              /* Group of processes connected to my roots */
  MPI_Group      outgroup;             /* Group of processes connected to my leaves */
  PetscSF        multi;                /* Internal graph used to implement gather and scatter operations */
  PetscSF        rankssf;              /* Internal graph used to implement communications with root ranks */
  PetscBool      graphset;             /* Flag indicating that the graph has been set, required before calling communication routines */
  PetscBool      setupcalled;          /* Type and communication structures have been set up */
  PetscSFPattern pattern;              /* Pattern of the graph */
  PetscBool      persistent;           /* Does this SF use MPI persistent requests for communication */
  PetscBool      collective;           /* Is this SF collective? Currently only SFBASIC/SFWINDOW are not collective */
  PetscLayout    map;                  /* Layout of leaves over all processes when building a patterned graph */
  PetscBool      unknown_input_stream; /* If true, SF does not know which streams root/leafdata is on. Default is false, since we only use PETSc default stream */
  PetscBool      use_gpu_aware_mpi;    /* If true, SF assumes it can pass GPU pointers to MPI */
  PetscBool      use_stream_aware_mpi; /* If true, SF assumes the underlying MPI is cuda-stream aware and we won't sync streams for send/recv buffers passed to MPI */
  PetscInt       maxResidentThreadsPerGPU; /* Max number of concurrently resident threads on a GPU; used to size device kernels */
  PetscBool      allow_multi_leaves;   /* Allow multiple leaves to reference the same root (NOTE(review): inferred from name; confirm against setters) */
  PetscSFBackend backend; /* The device backend (if any) SF will use */
  void          *data;    /* Pointer to implementation */

#if defined(PETSC_HAVE_NVSHMEM)
  PetscBool use_nvshmem;                 /* TRY to use nvshmem on cuda devices with this SF when possible */
  PetscBool use_nvshmem_get;             /* If true, use nvshmem_get based protocol, otherwise, use nvshmem_put based protocol */
  PetscBool checked_nvshmem_eligibility; /* Have we checked eligibility of using NVSHMEM on this sf? */
  PetscBool setup_nvshmem;               /* Have we already set up NVSHMEM related fields below? These fields are built on-demand */
  PetscInt  leafbuflen_rmax;             /* max leafbuflen[REMOTE] over comm */
  PetscInt  nRemoteRootRanks;            /* nranks - ndranks */
  PetscInt  nRemoteRootRanksMax;         /* max nranks-ndranks over comm */

  /* The following two fields look confusing but actually make sense: They are offsets of buffers at the remote side. We're doing one-sided communication! */
  PetscInt *rootsigdisp; /* [nRemoteRootRanks]. For my i-th remote root rank, I will access its rootsigdisp[i]-th root signal */
  PetscInt *rootbufdisp; /* [nRemoteRootRanks]. For my i-th remote root rank, I will access its root buf at offset rootbufdisp[i], in <unit> to be set */

  PetscInt    *rootbufdisp_d; /* Copy of rootbufdisp[] on device */
  PetscInt    *rootsigdisp_d; /* Copy of rootsigdisp[] on device */
  PetscMPIInt *ranks_d;       /* Copy of the remote part of (root) ranks[] on device */
  PetscInt    *roffset_d;     /* Copy of the remote part of roffset[] on device */
#endif
#if defined(PETSC_HAVE_MPIX_STREAM)
  MPIX_Stream mpi_stream;
  MPI_Comm    stream_comm; /* gpu stream aware MPI communicator */
#endif
};
137 
/* Type registration for PetscSF implementations */
PETSC_EXTERN PetscBool      PetscSFRegisterAllCalled;
PETSC_EXTERN PetscErrorCode PetscSFRegisterAll(void);

/* Query the size in bytes of an MPI datatype (erroring on comm) */
PETSC_INTERN PetscErrorCode PetscSFGetDatatypeSize_Internal(MPI_Comm, MPI_Datatype, MPI_Aint *);

/* Internal helpers: local-part SF extraction and broadcast of roots to rank 0 */
PETSC_INTERN PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF, PetscSF *);
PETSC_INTERN PetscErrorCode PetscSFBcastToZero_Private(PetscSF, MPI_Datatype, const void *, void *) PETSC_ATTRIBUTE_MPI_POINTER_WITH_TYPE(3, 2) PETSC_ATTRIBUTE_MPI_POINTER_WITH_TYPE(4, 2);

/* MPI datatype introspection utilities (unwrap duplicated types, compare for equality/contiguity) */
PETSC_INTERN PetscErrorCode MPIPetsc_Type_unwrap(MPI_Datatype, MPI_Datatype *, PetscBool *);
PETSC_INTERN PetscErrorCode MPIPetsc_Type_compare(MPI_Datatype, MPI_Datatype, PetscBool *);
PETSC_INTERN PetscErrorCode MPIPetsc_Type_compare_contig(MPI_Datatype, MPI_Datatype, PetscInt *);

/* Large-count (MPIU_Count) wrappers around MPI_Type_get_envelope/contents */
PETSC_INTERN PetscErrorCode MPIPetsc_Type_get_envelope(MPI_Datatype, MPIU_Count *, MPIU_Count *, MPIU_Count *, MPIU_Count *, PetscMPIInt *);
PETSC_INTERN PetscErrorCode MPIPetsc_Type_get_contents(MPI_Datatype, MPIU_Count, MPIU_Count, MPIU_Count, MPIU_Count, int *, MPI_Aint *, MPIU_Count *, MPI_Datatype *);
152 
/* MPIU_I* wrappers: map to true MPI nonblocking collectives when available,
   otherwise fall back to the blocking counterparts while ignoring the request
   argument. The macro signatures are identical in both branches. */
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
  #define MPIU_Ibcast(a, b, c, d, e, req)                MPI_Ibcast(a, b, c, d, e, req)
  #define MPIU_Ireduce(a, b, c, d, e, f, g, req)         MPI_Ireduce(a, b, c, d, e, f, g, req)
  #define MPIU_Iscatter(a, b, c, d, e, f, g, h, req)     MPI_Iscatter(a, b, c, d, e, f, g, h, req)
  #define MPIU_Iscatterv(a, b, c, d, e, f, g, h, i, req) MPI_Iscatterv(a, b, c, d, e, f, g, h, i, req)
  #define MPIU_Igather(a, b, c, d, e, f, g, h, req)      MPI_Igather(a, b, c, d, e, f, g, h, req)
  #define MPIU_Igatherv(a, b, c, d, e, f, g, h, i, req)  MPI_Igatherv(a, b, c, d, e, f, g, h, i, req)
  #define MPIU_Iallgather(a, b, c, d, e, f, g, req)      MPI_Iallgather(a, b, c, d, e, f, g, req)
  #define MPIU_Iallgatherv(a, b, c, d, e, f, g, h, req)  MPI_Iallgatherv(a, b, c, d, e, f, g, h, req)
  #define MPIU_Ialltoall(a, b, c, d, e, f, g, req)       MPI_Ialltoall(a, b, c, d, e, f, g, req)
#else
  /* Ignore req, the MPI_Request argument, and use MPI blocking collectives. Callers should
     initialize req to MPI_REQUEST_NULL so that MPI_Wait(req,status) is valid regardless of
     whether the call was actually nonblocking. */
  #define MPIU_Ibcast(a, b, c, d, e, req)                MPI_Bcast(a, b, c, d, e)
  #define MPIU_Ireduce(a, b, c, d, e, f, g, req)         MPI_Reduce(a, b, c, d, e, f, g)
  #define MPIU_Iscatter(a, b, c, d, e, f, g, h, req)     MPI_Scatter(a, b, c, d, e, f, g, h)
  #define MPIU_Iscatterv(a, b, c, d, e, f, g, h, i, req) MPI_Scatterv(a, b, c, d, e, f, g, h, i)
  #define MPIU_Igather(a, b, c, d, e, f, g, h, req)      MPI_Gather(a, b, c, d, e, f, g, h)
  #define MPIU_Igatherv(a, b, c, d, e, f, g, h, i, req)  MPI_Gatherv(a, b, c, d, e, f, g, h, i)
  #define MPIU_Iallgather(a, b, c, d, e, f, g, req)      MPI_Allgather(a, b, c, d, e, f, g)
  #define MPIU_Iallgatherv(a, b, c, d, e, f, g, h, req)  MPI_Allgatherv(a, b, c, d, e, f, g, h)
  #define MPIU_Ialltoall(a, b, c, d, e, f, g, req)       MPI_Alltoall(a, b, c, d, e, f, g)
#endif
177 
/* VecScatter internals: access the remote (off-process) part of a scatter's communication pattern */
PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode VecScatterGetRemoteCount_Private(VecScatter, PetscBool, PetscInt *, PetscInt *);
PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode VecScatterGetRemote_Private(VecScatter, PetscBool, PetscMPIInt *, const PetscInt **, const PetscInt **, const PetscMPIInt **, PetscInt *);
PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode VecScatterGetRemoteOrdered_Private(VecScatter, PetscBool, PetscMPIInt *, const PetscInt **, const PetscInt **, const PetscMPIInt **, PetscInt *);
PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode VecScatterRestoreRemote_Private(VecScatter, PetscBool, PetscMPIInt *, const PetscInt **, const PetscInt **, const PetscMPIInt **, PetscInt *);
PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode VecScatterRestoreRemoteOrdered_Private(VecScatter, PetscBool, PetscMPIInt *, const PetscInt **, const PetscInt **, const PetscMPIInt **, PetscInt *);

/* Device-backend allocators installed into _PetscSFOps.Malloc/Free */
#if defined(PETSC_HAVE_CUDA)
PETSC_INTERN PetscErrorCode PetscSFMalloc_CUDA(PetscMemType, size_t, void **);
PETSC_INTERN PetscErrorCode PetscSFFree_CUDA(PetscMemType, void *);
#endif
#if defined(PETSC_HAVE_HIP)
PETSC_INTERN PetscErrorCode PetscSFMalloc_HIP(PetscMemType, size_t, void **);
PETSC_INTERN PetscErrorCode PetscSFFree_HIP(PetscMemType, void *);
#endif
#if defined(PETSC_HAVE_KOKKOS)
PETSC_INTERN PetscErrorCode PetscSFMalloc_Kokkos(PetscMemType, size_t, void **);
PETSC_INTERN PetscErrorCode PetscSFFree_Kokkos(PetscMemType, void *);
#endif
196 
/* SF only supports CUDA and Kokkos devices. Even VIENNACL is a device, its device pointers are invisible to SF.
   Through VecGetArray(), we copy data of VECVIENNACL from device to host and pass host pointers to SF.
 */
#if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_KOKKOS) || defined(PETSC_HAVE_HIP)
  #define PetscSFMalloc(sf, mtype, sz, ptr) ((*(sf)->ops->Malloc)(mtype, sz, ptr))
  /* Free memory and set ptr to NULL when succeeded.
     Expansion: a no-op (returns PETSC_SUCCESS) when ptr is NULL; otherwise calls ops->Free and,
     only on success, nulls out ptr via the comma expression. NOTE(review): because of the
     short-circuit ||, a nonzero error from ops->Free collapses to 1 rather than propagating the
     specific code — presumably acceptable since callers only test for nonzero. */
  #define PetscSFFree(sf, mtype, ptr) ((PetscErrorCode)((ptr) && ((*(sf)->ops->Free)(mtype, ptr) || ((ptr) = NULL, PETSC_SUCCESS))))
#else
  /* If pure host code, do with less indirection */
  #define PetscSFMalloc(sf, mtype, sz, ptr) PetscMalloc(sz, ptr)
  #define PetscSFFree(sf, mtype, ptr)       PetscFree(ptr)
#endif
209