/* /petsc/src/vec/is/sf/impls/basic/neighbor/sfneighbor.c (revision d8e47b638cf8f604a99e9678e1df24f82d959cd7) */
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>
#include <petscpkg_version.h>

/* Convenience local types and wrappers */
#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
typedef MPI_Count PetscSFCount;
typedef MPI_Aint  PetscSFAint;
  #define MPIU_Neighbor_alltoallv(a, b, c, d, e, f, g, h, i)            MPI_Neighbor_alltoallv_c(a, b, c, d, e, f, g, h, i)
  #define MPIU_Neighbor_alltoallv_init(a, b, c, d, e, f, g, h, i, j, k) MPI_Neighbor_alltoallv_init_c(a, b, c, d, e, f, g, h, i, j, k)
  #define MPIU_Ineighbor_alltoallv(a, b, c, d, e, f, g, h, i, j)        MPI_Ineighbor_alltoallv_c(a, b, c, d, e, f, g, h, i, j)
#else
typedef PetscMPIInt PetscSFCount;
typedef PetscMPIInt PetscSFAint;
  #define MPIU_Neighbor_alltoallv(a, b, c, d, e, f, g, h, i)            MPI_Neighbor_alltoallv(a, b, c, d, e, f, g, h, i)
  #define MPIU_Neighbor_alltoallv_init(a, b, c, d, e, f, g, h, i, j, k) MPI_Neighbor_alltoallv_init(a, b, c, d, e, f, g, h, i, j, k)
  #define MPIU_Ineighbor_alltoallv(a, b, c, d, e, f, g, h, i, j)        MPI_Ineighbor_alltoallv(a, b, c, d, e, f, g, h, i, j)
#endif
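
/*
  Note on the wrappers above: with 64-bit PetscInt, per-neighbor counts and displacements can exceed the
  range of a C int, so when MPI provides large-count support the MPI-4 "_c" variants taking MPI_Count and
  MPI_Aint are used. Otherwise PetscSFCount/PetscSFAint fall back to PetscMPIInt and the values are
  range-checked with PetscMPIIntCast() in PetscSFSetUp_Neighbor() below.
*/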

typedef struct {
  SFBASICHEADER;
  MPI_Comm      comms[2];                  /* Communicators with distributed topology in both directions */
  PetscBool     initialized[2];            /* Are the two communicators initialized? */
  PetscSFCount *rootcounts, *leafcounts;   /* counts for non-distinguished ranks */
  PetscSFAint  *rootdispls, *leafdispls;   /* displs for non-distinguished ranks */
  PetscMPIInt  *rootweights, *leafweights; /* edge weights passed to MPI_Dist_graph_create_adjacent() */
  PetscInt      rootdegree, leafdegree;    /* number of non-distinguished root/leaf neighbor ranks */
} PetscSF_Neighbor;

/*===================================================================================*/
/*              Internal utility routines                                            */
/*===================================================================================*/

/* Log the sends/receives implied by a neighborhood collective with PETSc's MPI message counters */
static inline PetscErrorCode PetscLogMPIMessages(PetscInt nsend, PetscSFCount *sendcnts, MPI_Datatype sendtype, PetscInt nrecv, PetscSFCount *recvcnts, MPI_Datatype recvtype)
{
  PetscFunctionBegin;
  if (PetscDefined(USE_LOG)) {
    petsc_isend_ct += (PetscLogDouble)nsend;
    petsc_irecv_ct += (PetscLogDouble)nrecv;

    if (sendtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(sendtype, &typesize));
      for (i = 0; i < nsend; i++) petsc_isend_len += (PetscLogDouble)(sendcnts[i] * typesize);
    }

    if (recvtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(recvtype, &typesize));
      for (i = 0; i < nrecv; i++) petsc_irecv_len += (PetscLogDouble)(recvcnts[i] * typesize);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get the communicator with distributed graph topology, which is not cheap to build, so we create it on demand (instead of at PetscSFSetUp time) */
static PetscErrorCode PetscSFGetDistComm_Neighbor(PetscSF sf, PetscSFDirection direction, MPI_Comm *distcomm)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  if (!dat->initialized[direction]) {
    PetscMPIInt        nrootranks, ndrootranks, nleafranks, ndleafranks;
    PetscMPIInt        indegree, outdegree;
    const PetscMPIInt *rootranks, *leafranks, *sources, *destinations;
    MPI_Comm           comm, *mycomm = &dat->comms[direction];

    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, &rootranks, NULL, NULL));       /* Which ranks will access my roots (I am a destination) */
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, &leafranks, NULL, NULL, NULL)); /* Whose roots my leaves will access (I am a source) */
    indegree     = nrootranks - ndrootranks;
    outdegree    = nleafranks - ndleafranks;
    sources      = PetscSafePointerPlusOffset(rootranks, ndrootranks);
    destinations = PetscSafePointerPlusOffset(leafranks, ndleafranks);
    PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
    if (direction == PETSCSF_LEAF2ROOT) {
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, indegree, sources, dat->rootweights, outdegree, destinations, dat->leafweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    } else { /* PETSCSF_ROOT2LEAF, reverse src & dest */
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, outdegree, destinations, dat->leafweights, indegree, sources, dat->rootweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    }
    dat->initialized[direction] = PETSC_TRUE;
  }
  *distcomm = dat->comms[direction];
  PetscFunctionReturn(PETSC_SUCCESS);
}
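
/*
  How the adjacent graph maps onto the SF: in the leaf-to-root direction (reduce), a rank receives
  contributions to its roots from the ranks that reference those roots (sources) and sends its leaf
  updates to the owners of the roots its leaves reference (destinations); root-to-leaf (bcast) simply
  swaps the two lists. For illustration (a made-up two-rank layout, not taken from this file): if rank 0
  owns all roots and rank 1 only has leaves referencing them, then for PETSCSF_ROOT2LEAF rank 0 has
  destinations = {1} and rank 1 has sources = {0}.
*/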

// Start MPI_Ineighbor_alltoallv (only used for inter-process communication)
static PetscErrorCode PetscSFLinkStartCommunication_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat      = (PetscSF_Neighbor *)sf->data;
  MPI_Comm          distcomm = MPI_COMM_NULL;
  void             *rootbuf = NULL, *leafbuf = NULL;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, &rootbuf, &leafbuf, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));

  if (dat->rootdegree || dat->leafdegree) { // OpenMPI 3.0 ran into an error with rootdegree = leafdegree = 0, so we skip the call in that case
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
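
/*
  The call above is nonblocking; it only starts the exchange and records the request in req. Completion
  happens in the matching end routines inherited from SFBasic (PetscSFBcastEnd_Basic/PetscSFReduceEnd_Basic,
  registered in PetscSFCreate_Neighbor below), which wait on the link's outstanding requests before unpacking.
*/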

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
// Create (once per link/direction) the persistent neighborhood-collective request
static PetscErrorCode PetscSFLinkInitMPIRequests_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor  *dat           = (PetscSF_Neighbor *)sf->data;
  MPI_Comm           distcomm      = MPI_COMM_NULL;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi;
  MPI_Request       *req            = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  void              *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi], *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  MPI_Info           info;

  PetscFunctionBegin;
  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  if (dat->rootdegree || dat->leafdegree) {
    if (!link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      PetscCallMPI(MPI_Info_create(&info)); // currently, we don't use info
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, info, req));
      } else {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, info, req));
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
      PetscCallMPI(MPI_Info_free(&info));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
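
/*
  Persistent pattern (MPI-4): the *_init call above binds buffers, counts, and the datatype into a reusable
  request exactly once per link and direction; each subsequent operation then merely calls MPI_Start() on
  that request (see PetscSFLinkStartCommunication_Persistent_Neighbor below) and waits on it, avoiding the
  per-call setup cost of the nonblocking variant.
*/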

// Start MPI requests. If not using GPU-aware MPI, we might need to copy data from the device buffer to the host buffer
static PetscErrorCode PetscSFLinkStartCommunication_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));
  if (dat->rootdegree || dat->leafdegree) {
    PetscCallMPI(MPI_Start(req));
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode PetscSFSetCommunicationOps_Neighbor(PetscSF sf, PetscSFLink link)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  if (sf->persistent) {
    link->InitMPIRequests    = PetscSFLinkInitMPIRequests_Persistent_Neighbor;
    link->StartCommunication = PetscSFLinkStartCommunication_Persistent_Neighbor;
  } else
#endif
  {
    link->StartCommunication = PetscSFLinkStartCommunication_Neighbor;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*===================================================================================*/
/*              Implementations of SF public APIs                                    */
/*===================================================================================*/
static PetscErrorCode PetscSFSetUp_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  PetscMPIInt       nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt   *rootoffset, *leafoffset;
  PetscMPIInt       m, n, m2, n2;

  PetscFunctionBegin;
  /* SFNeighbor inherits from Basic */
  PetscCall(PetscSFSetUp_Basic(sf));
  /* SFNeighbor specific */
  PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
  PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
  dat->rootdegree = m = (PetscMPIInt)(nrootranks - ndrootranks);
  dat->leafdegree = n = (PetscMPIInt)(nleafranks - ndleafranks);
  sf->nleafreqs       = 0;
  dat->nrootreqs      = 1; // Collectives need only one MPI_Request; we just put it in rootreqs[]

  m2 = m;
  n2 = n;
#if defined(PETSC_HAVE_OPENMPI) // workaround for an OpenMPI 5.0.x bug, https://github.com/open-mpi/ompi/pull/12614
  #if PETSC_PKG_OPENMPI_VERSION_LE(5, 0, 3)
  m2 = m ? m : 1;
  n2 = n ? n : 1;
  #endif
#endif
  // Only set up MPI displs/counts for non-distinguished ranks; distinguished ranks use shared memory
  PetscCall(PetscMalloc6(m2, &dat->rootdispls, m2, &dat->rootcounts, m2, &dat->rootweights, n2, &dat->leafdispls, n2, &dat->leafcounts, n2, &dat->leafweights));

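/*
  The loops below fill, per non-distinguished neighbor, the displacement and count (in units of link->unit)
  into the packed remote buffer, plus an integer weight handed to MPI_Dist_graph_create_adjacent(). With
  64-bit indices a count may not fit in a PetscMPIInt, so the weight is the count rescaled into the int
  range while preserving relative magnitudes: weight = count / PETSC_INT_MAX * 2147483647. As an
  illustrative number (not from this file), a count of PETSC_INT_MAX/2 maps to a weight of about
  1073741823, i.e. half of INT_MAX.
*/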
#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
  for (PetscMPIInt i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    dat->rootdispls[j]  = rootoffset[i] - rootoffset[ndrootranks];
    dat->rootcounts[j]  = rootoffset[i + 1] - rootoffset[i];
    dat->rootweights[j] = (PetscMPIInt)((PetscReal)dat->rootcounts[j] / (PetscReal)PETSC_INT_MAX * 2147483647); /* Scale to range of PetscMPIInt */
  }

  for (PetscMPIInt i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    dat->leafdispls[j]  = leafoffset[i] - leafoffset[ndleafranks];
    dat->leafcounts[j]  = leafoffset[i + 1] - leafoffset[i];
    dat->leafweights[j] = (PetscMPIInt)((PetscReal)dat->leafcounts[j] / (PetscReal)PETSC_INT_MAX * 2147483647);
  }
#else
  for (PetscMPIInt i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    PetscCall(PetscMPIIntCast(rootoffset[i] - rootoffset[ndrootranks], &m));
    dat->rootdispls[j] = m;
    PetscCall(PetscMPIIntCast(rootoffset[i + 1] - rootoffset[i], &n));
    dat->rootcounts[j]  = n;
    dat->rootweights[j] = n;
  }

  for (PetscMPIInt i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    PetscCall(PetscMPIIntCast(leafoffset[i] - leafoffset[ndleafranks], &m));
    dat->leafdispls[j] = m;
    PetscCall(PetscMPIIntCast(leafoffset[i + 1] - leafoffset[i], &n));
    dat->leafcounts[j]  = n;
    dat->leafweights[j] = n;
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFReset_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  PetscCheck(!dat->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Outstanding operation has not been completed");
  PetscCall(PetscFree6(dat->rootdispls, dat->rootcounts, dat->rootweights, dat->leafdispls, dat->leafcounts, dat->leafweights));
  for (int i = 0; i < 2; i++) {
    if (dat->initialized[i]) {
      PetscCallMPI(MPI_Comm_free(&dat->comms[i]));
      dat->initialized[i] = PETSC_FALSE;
    }
  }
  PetscCall(PetscSFReset_Basic(sf)); /* Common part */
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFDestroy_Neighbor(PetscSF sf)
{
  PetscFunctionBegin;
  PetscCall(PetscSFReset_Neighbor(sf));
  PetscCall(PetscFree(sf->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFCreate_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat;

  PetscFunctionBegin;
  sf->ops->CreateEmbeddedRootSF = PetscSFCreateEmbeddedRootSF_Basic;
  sf->ops->BcastBegin           = PetscSFBcastBegin_Basic;
  sf->ops->BcastEnd             = PetscSFBcastEnd_Basic;
  sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
  sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
  sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
  sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
  sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
  sf->ops->View                 = PetscSFView_Basic;

  sf->ops->SetUp               = PetscSFSetUp_Neighbor;
  sf->ops->Reset               = PetscSFReset_Neighbor;
  sf->ops->Destroy             = PetscSFDestroy_Neighbor;
  sf->ops->SetCommunicationOps = PetscSFSetCommunicationOps_Neighbor;

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  PetscObjectOptionsBegin((PetscObject)sf);
  PetscCall(PetscOptionsBool("-sf_neighbor_persistent", "Use MPI-4 persistent neighborhood collectives; used along with -sf_type neighbor", "PetscSFCreate", sf->persistent, &sf->persistent, NULL));
  PetscOptionsEnd();
#endif
  sf->collective = PETSC_TRUE;

  PetscCall(PetscNew(&dat));
  sf->data = (void *)dat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
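
/*
  Usage sketch (not part of this file): this implementation is typically selected at runtime with
  -sf_type neighbor, or programmatically with something like

    PetscSF sf;
    PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf));
    PetscCall(PetscSFSetType(sf, PETSCSFNEIGHBOR)); // "neighbor"
    // ... PetscSFSetGraph(), then PetscSFBcast/PetscSFReduce as usual ...

  and, when MPI-4 persistent neighborhood collectives are available, -sf_neighbor_persistent (registered
  above) switches the per-link communication to the persistent variants.
*/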