xref: /petsc/src/vec/is/sf/impls/basic/sfmpi.c (revision fbf9dbe564678ed6eff1806adbc4c4f01b9743f4)
/* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc have a persistent version,
   we can also do abstractions like Prepare/StartCommunication.
*/
4 
5 #include <../src/vec/is/sf/impls/basic/sfpack.h>
6 
7 /* Start MPI requests. If use non-GPU aware MPI, we might need to copy data from device buf to host buf */
8 static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
9 {
10   PetscMPIInt    nreqs;
11   MPI_Request   *reqs = NULL;
12   PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
13   PetscInt       buflen;
14 
15   PetscFunctionBegin;
16   buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
17   if (buflen) {
18     if (direction == PETSCSF_ROOT2LEAF) {
19       nreqs = sf->nleafreqs;
20       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
21     } else { /* leaf to root */
22       nreqs = bas->nrootreqs;
23       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
24     }
25     PetscCallMPI(MPI_Startall_irecv(buflen, link->unit, nreqs, reqs));
26   }
27 
28   buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
29   if (buflen) {
30     if (direction == PETSCSF_ROOT2LEAF) {
31       nreqs = bas->nrootreqs;
32       PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /*device2host before sending */));
33       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
34     } else { /* leaf to root */
35       nreqs = sf->nleafreqs;
36       PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE));
37       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
38     }
39     PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction));
40     PetscCallMPI(MPI_Startall_isend(buflen, link->unit, nreqs, reqs));
41   }
42   PetscFunctionReturn(PETSC_SUCCESS);
43 }
44 
45 static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
46 {
47   PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
48   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
49   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
50 
51   PetscFunctionBegin;
52   PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
53   PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
54   if (direction == PETSCSF_ROOT2LEAF) {
55     PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
56   } else {
57     PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
58   }
59   PetscFunctionReturn(PETSC_SUCCESS);
60 }
61 
62 #if defined(PETSC_HAVE_MPIX_STREAM)
63 // issue MPIX_Isend/Irecv_enqueue()
// issue MPIX_Isend/Irecv_enqueue()
/* Enqueue sends/receives on the SF's MPIX stream communicator instead of starting
   persistent requests. Because the operations are enqueued on the (GPU) stream,
   no explicit stream synchronization is needed before calling MPI.

   Input parameters:
+  sf        - the star forest
.  link      - the communication link holding buffers, requests, tag and unit type
-  direction - PETSCSF_ROOT2LEAF (roots send, leaves receive) or PETSCSF_LEAF2ROOT (leaves send, roots receive)
*/
static PetscErrorCode PetscSFLinkStartEnqueue_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
  PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt    *rootoffset, *leafoffset; /* rank-wise prefix offsets into the root/leaf buffers */
  MPI_Aint           disp;
  MPI_Comm           stream_comm   = sf->stream_comm;
  MPI_Datatype       unit          = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Root side: one message per remote root rank. Ranks [0, ndrootranks) are distinguished
     (local, no MPI needed); the loop starts at ndrootranks and j indexes the request slots. */
  if (bas->rootbuflen[PETSCSF_REMOTE]) {
    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
    if (direction == PETSCSF_LEAF2ROOT) {
      for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
        /* Byte displacement of this rank's chunk within the remote root buffer */
        disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
        cnt  = rootoffset[i + 1] - rootoffset[i];
        PetscCallMPI(MPIX_Irecv_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
      }
    } else { // PETSCSF_ROOT2LEAF
      for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
        disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
        cnt  = rootoffset[i + 1] - rootoffset[i];
        // no need to sync the gpu stream!
        PetscCallMPI(MPIX_Isend_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
      }
    }
  }

  /* Leaf side: mirror of the root side, one message per remote leaf rank */
  if (sf->leafbuflen[PETSCSF_REMOTE]) {
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
    if (direction == PETSCSF_LEAF2ROOT) {
      for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
        cnt  = leafoffset[i + 1] - leafoffset[i];
        // no need to sync the gpu stream!
        PetscCallMPI(MPIX_Isend_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
      }
    } else { // PETSCSF_ROOT2LEAF
      for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
        disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
        cnt  = leafoffset[i + 1] - leafoffset[i];
        PetscCallMPI(MPIX_Irecv_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
113 
114 static PetscErrorCode PetscSFLinkWaitEnqueue_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
115 {
116   PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
117   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
118   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
119 
120   PetscFunctionBegin;
121   PetscCallMPI(MPIX_Waitall_enqueue(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
122   PetscCallMPI(MPIX_Waitall_enqueue(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
123   PetscFunctionReturn(PETSC_SUCCESS);
124 }
125 #endif
126 
/*
   This routine creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable one, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`; in other words, we do lazy memory-allocation.

   The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
   need to pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2]; /* indexed by PETSCSF_LOCAL/PETSCSF_REMOTE */
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI*/

  PetscFunctionBegin;

  /* Can we directly use root/leafdata (i.e., skip pack/unpack buffers) with the given sf, sfop and op?
     Direct use requires the data to be contiguous; unpacking directly additionally requires op == MPI_REPLACE */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                                                                                /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE;                                                          /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache; only the unit type needs to match to reuse a link */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check with MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with the current.
           If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from available list */
        goto found;
      }
    }
  }

  /* No reusable link: build a fresh one */
  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

  /* 8 request sets each for roots and leaves: 2 directions x 2 memory types x 2 direct flags */
  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i = 0; i < 2; i++) {     /* Two communication directions */
    for (j = 0; j < 2; j++) {   /* Two memory types */
      for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
      }
    }
  }
  link->StartCommunication  = PetscSFLinkStartRequests_MPI;
  link->FinishCommunication = PetscSFLinkWaitRequests_MPI;
#if defined(PETSC_HAVE_MPIX_STREAM)
  /* With stream-aware MPI and device data visible to MPI, enqueue operations on the stream instead */
  if (sf->use_stream_aware_mpi && (PetscMemTypeDevice(rootmtype_mpi) || PetscMemTypeDevice(leafmtype_mpi))) {
    link->StartCommunication  = PetscSFLinkStartEnqueue_MPIX_Stream;
    link->FinishCommunication = PetscSFLinkWaitEnqueue_MPIX_Stream;
  }
#endif

found:

#if defined(PETSC_HAVE_DEVICE)
  /* Lazily set up device-side machinery (streams etc) the first time device data is seen on this link */
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
  #endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on host for buffering data on device in case not use_gpu_aware_mpi */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  /* Move the link onto the in-use list; it goes back to bas->avail when the operation ends */
  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(PETSC_SUCCESS);
}
314