xref: /petsc/src/vec/is/sf/impls/basic/sfmpi.c (revision 1d27aa22b2f6148b2c4e3f06a75e0638d6493e09)
1 /* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGHATERV etc have a persistent version,
2    we can also do abstractions like Prepare/StartCommunication.
3 */
4 
5 #include <../src/vec/is/sf/impls/basic/sfpack.h>
6 
7 /* Start MPI requests. If use non-GPU aware MPI, we might need to copy data from device buf to host buf */
8 static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
9 {
10   PetscMPIInt    nreqs;
11   MPI_Request   *reqs = NULL;
12   PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
13   PetscInt       buflen;
14 
15   PetscFunctionBegin;
16   buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
17   if (buflen) {
18     if (direction == PETSCSF_ROOT2LEAF) {
19       nreqs = sf->nleafreqs;
20       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
21     } else { /* leaf to root */
22       nreqs = bas->nrootreqs;
23       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
24     }
25     PetscCallMPI(MPI_Startall_irecv(buflen, link->unit, nreqs, reqs));
26   }
27 
28   buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
29   if (buflen) {
30     if (direction == PETSCSF_ROOT2LEAF) {
31       nreqs = bas->nrootreqs;
32       PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /*device2host before sending */));
33       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
34     } else { /* leaf to root */
35       nreqs = sf->nleafreqs;
36       PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE));
37       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
38     }
39     PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction));
40     PetscCallMPI(MPI_Startall_isend(buflen, link->unit, nreqs, reqs));
41   }
42   PetscFunctionReturn(PETSC_SUCCESS);
43 }
44 
45 static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
46 {
47   PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
48   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
49   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
50 
51   PetscFunctionBegin;
52   PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
53   PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
54   if (direction == PETSCSF_ROOT2LEAF) {
55     PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
56   } else {
57     PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
58   }
59   PetscFunctionReturn(PETSC_SUCCESS);
60 }
61 
62 #if defined(PETSC_HAVE_MPIX_STREAM)
63 // issue MPIX_Isend/Irecv_enqueue()
64 static PetscErrorCode PetscSFLinkStartEnqueue_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
65 {
66   PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
67   PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
68   const PetscInt    *rootoffset, *leafoffset;
69   MPI_Aint           disp;
70   MPI_Comm           stream_comm   = sf->stream_comm;
71   MPI_Datatype       unit          = link->unit;
72   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
73   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
74 
75   PetscFunctionBegin;
76   if (bas->rootbuflen[PETSCSF_REMOTE]) {
77     PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
78     if (direction == PETSCSF_LEAF2ROOT) {
79       for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
80         disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
81         cnt  = rootoffset[i + 1] - rootoffset[i];
82         PetscCallMPI(MPIX_Irecv_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
83       }
84     } else { // PETSCSF_ROOT2LEAF
85       for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
86         disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
87         cnt  = rootoffset[i + 1] - rootoffset[i];
88         // no need to sync the gpu stream!
89         PetscCallMPI(MPIX_Isend_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
90       }
91     }
92   }
93 
94   if (sf->leafbuflen[PETSCSF_REMOTE]) {
95     PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
96     if (direction == PETSCSF_LEAF2ROOT) {
97       for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
98         disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
99         cnt  = leafoffset[i + 1] - leafoffset[i];
100         // no need to sync the gpu stream!
101         PetscCallMPI(MPIX_Isend_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
102       }
103     } else { // PETSCSF_ROOT2LEAF
104       for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
105         disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
106         cnt  = leafoffset[i + 1] - leafoffset[i];
107         PetscCallMPI(MPIX_Irecv_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
108       }
109     }
110   }
111   PetscFunctionReturn(PETSC_SUCCESS);
112 }
113 
114 static PetscErrorCode PetscSFLinkWaitEnqueue_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
115 {
116   PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
117   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
118   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
119 
120   PetscFunctionBegin;
121   PetscCallMPI(MPIX_Waitall_enqueue(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
122   PetscCallMPI(MPIX_Waitall_enqueue(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
123   PetscFunctionReturn(PETSC_SUCCESS);
124 }
125 #endif
126 
127 /*
128    The routine Creates a communication link for the given operation. It first looks up its link cache. If
129    there is a free & suitable one, it uses it. Otherwise it creates a new one.
130 
131    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
132    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
133    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
134    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
135 
136    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
137 
138    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
139 
140    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
141    need pack/unpack data.
142 */
143 PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
144 {
145   PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
146   PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
147   PetscSFLink     *p, link;
148   PetscSFDirection direction;
149   MPI_Request     *reqs = NULL;
150   PetscBool        match, rootdirect[2], leafdirect[2];
151   PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */
152   PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
153   PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
154   PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI*/
155 
156   PetscFunctionBegin;
157 
158   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
159   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
160     if (sfop == PETSCSF_BCAST) {
161       rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
162       leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
163     } else if (sfop == PETSCSF_REDUCE) {
164       leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
165       rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
166     } else {                                                                                /* PETSCSF_FETCH */
167       rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always need a separate rootbuf */
168       leafdirect[i] = PETSC_FALSE;                                                          /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
169     }
170   }
171 
172   if (sf->use_gpu_aware_mpi) {
173     rootmtype_mpi = rootmtype;
174     leafmtype_mpi = leafmtype;
175   } else {
176     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
177   }
178   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
179   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
180   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;
181 
182   direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
183   nrootreqs = bas->nrootreqs;
184   nleafreqs = sf->nleafreqs;
185 
186   /* Look for free links in cache */
187   for (p = &bas->avail; (link = *p); p = &link->next) {
188     if (!link->use_nvshmem) { /* Only check with MPI links */
189       PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
190       if (match) {
191         /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with the current.
192            If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
193         */
194         if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
195           reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
196           for (i = 0; i < nrootreqs; i++) {
197             if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
198           }
199           link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
200         }
201         if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
202           reqs = link->leafreqs[direction][leafmtype][1];
203           for (i = 0; i < nleafreqs; i++) {
204             if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
205           }
206           link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
207         }
208         *p = link->next; /* Remove from available list */
209         goto found;
210       }
211     }
212   }
213 
214   PetscCall(PetscNew(&link));
215   PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
216   PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */
217 
218   nreqs = (nrootreqs + nleafreqs) * 8;
219   PetscCall(PetscMalloc1(nreqs, &link->reqs));
220   for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
221 
222   if (nreqs)
223     for (i = 0; i < 2; i++) {     /* Two communication directions */
224       for (j = 0; j < 2; j++) {   /* Two memory types */
225         for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
226           link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
227           link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
228         }
229       }
230     }
231   link->StartCommunication  = PetscSFLinkStartRequests_MPI;
232   link->FinishCommunication = PetscSFLinkWaitRequests_MPI;
233 #if defined(PETSC_HAVE_MPIX_STREAM)
234   if (sf->use_stream_aware_mpi && (PetscMemTypeDevice(rootmtype_mpi) || PetscMemTypeDevice(leafmtype_mpi))) {
235     link->StartCommunication  = PetscSFLinkStartEnqueue_MPIX_Stream;
236     link->FinishCommunication = PetscSFLinkWaitEnqueue_MPIX_Stream;
237   }
238 #endif
239 
240 found:
241 
242 #if defined(PETSC_HAVE_DEVICE)
243   if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
244   #if defined(PETSC_HAVE_CUDA)
245     if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
246   #endif
247   #if defined(PETSC_HAVE_HIP)
248     if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
249   #endif
250   #if defined(PETSC_HAVE_KOKKOS)
251     if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
252   #endif
253   }
254 #endif
255 
256   /* Allocate buffers along root/leafdata */
257   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
258     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
259     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
260     if (bas->rootbuflen[i]) {
261       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
262         link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
263       } else { /* Have to have a separate rootbuf */
264         if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
265         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
266       }
267     }
268 
269     if (sf->leafbuflen[i]) {
270       if (leafdirect[i]) {
271         link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
272       } else {
273         if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
274         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
275       }
276     }
277   }
278 
279 #if defined(PETSC_HAVE_DEVICE)
280   /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */
281   if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
282     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
283     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
284   }
285   if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
286     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
287     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
288   }
289 #endif
290 
291   /* Set `current` state of the link. They may change between different SF invocations with the same link */
292   if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
293     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
294     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
295   }
296 
297   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
298   link->leafdata = leafdata;
299   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
300     link->rootdirect[i] = rootdirect[i];
301     link->leafdirect[i] = leafdirect[i];
302   }
303   link->rootdirect_mpi = rootdirect_mpi;
304   link->leafdirect_mpi = leafdirect_mpi;
305   link->rootmtype      = rootmtype;
306   link->leafmtype      = leafmtype;
307   link->rootmtype_mpi  = rootmtype_mpi;
308   link->leafmtype_mpi  = leafmtype_mpi;
309 
310   link->next = bas->inuse;
311   bas->inuse = link;
312   *mylink    = link;
313   PetscFunctionReturn(PETSC_SUCCESS);
314 }
315