/* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc. have persistent versions,
   we can also do abstractions like Prepare/StartCommunication.
*/

#include <../src/vec/is/sf/impls/basic/sfpack.h>

/* Start MPI requests. If not using GPU-aware MPI, we might need to copy data from the device buffer to the host buffer */
static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscMPIInt    nreqs;
  MPI_Request   *reqs = NULL;
  PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
  PetscInt       buflen;

  PetscFunctionBegin;
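  /* Start the receive-side requests first; here buflen is the length of the remote data we expect to receive */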
  buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
    } else { /* leaf to root */
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
    }
    PetscCallMPI(MPI_Startall_irecv(buflen, link->unit, nreqs, reqs));
  }

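  /* Then the send side: copy the source buffer from device to host if MPI is not GPU-aware, sync the stream, and start the sends */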
  buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
    } else { /* leaf to root */
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
    }
    PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction));
    PetscCallMPI(MPI_Startall_isend(buflen, link->unit, nreqs, reqs));
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
  PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after receiving */));
  } else {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
  }
  PetscFunctionReturn(0);
}

/*
   This routine creates a communication link for the given operation. It first looks in its link cache. If
   there is a free & suitable one, it reuses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we don't allocate them. Even when we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when one does not use GPU-aware MPI but the data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and need to pack/unpack data.
*/
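/* A rough sketch of how a caller is assumed to drive a link created here (not copied from any caller; see
   the SFBasic/SFNeighbor implementations for the real sequence): create the link, start the communication in
   one direction, overlap other work, then finish it.

     PetscSFLink link;
     PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, MPI_REPLACE, PETSCSF_BCAST, &link));
     PetscCall((*link->StartCommunication)(sf, link, PETSCSF_ROOT2LEAF));  // post recvs, copy device buf to host if needed, start sends
     // ... overlap computation here ...
     PetscCall((*link->FinishCommunication)(sf, link, PETSCSF_ROOT2LEAF)); // wait on requests, copy back to device if needed
*/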
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 since we will use it as a subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;

  /* Can we directly use root/leafdata (i.e., set root/leafdirect) with the given sf, sfop and op? */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                                                                                /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE;                                                          /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }
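  /* For example, a BCAST with op == MPI_REPLACE and contiguous leaves receives straight into leafdata
     (leafdirect = PETSC_TRUE); any other op needs a separate leaf buffer so the reduction can be applied
     during unpack. */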

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on the host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in the cache */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
           If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from the available list */
        goto found;
      }
    }
  }

  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

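  /* link->reqs is one flat array: the first 8*nrootreqs entries hold root requests and the next 8*nleafreqs
     entries hold leaf requests. Each half is split into 8 groups indexed by [direction][memtype][root/leafdirect],
     flattened as 4*i + 2*j + k below. */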
  for (i = 0; i < 2; i++) {     /* Two communication directions */
    for (j = 0; j < 2; j++) {   /* Two memory types */
      for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
      }
    }
  }
  link->StartCommunication  = PetscSFLinkStartRequests_MPI;
  link->FinishCommunication = PetscSFLinkWaitRequests_MPI;

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
  #endif
  }
#endif

  /* Allocate buffers for root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on the host for buffering device data in case we do not use_gpu_aware_mpi */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set the `current` state of the link. It may change between different SF invocations that reuse the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and used to init MPI requests, record it for comparison in future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}