/* xref: /petsc/src/vec/is/sf/impls/basic/sfmpi.c (revision a69119a591a03a9d906b29c0a4e9802e4d7c9795) */
/* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc. have persistent versions,
   we can also add abstractions like Prepare/StartCommunication.
*/

#include <../src/vec/is/sf/impls/basic/sfpack.h>

/* Start MPI requests. If non-GPU-aware MPI is used, we might need to copy data from the device buffer to the host buffer */
static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction) {
  PetscMPIInt    nreqs;
  MPI_Request   *reqs = NULL;
  PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
  PetscInt       buflen;

  PetscFunctionBegin;
  /* Start the receives for incoming remote data */
  buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
    } else { /* leaf to root */
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
    }
    PetscCallMPI(MPI_Startall_irecv(buflen, link->unit, nreqs, reqs));
  }

  /* Stage the send buffer on the host if needed, then start the sends of outgoing remote data */
  buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = bas->nrootreqs;
      PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
    } else { /* leaf to root */
      nreqs = sf->nleafreqs;
      PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE));
      PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
    }
    PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction));
    PetscCallMPI(MPI_Startall_isend(buflen, link->unit, nreqs, reqs));
  }
  PetscFunctionReturn(0);
}
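
/*
   Illustration only (not code from this file): the standard MPI persistent-request pattern that the
   MPI_Startall_irecv()/MPI_Startall_isend() wrappers above build on. The peer, tag, communicator and
   iteration count below are made-up placeholders.

     MPI_Request reqs[2];
     double      recvbuf[16], sendbuf[16];

     MPI_Recv_init(recvbuf, 16, MPI_DOUBLE, peer, tag, comm, &reqs[0]); // init once
     MPI_Send_init(sendbuf, 16, MPI_DOUBLE, peer, tag, comm, &reqs[1]);
     for (int it = 0; it < nits; it++) {
       MPI_Startall(2, reqs);                     // start both persistent requests
       MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE); // complete them; the requests stay reusable
     }
     MPI_Request_free(&reqs[0]);
     MPI_Request_free(&reqs[1]);
*/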

/* Wait for the link's outstanding send/recv requests, then copy host buffers to the device if MPI could not access the device buffers */
static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction) {
  PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
  PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after receiving */));
  } else {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
  }
  PetscFunctionReturn(0);
}

/*
   This routine creates a communication link for the given operation. It first looks in its link cache;
   if there is a free & suitable link, it reuses it, otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be passed directly to MPI, we do not allocate them. Even when we do allocate buffers, we only allocate
   those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when one does not use GPU-aware MPI but the data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and
   need to pack/unpack data.
*/
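/*
   For context (illustrative only, not code from this file): this routine sits beneath the public PetscSF
   calls. A user-level sequence that ends up creating (or reusing) a link looks roughly like

     PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));
     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));

   Repeating the operation with the same unit and root/leafdata lets the cached link (and, with persistent
   MPI, its already-initialized requests) be reused instead of rebuilt.
*/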
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink) {
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 since we will use it as a subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;

  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                                                                                /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE;                                                          /* We also force a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }
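
  /*
     Example of the rule above (illustrative): for a PETSCSF_BCAST with op == MPI_REPLACE onto contiguous
     leaves, leafdirect is TRUE and the received data can land directly in leafdata without a separate
     leaf buffer; with, say, op == MPI_SUM the incoming data must first go to a buffer and then be
     reduced into leafdata, so leafdirect is FALSE.
  */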

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on the host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in the cache */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests
           matches the current data. If not, free the old requests. New requests will be lazily init'ed when one calls
           PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from the available list */
        goto found;
      }
    }
  }

  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialize to MPI_REQUEST_NULL so that we know which ones need to be freed in Destroy */

  for (i = 0; i < 2; i++) {     /* Two communication directions */
    for (j = 0; j < 2; j++) {   /* Two memory types */
      for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
      }
    }
  }
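
  /*
     Layout of link->reqs (for reference): the first 8*nrootreqs entries hold root requests and the next
     8*nleafreqs entries hold leaf requests; within each half, block 4*i + 2*j + k covers direction i,
     memory type j and root/leafdirect flag k, i.e. the 2 x 2 x 2 combinations indexed above.
  */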
  link->StartCommunication  = PetscSFLinkStartRequests_MPI;
  link->FinishCommunication = PetscSFLinkWaitRequests_MPI;

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
#if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Set up streams etc. */
#endif
#if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Set up streams etc. */
#endif
#if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
#endif
  }
#endif

  /* Allocate buffers for root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on the host for staging device data in case gpu-aware MPI is not used */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif
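
  /*
     Staging path when GPU-aware MPI is unavailable (see PetscSFLinkStartRequests_MPI/PetscSFLinkWaitRequests_MPI
     above): data is packed into the device buffer, copied device-to-host into the host buffer allocated here,
     and sent from the host buffer; on the receive side it is copied host-to-device after MPI_Waitall and then
     unpacked.
  */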

  /* Set the `current` state of the link; it may change between different SF invocations that reuse the same link */
  if (sf->persistent) { /* If data is passed directly to MPI and used to init MPI requests, record it for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are the keys used to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}