xref: /petsc/src/vec/is/sf/impls/basic/sfmpi.c (revision 34c645fd3b0199e05bec2fcc32d3597bfeb7f4f2)
1 #include <../src/vec/is/sf/impls/basic/sfpack.h>
2 
3 // Though there is no default mechanism to start a communication, we have a
4 // default to finish communication, which is just waiting on the requests.
5 // It should work for both non-blocking or persistent send/recvs or collectivwes.
6 static PetscErrorCode PetscSFLinkFinishCommunication_Default(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
7 {
8   PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
9   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
10   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
11 
12   PetscFunctionBegin;
13   if (bas->nrootreqs) PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
14   if (sf->nleafreqs) PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
15   if (direction == PETSCSF_ROOT2LEAF) {
16     PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
17   } else {
18     PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
19   }
20   PetscFunctionReturn(PETSC_SUCCESS);
21 }
22 
/*
   The routine creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable one, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.

   The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.

   In SFBasic, MPI requests are persistent. They are lazily init'ed when one first gets requests from a link.

   The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
   need pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI*/

  PetscFunctionBegin;
  /* Can we directly use root/leafdata with the given sf, sfop and op?
     Direct use is possible only when the data is contiguous; on the unpack side it additionally
     requires op == MPI_REPLACE (otherwise we must combine received data with existing values via a buffer). */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                                                                                /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE;                                                          /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  // NEVER use root/leafdirect[] for persistent collectives. Otherwise, suppose for the first time, all ranks build
  // a persistent MPI request in a collective call. Then in a second call to PetscSFBcast, one rank uses root/leafdirect
  // but with new rootdata/leafdata pointers. Other ranks keep using the same rootdata/leafdata pointers as last time.
  // Only that rank will try to rebuild the request with a collective call, resulting in hanging. We could call
  // MPI_Allreduce() every time to detect changes in root/leafdata, but that is too expensive for sparse communication.
  // So we always set root/leafdirect[] to false and allocate additional root/leaf buffers for persistent collectives.
  if (sf->persistent && sf->collective) {
    rootdirect[PETSCSF_REMOTE] = PETSC_FALSE;
    leafdirect[PETSCSF_REMOTE] = PETSC_FALSE;
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    /* Without gpu-aware MPI, MPI only ever touches host memory */
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check with MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests matches the current data.
           If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests() with the same tag.
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from available list */
        goto found;
      }
    }
  }

  /* No cached link matched; build a fresh one */
  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

  /* 8 = 2 directions x 2 memory types x 2 direct-use flags; requests for every combination live in one array */
  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  /* Carve link->reqs into the [direction][mtype][direct] views: the first nrootreqs*8 entries are
     root requests, the following nleafreqs*8 entries are leaf requests */
  if (nreqs)
    for (i = 0; i < 2; i++) {     /* Two communication directions */
      for (j = 0; j < 2; j++) {   /* Two memory types */
        for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
          link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
          link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
        }
      }
    }

  link->FinishCommunication = PetscSFLinkFinishCommunication_Default;
  // each SF type could customize their communication by setting function pointers in the link.
  // Currently only BASIC and NEIGHBOR use this abstraction.
  PetscTryTypeMethod(sf, SetCommunicationOps, link);

found:

#if defined(PETSC_HAVE_DEVICE)
  /* Device-side setup (streams etc.) is done once per link, on first use with device data */
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
  #endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* Have to have a separate rootbuf; allocated lazily and kept for reuse */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on host for buffering data on device in case one does not use_gpu_aware_mpi */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  /* Move the link onto the in-use list and hand it back to the caller */
  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(PETSC_SUCCESS);
}
218