// xref: /petsc/src/vec/is/sf/impls/basic/sfmpi.c (revision 0baf8eba40dbc839082666f9f7396a225d6f663c)
#include <../src/vec/is/sf/impls/basic/sfpack.h>

// Though there is no default mechanism to start a communication, we provide a
// default way to finish a communication, which is simply waiting on the requests.
// It works for non-blocking and persistent send/recvs as well as collectives.
static PetscErrorCode PetscSFLinkFinishCommunication_Default(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
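  /* When SF monitoring is enabled, wait on each request individually so that the peer rank, message
     size and tag can be printed as each message completes; otherwise just wait on all requests at once. */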
  if (sf->monitor) {
    PetscMPIInt rank;
    const char *rootaction = (direction == PETSCSF_ROOT2LEAF) ? "sending to  " : "recving from";
    const char *leafaction = (direction == PETSCSF_ROOT2LEAF) ? "recving from" : "sending to  ";
    const char *sfaction   = (direction == PETSCSF_ROOT2LEAF) ? "PetscSFBcast" : "PetscSFReduce";

    PetscCall(PetscPrintf(PETSC_COMM_SELF, "---------------- Begin %s communication -------------------\n", sfaction));
    PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));

    for (PetscMPIInt i = 0; i < bas->nrootreqs; i++) {
      size_t size = (bas->ioffset[i + bas->ndiranks + 1] - bas->ioffset[i + bas->ndiranks]) * link->unitbytes;
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "Rank %6d %s Rank %6d (%16zu bytes) with MPI tag %10d ... ", rank, rootaction, bas->iranks[i + bas->ndiranks], size, link->tag));
      PetscCallMPI(MPI_Wait(link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + i, MPI_STATUS_IGNORE));
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "DONE\n"));
    }
    for (PetscMPIInt i = 0; i < sf->nleafreqs; i++) {
      size_t size = (sf->roffset[i + sf->ndranks + 1] - sf->roffset[i + sf->ndranks]) * link->unitbytes;
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "Rank %6d %s Rank %6d (%16zu bytes) with MPI tag %10d ... ", rank, leafaction, sf->ranks[i + sf->ndranks], size, link->tag));
      PetscCallMPI(MPI_Wait(link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + i, MPI_STATUS_IGNORE));
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "DONE\n"));
    }
    PetscCall(PetscPrintf(PETSC_COMM_SELF, "---------------- End   %s communication -------------------\n\n", sfaction));
  } else {
    if (bas->nrootreqs) PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
    if (sf->nleafreqs) PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  }

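  /* If MPI went through host staging buffers (device data without GPU-aware MPI), copy the freshly
     received data from the host buffer back to the device buffer; otherwise the helpers below are
     expected to do nothing. */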
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
  } else {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   This routine creates a communication link for the given operation. It first looks in the link cache. If
   there is a free and suitable one, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find that root/leafdata
   can be directly passed to MPI, we do not allocate them. Even when we do allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when one does not use GPU-aware MPI but data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, based on the fact that they both deal with sparse graphs and
   need to pack/unpack data. (A rough, hypothetical usage sketch is given at the end of this file.)
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it as a subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  /* Can root/leafdata be used directly (i.e., root/leafdirect) with the given sf, sfop and op? */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                                                                                /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE;                                                          /* We also force a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }
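  /* In short: the side being packed can be used in place only when its entries are contiguous; the side
     being unpacked additionally requires op == MPI_REPLACE, since any other op must combine incoming
     values with existing ones through a separate buffer; FETCH always uses separate buffers. */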

  // NEVER use root/leafdirect[] for persistent collectives. Otherwise, suppose that the first time, all ranks build
  // a persistent MPI request in a collective call. Then in a second call to PetscSFBcast, one rank uses root/leafdirect
  // but with new rootdata/leafdata pointers, while the other ranks keep using the same rootdata/leafdata pointers as last time.
  // Only that one rank will try to rebuild the request with a collective call, resulting in a hang. We could call
  // MPI_Allreduce() every time to detect changes in root/leafdata, but that is too expensive for sparse communication.
  // So we always set root/leafdirect[] to false and allocate additional root/leaf buffers for persistent collectives.
  if (sf->persistent && sf->collective) {
    rootdirect[PETSCSF_REMOTE] = PETSC_FALSE;
    leafdirect[PETSCSF_REMOTE] = PETSC_FALSE;
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for a free link in the cache */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
           If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests() with the same tag.
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove the link from the available list */
        goto found;
      }
    }
  }

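  /* No free compatible link was found in the cache; create and set up a new one */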
  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

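  /* 8 = 2 directions x 2 MPI memory types x 2 (in)direct modes; root requests occupy the first
     8 * nrootreqs slots of link->reqs and leaf requests the remaining 8 * nleafreqs slots */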
  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  if (nreqs)
    for (i = 0; i < 2; i++) {     /* Two communication directions */
      for (j = 0; j < 2; j++) {   /* Two memory types */
        for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
          link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
          link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
        }
      }
    }

  link->FinishCommunication = PetscSFLinkFinishCommunication_Default;
  // Each SF type can customize its communication by setting function pointers in the link.
  // Currently only BASIC and NEIGHBOR use this abstraction.
  PetscTryTypeMethod(sf, SetCommunicationOps, link);

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
  #endif
  }
#endif

  /* Allocate buffers for root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate host buffers for staging device data in case we do not use gpu-aware MPI */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set the `current` state of the link. It may change between different SF invocations reusing the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and used to init MPI requests, record it for comparison in future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are the keys used to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(PETSC_SUCCESS);
}
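
/*
   For illustration only: a rough, hypothetical sketch of how a link produced by PetscSFLinkCreate_MPI()
   is typically driven during a broadcast. The packing and communication-start steps are only described
   in comments here; the real Begin/End routines live in the SF type implementations (e.g., sfbasic.c,
   sfneighbor.c).

     PetscSFLink link;
     // Begin phase: look up or create a link keyed by the unit type, memory types and data pointers
     PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, MPI_REPLACE, PETSCSF_BCAST, &link));
     // ... pack rootdata into link->rootbuf (unless rootdirect) and start the sends/recvs (SF-type specific) ...

     // End phase: wait on the requests and, if host staging buffers were used, copy data back to the device
     PetscCall((*link->FinishCommunication)(sf, link, PETSCSF_ROOT2LEAF));
     // ... unpack link->leafbuf into leafdata (unless leafdirect) and return the link to bas->avail ...
*/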