xref: /petsc/src/vec/is/sf/impls/basic/sfmpi.c (revision f97672e55eacc8688507b9471cd7ec2664d7f203)
1 /* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGHATERV etc have a persistent version,
2    we can also do abstractions like Prepare/StartCommunication.
3 */
4 
5 #include <../src/vec/is/sf/impls/basic/sfpack.h>
6 
7 /* Start MPI requests. If use non-GPU aware MPI, we might need to copy data from device buf to host buf */
8 static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
9 {
10   PetscMPIInt       nreqs;
11   MPI_Request       *reqs = NULL;
12   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
13   PetscInt          buflen;
14 
15   PetscFunctionBegin;
16   buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
17   if (buflen) {
18     if (direction == PETSCSF_ROOT2LEAF) {
19       nreqs = sf->nleafreqs;
20       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,NULL,&reqs));
21     } else { /* leaf to root */
22       nreqs = bas->nrootreqs;
23       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,&reqs,NULL));
24     }
25     PetscCallMPI(MPI_Startall_irecv(buflen,link->unit,nreqs,reqs));
26   }
27 
28   buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
29   if (buflen) {
30     if (direction == PETSCSF_ROOT2LEAF) {
31       nreqs  = bas->nrootreqs;
32       PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host before sending */));
33       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,&reqs,NULL));
34     } else { /* leaf to root */
35       nreqs  = sf->nleafreqs;
36       PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE));
37       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,NULL,&reqs));
38     }
39     PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf,link,direction));
40     PetscCallMPI(MPI_Startall_isend(buflen,link->unit,nreqs,reqs));
41   }
42   PetscFunctionReturn(0);
43 }
44 
45 static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
46 {
47   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
48   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
49   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
50 
51   PetscFunctionBegin;
52   PetscCallMPI(MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE));
53   PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE));
54   if (direction == PETSCSF_ROOT2LEAF) {
55     PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE/* host2device after recving */));
56   } else {
57     PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE));
58   }
59   PetscFunctionReturn(0);
60 }
61 
62 /*
63    The routine Creates a communication link for the given operation. It first looks up its link cache. If
64    there is a free & suitable one, it uses it. Otherwise it creates a new one.
65 
66    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
67    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
68    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
69    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
70 
71    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
72 
73    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
74 
75    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
76    need pack/unpack data.
77 */
78 PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
79 {
80   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
81   PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
82   PetscSFLink       *p,link;
83   PetscSFDirection  direction;
84   MPI_Request       *reqs = NULL;
85   PetscBool         match,rootdirect[2],leafdirect[2];
86   PetscMemType      rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */
87   PetscMemType      leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
88   PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
89   PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
90 
91   PetscFunctionBegin;
92 
93   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
94   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
95     if (sfop == PETSCSF_BCAST) {
96       rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
97       leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
98     } else if (sfop == PETSCSF_REDUCE) {
99       leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
100       rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
101     } else { /* PETSCSF_FETCH */
102       rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
103       leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
104     }
105   }
106 
107   if (sf->use_gpu_aware_mpi) {
108     rootmtype_mpi = rootmtype;
109     leafmtype_mpi = leafmtype;
110   } else {
111     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
112   }
113   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */
114   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
115   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
116 
117   direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
118   nrootreqs = bas->nrootreqs;
119   nleafreqs = sf->nleafreqs;
120 
121   /* Look for free links in cache */
122   for (p=&bas->avail; (link=*p); p=&link->next) {
123     if (!link->use_nvshmem) { /* Only check with MPI links */
124       PetscCall(MPIPetsc_Type_compare(unit,link->unit,&match));
125       if (match) {
126         /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with the current.
127            If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
128         */
129         if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
130           reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
131           for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));}
132           link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
133         }
134         if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
135           reqs = link->leafreqs[direction][leafmtype][1];
136           for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));}
137           link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
138         }
139         *p = link->next; /* Remove from available list */
140         goto found;
141       }
142     }
143   }
144 
145   PetscCall(PetscNew(&link));
146   PetscCall(PetscSFLinkSetUp_Host(sf,link,unit));
147   PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag)); /* One tag per link */
148 
149   nreqs = (nrootreqs+nleafreqs)*8;
150   PetscCall(PetscMalloc1(nreqs,&link->reqs));
151   for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
152 
153   for (i=0; i<2; i++) { /* Two communication directions */
154     for (j=0; j<2; j++) { /* Two memory types */
155       for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
156         link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
157         link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
158       }
159     }
160   }
161   link->StartCommunication    = PetscSFLinkStartRequests_MPI;
162   link->FinishCommunication   = PetscSFLinkWaitRequests_MPI;
163 
164 found:
165 
166 #if defined(PETSC_HAVE_DEVICE)
167   if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
168     #if defined(PETSC_HAVE_CUDA)
169       if (sf->backend == PETSCSF_BACKEND_CUDA)   PetscCall(PetscSFLinkSetUp_CUDA(sf,link,unit)); /* Setup streams etc */
170     #endif
171     #if defined(PETSC_HAVE_HIP)
172       if (sf->backend == PETSCSF_BACKEND_HIP)    PetscCall(PetscSFLinkSetUp_HIP(sf,link,unit)); /* Setup streams etc */
173     #endif
174     #if defined(PETSC_HAVE_KOKKOS)
175       if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf,link,unit));
176     #endif
177   }
178 #endif
179 
180   /* Allocate buffers along root/leafdata */
181   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
182     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
183     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
184     if (bas->rootbuflen[i]) {
185       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
186         link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
187       } else { /* Have to have a separate rootbuf */
188         if (!link->rootbuf_alloc[i][rootmtype]) {
189           PetscCall(PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]));
190         }
191         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
192       }
193     }
194 
195     if (sf->leafbuflen[i]) {
196       if (leafdirect[i]) {
197         link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
198       } else {
199         if (!link->leafbuf_alloc[i][leafmtype]) {
200           PetscCall(PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]));
201         }
202         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
203       }
204     }
205   }
206 
207 #if defined(PETSC_HAVE_DEVICE)
208   /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */
209   if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
210     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
211       PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
212     }
213     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
214   }
215   if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
216     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
217       PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
218     }
219     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
220   }
221 #endif
222 
223   /* Set `current` state of the link. They may change between different SF invocations with the same link */
224   if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
225     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
226     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
227   }
228 
229   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
230   link->leafdata = leafdata;
231   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
232     link->rootdirect[i] = rootdirect[i];
233     link->leafdirect[i] = leafdirect[i];
234   }
235   link->rootdirect_mpi  = rootdirect_mpi;
236   link->leafdirect_mpi  = leafdirect_mpi;
237   link->rootmtype       = rootmtype;
238   link->leafmtype       = leafmtype;
239   link->rootmtype_mpi   = rootmtype_mpi;
240   link->leafmtype_mpi   = leafmtype_mpi;
241 
242   link->next            = bas->inuse;
243   bas->inuse            = link;
244   *mylink               = link;
245   PetscFunctionReturn(0);
246 }
247