/* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc. have persistent versions,
   we can also do abstractions like Prepare/StartCommunication.
*/

#include <../src/vec/is/sf/impls/basic/sfpack.h>

/* Start MPI requests. If we use non-GPU-aware MPI, we might need to copy data from a device buffer to a host buffer */
static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
{
  PetscErrorCode    ierr;
  PetscMPIInt       nreqs;
  MPI_Request       *reqs = NULL;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          buflen;

  PetscFunctionBegin;
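  /* Post receives (first block below) before starting sends (second block), the usual MPI practice
     so that incoming messages can be matched against posted receive buffers instead of queueing as
     unexpected messages */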
  buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = sf->nleafreqs;
      ierr = PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,NULL,&reqs);CHKERRQ(ierr);
    } else { /* leaf to root */
      nreqs = bas->nrootreqs;
      ierr = PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,&reqs,NULL);CHKERRQ(ierr);
    }
    ierr = MPI_Startall_irecv(buflen,link->unit,nreqs,reqs);CHKERRMPI(ierr);
  }

  buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs  = bas->nrootreqs;
      ierr   = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/* device2host before sending */);CHKERRQ(ierr);
      ierr   = PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,&reqs,NULL);CHKERRQ(ierr);
    } else { /* leaf to root */
      nreqs  = sf->nleafreqs;
      ierr   = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
      ierr   = PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,NULL,&reqs);CHKERRQ(ierr);
    }
    ierr = PetscSFLinkSyncStreamBeforeCallMPI(sf,link,direction);CHKERRQ(ierr);
    ierr = MPI_Startall_isend(buflen,link->unit,nreqs,reqs);CHKERRMPI(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
{
  PetscErrorCode       ierr;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
  ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
  if (direction == PETSCSF_ROOT2LEAF) {
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE/* host2device after receiving */);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   This routine creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable one, it uses that. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when one does not use GPU-aware MPI but data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not initialized until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and
   need to pack/unpack data.
*/
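
/* A sketch of how a caller might drive a link for a bcast. The Start/Finish members are assigned
   later in this file; the pack/unpack steps are assumptions standing in for the helpers in sfpack.h
   and are only indicative:

     PetscSFLink link;
     ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,MPI_REPLACE,PETSCSF_BCAST,&link);CHKERRQ(ierr);
     // pack rootdata into link->rootbuf (skipped when rootdirect lets us alias rootdata)
     ierr = (*link->StartCommunication)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);  // PetscSFLinkStartRequests_MPI
     ierr = (*link->FinishCommunication)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr); // PetscSFLinkWaitRequests_MPI
     // unpack link->leafbuf into leafdata, then return the link to bas->avail
*/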
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode    ierr;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink       *p,link;
  PetscSFDirection  direction;
  MPI_Request       *reqs = NULL;
  PetscBool         match,rootdirect[2],leafdirect[2];
  PetscMemType      rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 since we will use it as an array subscript */
  PetscMemType      leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;

  /* Can we directly use root/leafdata with the given sf, sfop and op? */
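  /* Here "direct" means MPI reads/writes user data in place with no intermediate buffer. The packing
     side can be direct whenever the data is contiguous; the unpacking side only with MPI_REPLACE,
     where unpacking degenerates to a plain copy, since any other op must combine received values from a buffer */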
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;
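  /* When rootdirect_mpi/leafdirect_mpi is 1, persistent MPI requests get init'ed with the user's data
     pointer; that is why the pointer is recorded near the end of this routine, and why the cache lookup
     below frees stale requests when a later call passes different data */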

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI
           requests matches the current data. If not, free the old requests. New requests will be lazily
           init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from the available list */
        goto found;
      }
    }
  }

  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to MPI_REQUEST_NULL so that we know which requests need to be freed in Destroy */

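  /* Layout of link->reqs: the first nrootreqs*8 entries are root requests and the remaining nleafreqs*8
     are leaf requests. Each part is split into 8 blocks indexed by 4*i+2*j+k for direction i, memory
     type j and direct flag k, matching lookups like link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]
     elsewhere in this file */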
  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }
  link->StartCommunication    = PetscSFLinkStartRequests_MPI;
  link->FinishCommunication   = PetscSFLinkWaitRequests_MPI;

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
    #if defined(PETSC_HAVE_CUDA)
      if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_CUDA(sf,link,unit);CHKERRQ(ierr);} /* Set up streams etc. */
    #endif
    #if defined(PETSC_HAVE_HIP)
      if (sf->backend == PETSCSF_BACKEND_HIP)    {ierr = PetscSFLinkSetUp_HIP(sf,link,unit);CHKERRQ(ierr);} /* Set up streams etc. */
    #endif
    #if defined(PETSC_HAVE_KOKKOS)
      if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
    #endif
  }
#endif

  /* Allocate buffers along with root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on the host for staging device data in case one does not use_gpu_aware_mpi */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set the current state of the link; it may change between different SF invocations that reuse the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and used to init MPI requests, record it for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi  = rootdirect_mpi;
  link->leafdirect_mpi  = leafdirect_mpi;
  link->rootmtype       = rootmtype;
  link->leafmtype       = leafmtype;
  link->rootmtype_mpi   = rootmtype_mpi;
  link->leafmtype_mpi   = leafmtype_mpi;

  link->next            = bas->inuse;
  bas->inuse            = link;
  *mylink               = link;
  PetscFunctionReturn(0);
}