xref: /petsc/src/vec/is/sf/impls/basic/sfmpi.c (revision f5d27ee7dde11c933a8c6f6ef7ab9e5456705271)
1 #include <../src/vec/is/sf/impls/basic/sfpack.h>
2 
// Though there is no default mechanism to start a communication, we have a
// default way to finish communication, which is just waiting on the requests.
// It should work for both non-blocking and persistent send/recvs or collectives.
6 static PetscErrorCode PetscSFLinkFinishCommunication_Default(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
7 {
8   PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
9   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
10   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
11 
12   PetscFunctionBegin;
13   if (bas->nrootreqs) PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
14   if (sf->nleafreqs) PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
15   if (direction == PETSCSF_ROOT2LEAF) {
16     PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
17   } else {
18     PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
19   }
20   PetscFunctionReturn(PETSC_SUCCESS);
21 }
22 
/*
   The routine creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable one, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we won't allocate them. Even when we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`; in other words, we do lazy memory-allocation.

   The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor based on the fact they both deal with sparse graphs and
   need to pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;

  /* Can we directly use root/leafdata with the given sf, sfop and op?
     "direct" means MPI can send from / receive into the user array itself (no pack/unpack
     buffer), which requires the remote entries to be contiguous and, on the unpack side,
     op == MPI_REPLACE (otherwise we must combine received values with existing ones). */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                                                                                /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always need a separate rootbuf */
      leafdirect[i] = PETSC_FALSE;                                                          /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
    }
  }

  /* Decide which memory type MPI will actually touch: device pointers can be handed to MPI
     only with gpu-aware MPI; otherwise everything MPI sees is staged on the host. */
  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache; a link is reusable iff its MPI datatype matches `unit` */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check with MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with the current.
           If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
           (Persistent requests are bound to a fixed buffer address, so a different user array
           invalidates them.)
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from available list */
        goto found;
      }
    }
  }

  /* No suitable cached link: build a new one */
  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

  /* 8 = 2 directions x 2 mtypes x 2 direct flags; one flat array holds requests for every combination */
  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  /* Carve the flat request array into per-(direction, mtype, direct) sub-arrays:
     root requests occupy the first 8*nrootreqs slots, leaf requests the rest */
  if (nreqs)
    for (i = 0; i < 2; i++) {     /* Two communication directions */
      for (j = 0; j < 2; j++) {   /* Two memory types */
        for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
          link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
          link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
        }
      }
    }

  link->FinishCommunication = PetscSFLinkFinishCommunication_Default;
  // each SF type could customize their communication by setting function pointers in the link.
  // Currently only BASIC and NEIGHBOR use this abstraction.
  PetscTryTypeMethod(sf, SetCommunicationOps, link);

found:

#if defined(PETSC_HAVE_DEVICE)
  /* Lazily set up device support (streams etc.) the first time device data is involved */
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
  #endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on host for buffering data on device in case not use_gpu_aware_mpi */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set `current` state of the link. They may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  /* Move the link onto the in-use list and hand it back to the caller */
  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(PETSC_SUCCESS);
}
208