xref: /petsc/src/vec/is/sf/impls/basic/sfmpi.c (revision 48a46eb9bd028bec07ec0f396b1a3abb43f14558)
/* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc. have a persistent version,
   we can also do abstractions like Prepare/StartCommunication.
*/
471438e86SJunchao Zhang 
571438e86SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
671438e86SJunchao Zhang 
771438e86SJunchao Zhang /* Start MPI requests. If use non-GPU aware MPI, we might need to copy data from device buf to host buf */
89371c9d4SSatish Balay static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction) {
971438e86SJunchao Zhang   PetscMPIInt    nreqs;
1071438e86SJunchao Zhang   MPI_Request   *reqs = NULL;
1171438e86SJunchao Zhang   PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
1271438e86SJunchao Zhang   PetscInt       buflen;
1371438e86SJunchao Zhang 
1471438e86SJunchao Zhang   PetscFunctionBegin;
1571438e86SJunchao Zhang   buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
1671438e86SJunchao Zhang   if (buflen) {
1771438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
1871438e86SJunchao Zhang       nreqs = sf->nleafreqs;
199566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
2071438e86SJunchao Zhang     } else { /* leaf to root */
2171438e86SJunchao Zhang       nreqs = bas->nrootreqs;
229566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
2371438e86SJunchao Zhang     }
249566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Startall_irecv(buflen, link->unit, nreqs, reqs));
2571438e86SJunchao Zhang   }
2671438e86SJunchao Zhang 
2771438e86SJunchao Zhang   buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
2871438e86SJunchao Zhang   if (buflen) {
2971438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
3071438e86SJunchao Zhang       nreqs = bas->nrootreqs;
319566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /*device2host before sending */));
329566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
3371438e86SJunchao Zhang     } else { /* leaf to root */
3471438e86SJunchao Zhang       nreqs = sf->nleafreqs;
359566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE));
369566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
3771438e86SJunchao Zhang     }
389566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction));
399566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Startall_isend(buflen, link->unit, nreqs, reqs));
4071438e86SJunchao Zhang   }
4171438e86SJunchao Zhang   PetscFunctionReturn(0);
4271438e86SJunchao Zhang }
4371438e86SJunchao Zhang 
449371c9d4SSatish Balay static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction) {
4571438e86SJunchao Zhang   PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
4671438e86SJunchao Zhang   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
4771438e86SJunchao Zhang   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
4871438e86SJunchao Zhang 
4971438e86SJunchao Zhang   PetscFunctionBegin;
509566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
519566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
5271438e86SJunchao Zhang   if (direction == PETSCSF_ROOT2LEAF) {
539566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
5471438e86SJunchao Zhang   } else {
559566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
5671438e86SJunchao Zhang   }
5771438e86SJunchao Zhang   PetscFunctionReturn(0);
5871438e86SJunchao Zhang }
5971438e86SJunchao Zhang 
6071438e86SJunchao Zhang /*
6171438e86SJunchao Zhang    The routine Creates a communication link for the given operation. It first looks up its link cache. If
6271438e86SJunchao Zhang    there is a free & suitable one, it uses it. Otherwise it creates a new one.
6371438e86SJunchao Zhang 
6471438e86SJunchao Zhang    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
6571438e86SJunchao Zhang    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
6671438e86SJunchao Zhang    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
6771438e86SJunchao Zhang    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
6871438e86SJunchao Zhang 
6971438e86SJunchao Zhang    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
7071438e86SJunchao Zhang 
7171438e86SJunchao Zhang    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
7271438e86SJunchao Zhang 
7371438e86SJunchao Zhang    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
7471438e86SJunchao Zhang    need pack/unpack data.
7571438e86SJunchao Zhang */
769371c9d4SSatish Balay PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink) {
7771438e86SJunchao Zhang   PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
7871438e86SJunchao Zhang   PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
7971438e86SJunchao Zhang   PetscSFLink     *p, link;
8071438e86SJunchao Zhang   PetscSFDirection direction;
8171438e86SJunchao Zhang   MPI_Request     *reqs = NULL;
8271438e86SJunchao Zhang   PetscBool        match, rootdirect[2], leafdirect[2];
8371438e86SJunchao Zhang   PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */
8471438e86SJunchao Zhang   PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
8571438e86SJunchao Zhang   PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
8671438e86SJunchao Zhang   PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI*/
8771438e86SJunchao Zhang 
8871438e86SJunchao Zhang   PetscFunctionBegin;
8971438e86SJunchao Zhang 
9071438e86SJunchao Zhang   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
9171438e86SJunchao Zhang   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
9271438e86SJunchao Zhang     if (sfop == PETSCSF_BCAST) {
9371438e86SJunchao Zhang       rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
9471438e86SJunchao Zhang       leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
9571438e86SJunchao Zhang     } else if (sfop == PETSCSF_REDUCE) {
9671438e86SJunchao Zhang       leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
9771438e86SJunchao Zhang       rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
9871438e86SJunchao Zhang     } else {                                                                                /* PETSCSF_FETCH */
9971438e86SJunchao Zhang       rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always need a separate rootbuf */
10071438e86SJunchao Zhang       leafdirect[i] = PETSC_FALSE;                                                          /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
10171438e86SJunchao Zhang     }
10271438e86SJunchao Zhang   }
10371438e86SJunchao Zhang 
10471438e86SJunchao Zhang   if (sf->use_gpu_aware_mpi) {
10571438e86SJunchao Zhang     rootmtype_mpi = rootmtype;
10671438e86SJunchao Zhang     leafmtype_mpi = leafmtype;
10771438e86SJunchao Zhang   } else {
10871438e86SJunchao Zhang     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
10971438e86SJunchao Zhang   }
11071438e86SJunchao Zhang   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */
11171438e86SJunchao Zhang   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
11271438e86SJunchao Zhang   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;
11371438e86SJunchao Zhang 
11471438e86SJunchao Zhang   direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
11571438e86SJunchao Zhang   nrootreqs = bas->nrootreqs;
11671438e86SJunchao Zhang   nleafreqs = sf->nleafreqs;
11771438e86SJunchao Zhang 
11871438e86SJunchao Zhang   /* Look for free links in cache */
11971438e86SJunchao Zhang   for (p = &bas->avail; (link = *p); p = &link->next) {
12071438e86SJunchao Zhang     if (!link->use_nvshmem) { /* Only check with MPI links */
1219566063dSJacob Faibussowitsch       PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
12271438e86SJunchao Zhang       if (match) {
12371438e86SJunchao Zhang         /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with the current.
12471438e86SJunchao Zhang            If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
12571438e86SJunchao Zhang         */
12671438e86SJunchao Zhang         if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
12771438e86SJunchao Zhang           reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
1289371c9d4SSatish Balay           for (i = 0; i < nrootreqs; i++) {
1299371c9d4SSatish Balay             if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
1309371c9d4SSatish Balay           }
13171438e86SJunchao Zhang           link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
13271438e86SJunchao Zhang         }
13371438e86SJunchao Zhang         if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
13471438e86SJunchao Zhang           reqs = link->leafreqs[direction][leafmtype][1];
1359371c9d4SSatish Balay           for (i = 0; i < nleafreqs; i++) {
1369371c9d4SSatish Balay             if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
1379371c9d4SSatish Balay           }
13871438e86SJunchao Zhang           link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
13971438e86SJunchao Zhang         }
14071438e86SJunchao Zhang         *p = link->next; /* Remove from available list */
14171438e86SJunchao Zhang         goto found;
14271438e86SJunchao Zhang       }
14371438e86SJunchao Zhang     }
14471438e86SJunchao Zhang   }
14571438e86SJunchao Zhang 
1469566063dSJacob Faibussowitsch   PetscCall(PetscNew(&link));
1479566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
1489566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */
14971438e86SJunchao Zhang 
15071438e86SJunchao Zhang   nreqs = (nrootreqs + nleafreqs) * 8;
1519566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nreqs, &link->reqs));
15271438e86SJunchao Zhang   for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
15371438e86SJunchao Zhang 
15471438e86SJunchao Zhang   for (i = 0; i < 2; i++) {     /* Two communication directions */
15571438e86SJunchao Zhang     for (j = 0; j < 2; j++) {   /* Two memory types */
15671438e86SJunchao Zhang       for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
15771438e86SJunchao Zhang         link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
15871438e86SJunchao Zhang         link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
15971438e86SJunchao Zhang       }
16071438e86SJunchao Zhang     }
16171438e86SJunchao Zhang   }
16271438e86SJunchao Zhang   link->StartCommunication  = PetscSFLinkStartRequests_MPI;
16371438e86SJunchao Zhang   link->FinishCommunication = PetscSFLinkWaitRequests_MPI;
16471438e86SJunchao Zhang 
16571438e86SJunchao Zhang found:
16671438e86SJunchao Zhang 
16771438e86SJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
16871438e86SJunchao Zhang   if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
16971438e86SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1709566063dSJacob Faibussowitsch     if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
17171438e86SJunchao Zhang #endif
17271438e86SJunchao Zhang #if defined(PETSC_HAVE_HIP)
1739566063dSJacob Faibussowitsch     if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
17471438e86SJunchao Zhang #endif
17571438e86SJunchao Zhang #if defined(PETSC_HAVE_KOKKOS)
1769566063dSJacob Faibussowitsch     if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
17771438e86SJunchao Zhang #endif
17871438e86SJunchao Zhang   }
17971438e86SJunchao Zhang #endif
18071438e86SJunchao Zhang 
18171438e86SJunchao Zhang   /* Allocate buffers along root/leafdata */
18271438e86SJunchao Zhang   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
18371438e86SJunchao Zhang     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
18471438e86SJunchao Zhang     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
18571438e86SJunchao Zhang     if (bas->rootbuflen[i]) {
18671438e86SJunchao Zhang       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
18771438e86SJunchao Zhang         link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
18871438e86SJunchao Zhang       } else { /* Have to have a separate rootbuf */
189*48a46eb9SPierre Jolivet         if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
19071438e86SJunchao Zhang         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
19171438e86SJunchao Zhang       }
19271438e86SJunchao Zhang     }
19371438e86SJunchao Zhang 
19471438e86SJunchao Zhang     if (sf->leafbuflen[i]) {
19571438e86SJunchao Zhang       if (leafdirect[i]) {
19671438e86SJunchao Zhang         link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
19771438e86SJunchao Zhang       } else {
198*48a46eb9SPierre Jolivet         if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
19971438e86SJunchao Zhang         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
20071438e86SJunchao Zhang       }
20171438e86SJunchao Zhang     }
20271438e86SJunchao Zhang   }
20371438e86SJunchao Zhang 
20471438e86SJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
20571438e86SJunchao Zhang   /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */
20671438e86SJunchao Zhang   if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
207*48a46eb9SPierre Jolivet     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
20871438e86SJunchao Zhang     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
20971438e86SJunchao Zhang   }
21071438e86SJunchao Zhang   if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
211*48a46eb9SPierre Jolivet     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
21271438e86SJunchao Zhang     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
21371438e86SJunchao Zhang   }
21471438e86SJunchao Zhang #endif
21571438e86SJunchao Zhang 
21671438e86SJunchao Zhang   /* Set `current` state of the link. They may change between different SF invocations with the same link */
21771438e86SJunchao Zhang   if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
21871438e86SJunchao Zhang     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
21971438e86SJunchao Zhang     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
22071438e86SJunchao Zhang   }
22171438e86SJunchao Zhang 
22271438e86SJunchao Zhang   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
22371438e86SJunchao Zhang   link->leafdata = leafdata;
22471438e86SJunchao Zhang   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
22571438e86SJunchao Zhang     link->rootdirect[i] = rootdirect[i];
22671438e86SJunchao Zhang     link->leafdirect[i] = leafdirect[i];
22771438e86SJunchao Zhang   }
22871438e86SJunchao Zhang   link->rootdirect_mpi = rootdirect_mpi;
22971438e86SJunchao Zhang   link->leafdirect_mpi = leafdirect_mpi;
23071438e86SJunchao Zhang   link->rootmtype      = rootmtype;
23171438e86SJunchao Zhang   link->leafmtype      = leafmtype;
23271438e86SJunchao Zhang   link->rootmtype_mpi  = rootmtype_mpi;
23371438e86SJunchao Zhang   link->leafmtype_mpi  = leafmtype_mpi;
23471438e86SJunchao Zhang 
23571438e86SJunchao Zhang   link->next = bas->inuse;
23671438e86SJunchao Zhang   bas->inuse = link;
23771438e86SJunchao Zhang   *mylink    = link;
23871438e86SJunchao Zhang   PetscFunctionReturn(0);
23971438e86SJunchao Zhang }
240