xref: /petsc/src/sys/utils/mpishm.c (revision 2c71b3e237ead271e4f3aa1505f92bf476e3413d)
15f7487a0SJunchao Zhang #include <petscsys.h>        /*I  "petscsys.h"  I*/
25f7487a0SJunchao Zhang #include <petsc/private/petscimpl.h>
35f7487a0SJunchao Zhang 
45f7487a0SJunchao Zhang struct _n_PetscShmComm {
55f7487a0SJunchao Zhang   PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
65f7487a0SJunchao Zhang   PetscMPIInt shmsize;          /* size of the shared memory communicator */
75f7487a0SJunchao Zhang   MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
85f7487a0SJunchao Zhang };
95f7487a0SJunchao Zhang 
105f7487a0SJunchao Zhang /*
1133779a13SJunchao Zhang    Private routine to delete internal shared memory communicator when a communicator is freed.
125f7487a0SJunchao Zhang 
135f7487a0SJunchao Zhang    This is called by MPI, not by users; MPI_Comm_free() invokes it when the communicator that has this data as an attribute is freed.
145f7487a0SJunchao Zhang 
155f7487a0SJunchao Zhang    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
165f7487a0SJunchao Zhang 
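   A sketch of how such a callback gets wired in: PETSc registers it (elsewhere, during initialization)
   roughly as below; the copy function shown is the standard MPI no-op and is illustrative here:

      MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,Petsc_ShmComm_Attr_Delete_Fn,&Petsc_ShmComm_keyval,NULL);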
175f7487a0SJunchao Zhang */
1833779a13SJunchao Zhang PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
195f7487a0SJunchao Zhang {
205f7487a0SJunchao Zhang   PetscErrorCode  ierr;
215f7487a0SJunchao Zhang   PetscShmComm p = (PetscShmComm)val;
225f7487a0SJunchao Zhang 
235f7487a0SJunchao Zhang   PetscFunctionBegin;
247d3de750SJacob Faibussowitsch   ierr = PetscInfo(NULL,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
255f7487a0SJunchao Zhang   ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
265f7487a0SJunchao Zhang   ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
275f7487a0SJunchao Zhang   ierr = PetscFree(val);CHKERRMPI(ierr);
285f7487a0SJunchao Zhang   PetscFunctionReturn(MPI_SUCCESS);
295f7487a0SJunchao Zhang }
305f7487a0SJunchao Zhang 
31b48189acSJunchao Zhang #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
32b48189acSJunchao Zhang /* Data structures to support freeing comms created in PetscShmCommGet().
33b48189acSJunchao Zhang   Since we expect communicators passed to PetscShmCommGet() to almost always be
34b48189acSJunchao Zhang   either a petsc inner communicator or an MPI communicator with a linked petsc
35b48189acSJunchao Zhang   inner communicator, we use a simple static array to store the communicators
36b48189acSJunchao Zhang   dupped in the rare remaining cases.
37b48189acSJunchao Zhang  */
38b48189acSJunchao Zhang #define MAX_SHMCOMM_DUPPED_COMMS 16
39b48189acSJunchao Zhang static PetscInt       num_dupped_comms=0;
40b48189acSJunchao Zhang static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
41b48189acSJunchao Zhang static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
42b48189acSJunchao Zhang {
43b48189acSJunchao Zhang   PetscErrorCode   ierr;
44b48189acSJunchao Zhang   PetscInt         i;
45b48189acSJunchao Zhang   PetscFunctionBegin;
46b48189acSJunchao Zhang   for (i=0; i<num_dupped_comms; i++) {ierr = PetscCommDestroy(&shmcomm_dupped_comms[i]);CHKERRQ(ierr);}
47b48189acSJunchao Zhang   num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
48b48189acSJunchao Zhang   PetscFunctionReturn(0);
49b48189acSJunchao Zhang }
50b48189acSJunchao Zhang #endif
51b48189acSJunchao Zhang 
525f7487a0SJunchao Zhang /*@C
53b48189acSJunchao Zhang     PetscShmCommGet - Given a communicator returns a sub-communicator of all the ranks that share common memory
545f7487a0SJunchao Zhang 
55d083f849SBarry Smith     Collective.
565f7487a0SJunchao Zhang 
575f7487a0SJunchao Zhang     Input Parameter:
58b48189acSJunchao Zhang .   globcomm - MPI_Comm, which can be a user MPI_Comm or a PETSc inner MPI_Comm
595f7487a0SJunchao Zhang 
605f7487a0SJunchao Zhang     Output Parameter:
615f7487a0SJunchao Zhang .   pshmcomm - the PETSc shared memory communicator object
625f7487a0SJunchao Zhang 
635f7487a0SJunchao Zhang     Level: developer
645f7487a0SJunchao Zhang 
655f7487a0SJunchao Zhang     Notes:
665f7487a0SJunchao Zhang        When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
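
    Example Usage:
    A minimal sketch, mirroring how PetscOmpCtrlCreate() below obtains the MPI shared memory communicator:

      PetscShmComm pshmcomm;
      MPI_Comm     shmcomm;
      ierr = PetscShmCommGet(PETSC_COMM_WORLD,&pshmcomm);CHKERRQ(ierr);
      ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);CHKERRQ(ierr);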
675f7487a0SJunchao Zhang 
685f7487a0SJunchao Zhang @*/
695f7487a0SJunchao Zhang PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
705f7487a0SJunchao Zhang {
715f7487a0SJunchao Zhang #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
725f7487a0SJunchao Zhang   PetscErrorCode   ierr;
735f7487a0SJunchao Zhang   MPI_Group        globgroup,shmgroup;
745f7487a0SJunchao Zhang   PetscMPIInt      *shmranks,i,flg;
755f7487a0SJunchao Zhang   PetscCommCounter *counter;
765f7487a0SJunchao Zhang 
775f7487a0SJunchao Zhang   PetscFunctionBegin;
783ca90d2dSJacob Faibussowitsch   PetscValidPointer(pshmcomm,2);
79b48189acSJunchao Zhang   /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
80ffc4695bSBarry Smith   ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRMPI(ierr);
81b48189acSJunchao Zhang   if (!flg) { /* globcomm is not a petsc comm */
82b48189acSJunchao Zhang     union {MPI_Comm comm; void *ptr;} ucomm;
83b48189acSJunchao Zhang     /* check if globcomm already has a linked petsc inner comm */
84b48189acSJunchao Zhang     ierr = MPI_Comm_get_attr(globcomm,Petsc_InnerComm_keyval,&ucomm,&flg);CHKERRMPI(ierr);
85b48189acSJunchao Zhang     if (!flg) {
86b48189acSJunchao Zhang       /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
87*2c71b3e2SJacob Faibussowitsch       PetscCheckFalse(num_dupped_comms >= MAX_SHMCOMM_DUPPED_COMMS,globcomm,PETSC_ERR_PLIB,"PetscShmCommGet() is trying to dup more than %d MPI_Comms",MAX_SHMCOMM_DUPPED_COMMS);
88b48189acSJunchao Zhang       ierr = PetscCommDuplicate(globcomm,&globcomm,NULL);CHKERRQ(ierr);
89b48189acSJunchao Zhang       /* The first time through, register a function to free the dupped petsc comms at PetscFinalize() */
90b48189acSJunchao Zhang       if (num_dupped_comms == 0) {ierr = PetscRegisterFinalize(PetscShmCommDestroyDuppedComms);CHKERRQ(ierr);}
91b48189acSJunchao Zhang       shmcomm_dupped_comms[num_dupped_comms] = globcomm;
92b48189acSJunchao Zhang       num_dupped_comms++;
93b48189acSJunchao Zhang     } else {
94b48189acSJunchao Zhang       /* otherwise, we pull out the inner comm and use it as globcomm */
95b48189acSJunchao Zhang       globcomm = ucomm.comm;
96b48189acSJunchao Zhang     }
97b48189acSJunchao Zhang   }
985f7487a0SJunchao Zhang 
99b48189acSJunchao Zhang   /* Check if globcomm already has an attached pshmcomm. If not, create one */
100ffc4695bSBarry Smith   ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRMPI(ierr);
1015f7487a0SJunchao Zhang   if (flg) PetscFunctionReturn(0);
1025f7487a0SJunchao Zhang 
1035f7487a0SJunchao Zhang   ierr        = PetscNew(pshmcomm);CHKERRQ(ierr);
1045f7487a0SJunchao Zhang   (*pshmcomm)->globcomm = globcomm;
1055f7487a0SJunchao Zhang 
106ffc4695bSBarry Smith   ierr = MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRMPI(ierr);
1075f7487a0SJunchao Zhang 
108ffc4695bSBarry Smith   ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRMPI(ierr);
109ffc4695bSBarry Smith   ierr = MPI_Comm_group(globcomm, &globgroup);CHKERRMPI(ierr);
110ffc4695bSBarry Smith   ierr = MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);CHKERRMPI(ierr);
1115f7487a0SJunchao Zhang   ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
1125f7487a0SJunchao Zhang   ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
1135f7487a0SJunchao Zhang   for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
114ffc4695bSBarry Smith   ierr = MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);CHKERRMPI(ierr);
1155f7487a0SJunchao Zhang   ierr = PetscFree(shmranks);CHKERRQ(ierr);
116ffc4695bSBarry Smith   ierr = MPI_Group_free(&globgroup);CHKERRMPI(ierr);
117ffc4695bSBarry Smith   ierr = MPI_Group_free(&shmgroup);CHKERRMPI(ierr);
1185f7487a0SJunchao Zhang 
1195f7487a0SJunchao Zhang   for (i=0; i<(*pshmcomm)->shmsize; i++) {
1207d3de750SJacob Faibussowitsch     ierr = PetscInfo(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
1215f7487a0SJunchao Zhang   }
122ffc4695bSBarry Smith   ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRMPI(ierr);
1235f7487a0SJunchao Zhang   PetscFunctionReturn(0);
1245f7487a0SJunchao Zhang #else
1255f7487a0SJunchao Zhang   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
1265f7487a0SJunchao Zhang #endif
1275f7487a0SJunchao Zhang }
1285f7487a0SJunchao Zhang 
1295f7487a0SJunchao Zhang /*@C
1305f7487a0SJunchao Zhang     PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator
1315f7487a0SJunchao Zhang 
1325f7487a0SJunchao Zhang     Input Parameters:
1335f7487a0SJunchao Zhang +   pshmcomm - the shared memory communicator object
1345f7487a0SJunchao Zhang -   grank    - the global rank
1355f7487a0SJunchao Zhang 
1365f7487a0SJunchao Zhang     Output Parameter:
1375f7487a0SJunchao Zhang .   lrank - the local rank, or MPI_PROC_NULL if it does not exist
1385f7487a0SJunchao Zhang 
1395f7487a0SJunchao Zhang     Level: developer
1405f7487a0SJunchao Zhang 
1415f7487a0SJunchao Zhang     Developer Notes:
1425f7487a0SJunchao Zhang     Assumes the pshmcomm->globranks[] is sorted
1435f7487a0SJunchao Zhang 
1445f7487a0SJunchao Zhang     It may be better to rewrite this to map multiple global ranks to local in the same function call
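
    Example Usage:
    A sketch only; pshmcomm is assumed to come from PetscShmCommGet() and r is some global rank of
    interest. lrank is MPI_PROC_NULL if rank r does not share memory with the calling process, and
    mapping a valid lrank back with PetscShmCommLocalToGlobal() recovers r:

      PetscMPIInt lrank,g;
      ierr = PetscShmCommGlobalToLocal(pshmcomm,r,&lrank);CHKERRQ(ierr);
      if (lrank != MPI_PROC_NULL) {ierr = PetscShmCommLocalToGlobal(pshmcomm,lrank,&g);CHKERRQ(ierr);}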
1455f7487a0SJunchao Zhang 
1465f7487a0SJunchao Zhang @*/
1475f7487a0SJunchao Zhang PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
1485f7487a0SJunchao Zhang {
1495f7487a0SJunchao Zhang   PetscMPIInt    low,high,t,i;
1505f7487a0SJunchao Zhang   PetscBool      flg = PETSC_FALSE;
1515f7487a0SJunchao Zhang   PetscErrorCode ierr;
1525f7487a0SJunchao Zhang 
1535f7487a0SJunchao Zhang   PetscFunctionBegin;
1543ca90d2dSJacob Faibussowitsch   PetscValidPointer(pshmcomm,1);
1553ca90d2dSJacob Faibussowitsch   PetscValidPointer(lrank,3);
1565f7487a0SJunchao Zhang   *lrank = MPI_PROC_NULL;
1575f7487a0SJunchao Zhang   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
1585f7487a0SJunchao Zhang   if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
1595f7487a0SJunchao Zhang   ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
1605f7487a0SJunchao Zhang   if (flg) PetscFunctionReturn(0);
1615f7487a0SJunchao Zhang   low  = 0;
1625f7487a0SJunchao Zhang   high = pshmcomm->shmsize;
1635f7487a0SJunchao Zhang   while (high-low > 5) {
1645f7487a0SJunchao Zhang     t = (low+high)/2;
1655f7487a0SJunchao Zhang     if (pshmcomm->globranks[t] > grank) high = t;
1665f7487a0SJunchao Zhang     else low = t;
1675f7487a0SJunchao Zhang   }
1685f7487a0SJunchao Zhang   for (i=low; i<high; i++) {
1695f7487a0SJunchao Zhang     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
1705f7487a0SJunchao Zhang     if (pshmcomm->globranks[i] == grank) {
1715f7487a0SJunchao Zhang       *lrank = i;
1725f7487a0SJunchao Zhang       PetscFunctionReturn(0);
1735f7487a0SJunchao Zhang     }
1745f7487a0SJunchao Zhang   }
1755f7487a0SJunchao Zhang   PetscFunctionReturn(0);
1765f7487a0SJunchao Zhang }
1775f7487a0SJunchao Zhang 
1785f7487a0SJunchao Zhang /*@C
1795f7487a0SJunchao Zhang     PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank
1805f7487a0SJunchao Zhang 
1815f7487a0SJunchao Zhang     Input Parameters:
1825f7487a0SJunchao Zhang +   pshmcomm - the shared memory communicator object
1835f7487a0SJunchao Zhang -   lrank    - the local rank in the shared memory communicator
1845f7487a0SJunchao Zhang 
1855f7487a0SJunchao Zhang     Output Parameter:
1865f7487a0SJunchao Zhang .   grank - the global rank in the global communicator where the shared memory communicator is built
1875f7487a0SJunchao Zhang 
1885f7487a0SJunchao Zhang     Level: developer
1895f7487a0SJunchao Zhang 
1905f7487a0SJunchao Zhang @*/
1915f7487a0SJunchao Zhang PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
1925f7487a0SJunchao Zhang {
1935f7487a0SJunchao Zhang   PetscFunctionBegin;
1943ca90d2dSJacob Faibussowitsch   PetscValidPointer(pshmcomm,1);
1953ca90d2dSJacob Faibussowitsch   PetscValidPointer(grank,3);
196*2c71b3e2SJacob Faibussowitsch   PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
1975f7487a0SJunchao Zhang   *grank = pshmcomm->globranks[lrank];
1985f7487a0SJunchao Zhang   PetscFunctionReturn(0);
1995f7487a0SJunchao Zhang }
2005f7487a0SJunchao Zhang 
2015f7487a0SJunchao Zhang /*@C
2025f7487a0SJunchao Zhang     PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
2035f7487a0SJunchao Zhang 
2045f7487a0SJunchao Zhang     Input Parameter:
2055f7487a0SJunchao Zhang .   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
2065f7487a0SJunchao Zhang 
2075f7487a0SJunchao Zhang     Output Parameter:
2085f7487a0SJunchao Zhang .   comm     - the MPI communicator
2095f7487a0SJunchao Zhang 
2105f7487a0SJunchao Zhang     Level: developer
2115f7487a0SJunchao Zhang 
2125f7487a0SJunchao Zhang @*/
2135f7487a0SJunchao Zhang PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
2145f7487a0SJunchao Zhang {
2155f7487a0SJunchao Zhang   PetscFunctionBegin;
2163ca90d2dSJacob Faibussowitsch   PetscValidPointer(pshmcomm,1);
2173ca90d2dSJacob Faibussowitsch   PetscValidPointer(comm,2);
2185f7487a0SJunchao Zhang   *comm = pshmcomm->shmcomm;
2195f7487a0SJunchao Zhang   PetscFunctionReturn(0);
2205f7487a0SJunchao Zhang }
2215f7487a0SJunchao Zhang 
22220b3346cSJunchao Zhang #if defined(PETSC_HAVE_OPENMP_SUPPORT)
223a32e93adSJunchao Zhang #include <pthread.h>
224a32e93adSJunchao Zhang #include <hwloc.h>
225a32e93adSJunchao Zhang #include <omp.h>
226a32e93adSJunchao Zhang 
227eff715bbSJunchao Zhang /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
228eff715bbSJunchao Zhang    otherwise use MPI_Win_allocate_shared. They should have the same effect, except that MPI-3 is much
2294df5c2c7SJunchao Zhang    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
2304df5c2c7SJunchao Zhang    by 50%. Until the reason is understood, we use mmap() instead.
2314df5c2c7SJunchao Zhang */
2324df5c2c7SJunchao Zhang #define USE_MMAP_ALLOCATE_SHARED_MEMORY
2334df5c2c7SJunchao Zhang 
2344df5c2c7SJunchao Zhang #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
2354df5c2c7SJunchao Zhang #include <sys/mman.h>
2364df5c2c7SJunchao Zhang #include <sys/types.h>
2374df5c2c7SJunchao Zhang #include <sys/stat.h>
2384df5c2c7SJunchao Zhang #include <fcntl.h>
2394df5c2c7SJunchao Zhang #endif
2404df5c2c7SJunchao Zhang 
241a32e93adSJunchao Zhang struct _n_PetscOmpCtrl {
242a32e93adSJunchao Zhang   MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
243a32e93adSJunchao Zhang   MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
244a32e93adSJunchao Zhang   PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
245a32e93adSJunchao Zhang   PetscBool         is_omp_master;   /* true on rank 0 of omp_comm */
246a32e93adSJunchao Zhang   MPI_Win           omp_win;         /* a shared memory window containing a barrier */
247a32e93adSJunchao Zhang   pthread_barrier_t *barrier;        /* pointer to the barrier */
248a32e93adSJunchao Zhang   hwloc_topology_t  topology;
249a32e93adSJunchao Zhang   hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
250a32e93adSJunchao Zhang   hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
251a32e93adSJunchao Zhang };
252a32e93adSJunchao Zhang 
253eff715bbSJunchao Zhang /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
2548fcaa860SBarry Smith    contained by the controller.
255eff715bbSJunchao Zhang 
2568fcaa860SBarry Smith    PETSc OpenMP controller users do not call this function directly. This function exists
257eff715bbSJunchao Zhang    only because we want to separate shared memory allocation methods from other code.
258eff715bbSJunchao Zhang  */
2599fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
260a32e93adSJunchao Zhang {
261a32e93adSJunchao Zhang   PetscErrorCode        ierr;
262a32e93adSJunchao Zhang   MPI_Aint              size;
263a32e93adSJunchao Zhang   void                  *baseptr;
264a32e93adSJunchao Zhang   pthread_barrierattr_t  attr;
265a32e93adSJunchao Zhang 
2664df5c2c7SJunchao Zhang #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
2674df5c2c7SJunchao Zhang   PetscInt              fd;
2684df5c2c7SJunchao Zhang   PetscChar             pathname[PETSC_MAX_PATH_LEN];
2694df5c2c7SJunchao Zhang #else
2704df5c2c7SJunchao Zhang   PetscMPIInt           disp_unit;
2714df5c2c7SJunchao Zhang #endif
2724df5c2c7SJunchao Zhang 
2734df5c2c7SJunchao Zhang   PetscFunctionBegin;
2744df5c2c7SJunchao Zhang #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
2754df5c2c7SJunchao Zhang   size = sizeof(pthread_barrier_t);
2764df5c2c7SJunchao Zhang   if (ctrl->is_omp_master) {
277eff715bbSJunchao Zhang     /* use PETSC_COMM_SELF in PetscGetTmp() since it is a collective call; using omp_comm would bcast the still-incomplete pathname to the slaves */
2784df5c2c7SJunchao Zhang     ierr    = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
2794df5c2c7SJunchao Zhang     ierr    = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
2804df5c2c7SJunchao Zhang     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
281*2c71b3e2SJacob Faibussowitsch     fd      = mkstemp(pathname); PetscCheckFalse(fd == -1,PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp", pathname);
2824df5c2c7SJunchao Zhang     ierr    = ftruncate(fd,size);CHKERRQ(ierr);
283*2c71b3e2SJacob Faibussowitsch     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); PetscCheckFalse(baseptr == MAP_FAILED,PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed");
2844df5c2c7SJunchao Zhang     ierr    = close(fd);CHKERRQ(ierr);
28555b25c41SPierre Jolivet     ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRMPI(ierr);
286eff715bbSJunchao Zhang     /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
28755b25c41SPierre Jolivet     ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
2884df5c2c7SJunchao Zhang     ierr    = unlink(pathname);CHKERRQ(ierr);
2894df5c2c7SJunchao Zhang   } else {
29055b25c41SPierre Jolivet     ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRMPI(ierr);
291*2c71b3e2SJacob Faibussowitsch     fd      = open(pathname,O_RDWR); PetscCheckFalse(fd == -1,PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s", pathname);
292*2c71b3e2SJacob Faibussowitsch     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); PetscCheckFalse(baseptr == MAP_FAILED,PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed");
2934df5c2c7SJunchao Zhang     ierr    = close(fd);CHKERRQ(ierr);
29455b25c41SPierre Jolivet     ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
2954df5c2c7SJunchao Zhang   }
2964df5c2c7SJunchao Zhang #else
297a32e93adSJunchao Zhang   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
298ffc4695bSBarry Smith   ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRMPI(ierr);
299ffc4695bSBarry Smith   ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRMPI(ierr);
3004df5c2c7SJunchao Zhang #endif
301a32e93adSJunchao Zhang   ctrl->barrier = (pthread_barrier_t*)baseptr;
302a32e93adSJunchao Zhang 
303a32e93adSJunchao Zhang   /* omp master initializes the barrier */
304a32e93adSJunchao Zhang   if (ctrl->is_omp_master) {
305ffc4695bSBarry Smith     ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRMPI(ierr);
306a32e93adSJunchao Zhang     ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
307a32e93adSJunchao Zhang     ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
308a32e93adSJunchao Zhang     ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
309a32e93adSJunchao Zhang     ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
310a32e93adSJunchao Zhang   }
311a32e93adSJunchao Zhang 
3124df5c2c7SJunchao Zhang   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
313ffc4695bSBarry Smith   ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
314a32e93adSJunchao Zhang   PetscFunctionReturn(0);
315a32e93adSJunchao Zhang }
316a32e93adSJunchao Zhang 
3178fcaa860SBarry Smith /* Destroy the pthread barrier in the PETSc OpenMP controller */
3189fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
319a32e93adSJunchao Zhang {
320a32e93adSJunchao Zhang   PetscErrorCode ierr;
321a32e93adSJunchao Zhang 
3224df5c2c7SJunchao Zhang   PetscFunctionBegin;
3234df5c2c7SJunchao Zhang   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
324ffc4695bSBarry Smith   ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
325a32e93adSJunchao Zhang   if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }
3264df5c2c7SJunchao Zhang 
3274df5c2c7SJunchao Zhang #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
3284df5c2c7SJunchao Zhang   ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
3294df5c2c7SJunchao Zhang #else
330ffc4695bSBarry Smith   ierr = MPI_Win_free(&ctrl->omp_win);CHKERRMPI(ierr);
3314df5c2c7SJunchao Zhang #endif
332a32e93adSJunchao Zhang   PetscFunctionReturn(0);
333a32e93adSJunchao Zhang }
334a32e93adSJunchao Zhang 
335eff715bbSJunchao Zhang /*@C
3368fcaa860SBarry Smith     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries using OpenMP
337eff715bbSJunchao Zhang 
338d8d19677SJose E. Roman     Input Parameters:
339eff715bbSJunchao Zhang +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
340a2b725a8SWilliam Gropp -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
341eff715bbSJunchao Zhang 
342eff715bbSJunchao Zhang     Output Parameter:
3438fcaa860SBarry Smith .   pctrl      - a PETSc OpenMP controller
344eff715bbSJunchao Zhang 
345eff715bbSJunchao Zhang     Level: developer
346eff715bbSJunchao Zhang 
3478fcaa860SBarry Smith     TODO: Possibly use the variable PetscNumOMPThreads to determine the number of threads to use
3488fcaa860SBarry Smith 
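    Example Usage:
    A sketch of the intended create/use/destroy cycle; the third party library call is illustrative only:

      PetscOmpCtrl ctrl;
      PetscBool    is_omp_master;
      ierr = PetscOmpCtrlCreate(PETSC_COMM_WORLD,-1,&ctrl);CHKERRQ(ierr);
      ierr = PetscOmpCtrlGetOmpComms(ctrl,NULL,NULL,&is_omp_master);CHKERRQ(ierr);
      if (is_omp_master) {
        ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);
        Call the OpenMP-enabled third party library
        ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);
      }
      ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);
      ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);
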
349eff715bbSJunchao Zhang .seealso PetscOmpCtrlDestroy()
350eff715bbSJunchao Zhang @*/
351a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
352a32e93adSJunchao Zhang {
353a32e93adSJunchao Zhang   PetscErrorCode        ierr;
354a32e93adSJunchao Zhang   PetscOmpCtrl          ctrl;
355a32e93adSJunchao Zhang   unsigned long         *cpu_ulongs=NULL;
356a32e93adSJunchao Zhang   PetscInt              i,nr_cpu_ulongs;
357a32e93adSJunchao Zhang   PetscShmComm          pshmcomm;
358a32e93adSJunchao Zhang   MPI_Comm              shm_comm;
359a32e93adSJunchao Zhang   PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;
3607c405c4aSJunchao Zhang   PetscInt              num_packages,num_cores;
361a32e93adSJunchao Zhang 
362a32e93adSJunchao Zhang   PetscFunctionBegin;
363a32e93adSJunchao Zhang   ierr = PetscNew(&ctrl);CHKERRQ(ierr);
364a32e93adSJunchao Zhang 
365a32e93adSJunchao Zhang   /*=================================================================================
3667c405c4aSJunchao Zhang     Init hwloc
3677c405c4aSJunchao Zhang    ==================================================================================*/
3687c405c4aSJunchao Zhang   ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
3697c405c4aSJunchao Zhang #if HWLOC_API_VERSION >= 0x00020000
3707c405c4aSJunchao Zhang   /* to filter out unneeded info and have faster hwloc_topology_load */
3717c405c4aSJunchao Zhang   ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
3727c405c4aSJunchao Zhang   ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
3737c405c4aSJunchao Zhang #endif
3747c405c4aSJunchao Zhang   ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);
3757c405c4aSJunchao Zhang 
3767c405c4aSJunchao Zhang   /*=================================================================================
377a32e93adSJunchao Zhang     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
378a32e93adSJunchao Zhang     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
379a32e93adSJunchao Zhang     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
380a32e93adSJunchao Zhang     which is usually passed to third party libraries.
381a32e93adSJunchao Zhang    ==================================================================================*/
382a32e93adSJunchao Zhang 
383a32e93adSJunchao Zhang   /* fetch the stored shared memory communicator */
384a32e93adSJunchao Zhang   ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
385a32e93adSJunchao Zhang   ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);
386a32e93adSJunchao Zhang 
387ffc4695bSBarry Smith   ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRMPI(ierr);
388ffc4695bSBarry Smith   ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRMPI(ierr);
389a32e93adSJunchao Zhang 
3907c405c4aSJunchao Zhang   /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
3917c405c4aSJunchao Zhang   if (nthreads == -1) {
392a312e481SBarry Smith     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
393a312e481SBarry Smith     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 :  hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
3947c405c4aSJunchao Zhang     nthreads     = num_cores/num_packages;
3957c405c4aSJunchao Zhang     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
3967c405c4aSJunchao Zhang   }
3977c405c4aSJunchao Zhang 
398*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(nthreads < 1 || nthreads > shm_comm_size,petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %" PetscInt_FMT " cannot be < 1 or > the MPI shared memory communicator size %d",nthreads,shm_comm_size);
3993ca90d2dSJacob Faibussowitsch   if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }
400a32e93adSJunchao Zhang 
401a32e93adSJunchao Zhang   /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
402a32e93adSJunchao Zhang      shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
403a32e93adSJunchao Zhang      color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
404a32e93adSJunchao Zhang      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
405a32e93adSJunchao Zhang      Use 0 as the key so that the rank ordering won't change in the new comm.
406a32e93adSJunchao Zhang    */
407a32e93adSJunchao Zhang   color = shm_rank / nthreads;
40855b25c41SPierre Jolivet   ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRMPI(ierr);
409a32e93adSJunchao Zhang 
410a32e93adSJunchao Zhang   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
411ffc4695bSBarry Smith   ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRMPI(ierr);
412a32e93adSJunchao Zhang   if (!omp_rank) {
413a32e93adSJunchao Zhang     ctrl->is_omp_master = PETSC_TRUE;  /* master */
414a32e93adSJunchao Zhang     color = 0;
415a32e93adSJunchao Zhang   } else {
416a32e93adSJunchao Zhang     ctrl->is_omp_master = PETSC_FALSE; /* slave */
417a32e93adSJunchao Zhang     color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
418a32e93adSJunchao Zhang   }
419ffc4695bSBarry Smith   ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRMPI(ierr);
420a32e93adSJunchao Zhang 
421a32e93adSJunchao Zhang   /*=================================================================================
422a32e93adSJunchao Zhang     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
423a32e93adSJunchao Zhang     slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
424a32e93adSJunchao Zhang     and run them on the idle CPUs.
425a32e93adSJunchao Zhang    ==================================================================================*/
426a32e93adSJunchao Zhang   ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);
427a32e93adSJunchao Zhang 
428a32e93adSJunchao Zhang   /*=================================================================================
429a32e93adSJunchao Zhang     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
430a32e93adSJunchao Zhang     is the union of the bindings of all ranks in the omp_comm
431a32e93adSJunchao Zhang     =================================================================================*/
432a32e93adSJunchao Zhang 
433*2c71b3e2SJacob Faibussowitsch   ctrl->cpuset = hwloc_bitmap_alloc(); PetscCheckFalse(!ctrl->cpuset,PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed");
434a32e93adSJunchao Zhang   ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
435a32e93adSJunchao Zhang 
4363ab56b82SJunchao Zhang   /* The main hwloc developer said new APIs hwloc_bitmap_{nr,to,from}_ulongs would be added in 2.1 to help simplify the following bitmap pack/unpack code */
437a32e93adSJunchao Zhang   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
438a32e93adSJunchao Zhang   ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
439a32e93adSJunchao Zhang   if (nr_cpu_ulongs == 1) {
440a32e93adSJunchao Zhang     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
441a32e93adSJunchao Zhang   } else {
442a32e93adSJunchao Zhang     for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
443a32e93adSJunchao Zhang   }
444a32e93adSJunchao Zhang 
445ffc4695bSBarry Smith   ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRMPI(ierr);
446a32e93adSJunchao Zhang 
447a32e93adSJunchao Zhang   if (ctrl->is_omp_master) {
448*2c71b3e2SJacob Faibussowitsch     ctrl->omp_cpuset = hwloc_bitmap_alloc(); PetscCheckFalse(!ctrl->omp_cpuset,PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed");
449a32e93adSJunchao Zhang     if (nr_cpu_ulongs == 1) {
4503ab56b82SJunchao Zhang #if HWLOC_API_VERSION >= 0x00020000
451a32e93adSJunchao Zhang       ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
4523ab56b82SJunchao Zhang #else
4533ab56b82SJunchao Zhang       hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
4543ab56b82SJunchao Zhang #endif
455a32e93adSJunchao Zhang     } else {
4563ab56b82SJunchao Zhang       for (i=0; i<nr_cpu_ulongs; i++)  {
4573ab56b82SJunchao Zhang #if HWLOC_API_VERSION >= 0x00020000
4583ab56b82SJunchao Zhang         ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
4593ab56b82SJunchao Zhang #else
4603ab56b82SJunchao Zhang         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
4613ab56b82SJunchao Zhang #endif
4623ab56b82SJunchao Zhang       }
463a32e93adSJunchao Zhang     }
464a32e93adSJunchao Zhang   }
465a32e93adSJunchao Zhang 
466a32e93adSJunchao Zhang   ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
467a32e93adSJunchao Zhang   *pctrl = ctrl;
468a32e93adSJunchao Zhang   PetscFunctionReturn(0);
469a32e93adSJunchao Zhang }
470a32e93adSJunchao Zhang 
471eff715bbSJunchao Zhang /*@C
47264f49babSJed Brown     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
473eff715bbSJunchao Zhang 
474eff715bbSJunchao Zhang     Input Parameter:
4758fcaa860SBarry Smith .   pctrl  - a PETSc OpenMP controller
476eff715bbSJunchao Zhang 
477eff715bbSJunchao Zhang     Level: developer
478eff715bbSJunchao Zhang 
479eff715bbSJunchao Zhang .seealso PetscOmpCtrlCreate()
480eff715bbSJunchao Zhang @*/
481a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
482a32e93adSJunchao Zhang {
483a32e93adSJunchao Zhang   PetscErrorCode  ierr;
484a32e93adSJunchao Zhang   PetscOmpCtrl    ctrl = *pctrl;
485a32e93adSJunchao Zhang 
486a32e93adSJunchao Zhang   PetscFunctionBegin;
487a32e93adSJunchao Zhang   hwloc_bitmap_free(ctrl->cpuset);
488a32e93adSJunchao Zhang   hwloc_topology_destroy(ctrl->topology);
489a32e93adSJunchao Zhang   PetscOmpCtrlDestroyBarrier(ctrl);
490ffc4695bSBarry Smith   ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRMPI(ierr);
491a32e93adSJunchao Zhang   if (ctrl->is_omp_master) {
492a32e93adSJunchao Zhang     hwloc_bitmap_free(ctrl->omp_cpuset);
493ffc4695bSBarry Smith     ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRMPI(ierr);
494a32e93adSJunchao Zhang   }
495a32e93adSJunchao Zhang   ierr = PetscFree(ctrl);CHKERRQ(ierr);
496a32e93adSJunchao Zhang   PetscFunctionReturn(0);
497a32e93adSJunchao Zhang }
498a32e93adSJunchao Zhang 
499a32e93adSJunchao Zhang /*@C
5008fcaa860SBarry Smith     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
501a32e93adSJunchao Zhang 
502a32e93adSJunchao Zhang     Input Parameter:
5038fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
504a32e93adSJunchao Zhang 
505d8d19677SJose E. Roman     Output Parameters:
506eff715bbSJunchao Zhang +   omp_comm         - a communicator that includes a master rank and slave ranks, on which the master spawns threads
507a32e93adSJunchao Zhang .   omp_master_comm  - on master ranks, a communicator consisting of the master ranks of all omp_comms;
508a32e93adSJunchao Zhang                        on slave ranks, MPI_COMM_NULL is returned.
509a32e93adSJunchao Zhang -   is_omp_master    - true if the calling process is an OMP master rank.
510a32e93adSJunchao Zhang 
511eff715bbSJunchao Zhang     Notes: any output parameter can be NULL; such a parameter is simply ignored.
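
    Example Usage:
    A sketch; ctrl is assumed to come from PetscOmpCtrlCreate(), and what a third party library does
    with omp_master_comm is illustrative only:

      MPI_Comm  omp_comm,omp_master_comm;
      PetscBool is_omp_master;
      ierr = PetscOmpCtrlGetOmpComms(ctrl,&omp_comm,&omp_master_comm,&is_omp_master);CHKERRQ(ierr);
      if (is_omp_master) {
        Hand omp_master_comm to the OpenMP-enabled library
      }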
512eff715bbSJunchao Zhang 
513a32e93adSJunchao Zhang     Level: developer
514a32e93adSJunchao Zhang @*/
515a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
516a32e93adSJunchao Zhang {
517a32e93adSJunchao Zhang   PetscFunctionBegin;
518a32e93adSJunchao Zhang   if (omp_comm)        *omp_comm        = ctrl->omp_comm;
519a32e93adSJunchao Zhang   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
520a32e93adSJunchao Zhang   if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
521a32e93adSJunchao Zhang   PetscFunctionReturn(0);
522a32e93adSJunchao Zhang }
523a32e93adSJunchao Zhang 
524eff715bbSJunchao Zhang /*@C
5258fcaa860SBarry Smith     PetscOmpCtrlBarrier - Perform a barrier on the MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)
526eff715bbSJunchao Zhang 
527eff715bbSJunchao Zhang     Input Parameter:
5288fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
529eff715bbSJunchao Zhang 
530eff715bbSJunchao Zhang     Notes:
531eff715bbSJunchao Zhang     this is a pthread barrier on MPI processes. Using MPI_Barrier instead would be conceptually correct, but the MPI standard does not
532eff715bbSJunchao Zhang     require that processes blocked in MPI_Barrier free their CPUs to let other processes progress. In practice, to minimize latency,
533eff715bbSJunchao Zhang     MPI processes stuck in MPI_Barrier keep polling and do not free their CPUs. In contrast, pthread barriers do have this requirement.
534eff715bbSJunchao Zhang 
535eff715bbSJunchao Zhang     Code using PetscOmpCtrlBarrier() would look like this:
536eff715bbSJunchao Zhang 
537eff715bbSJunchao Zhang     if (is_omp_master) {
538eff715bbSJunchao Zhang       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
539eff715bbSJunchao Zhang       Call the library using OpenMP
540eff715bbSJunchao Zhang       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
541eff715bbSJunchao Zhang     }
542eff715bbSJunchao Zhang     PetscOmpCtrlBarrier(ctrl);
543eff715bbSJunchao Zhang 
544eff715bbSJunchao Zhang     Level: developer
545eff715bbSJunchao Zhang 
546eff715bbSJunchao Zhang .seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
547eff715bbSJunchao Zhang @*/
548a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
549a32e93adSJunchao Zhang {
5502da392ccSBarry Smith   int err;
551a32e93adSJunchao Zhang 
552a32e93adSJunchao Zhang   PetscFunctionBegin;
5532da392ccSBarry Smith   err = pthread_barrier_wait(ctrl->barrier);
554*2c71b3e2SJacob Faibussowitsch   PetscCheckFalse(err && err != PTHREAD_BARRIER_SERIAL_THREAD,PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %" PetscInt_FMT, err);
555a32e93adSJunchao Zhang   PetscFunctionReturn(0);
556a32e93adSJunchao Zhang }
557a32e93adSJunchao Zhang 
558eff715bbSJunchao Zhang /*@C
559eff715bbSJunchao Zhang     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
560eff715bbSJunchao Zhang 
561eff715bbSJunchao Zhang     Input Parameter:
5628fcaa860SBarry Smith .   ctrl - a PETSc OMP controller
563eff715bbSJunchao Zhang 
564eff715bbSJunchao Zhang     Notes:
5658fcaa860SBarry Smith     Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
566eff715bbSJunchao Zhang     This function changes the CPU binding of the master ranks and the nthreads-var of the OpenMP runtime
567eff715bbSJunchao Zhang 
568eff715bbSJunchao Zhang     Level: developer
569eff715bbSJunchao Zhang 
570eff715bbSJunchao Zhang .seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
571eff715bbSJunchao Zhang @*/
572a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
573a32e93adSJunchao Zhang {
574a32e93adSJunchao Zhang   PetscErrorCode ierr;
575a32e93adSJunchao Zhang 
576a32e93adSJunchao Zhang   PetscFunctionBegin;
577a32e93adSJunchao Zhang   ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
578eff715bbSJunchao Zhang   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
579a32e93adSJunchao Zhang   PetscFunctionReturn(0);
580a32e93adSJunchao Zhang }
581a32e93adSJunchao Zhang 
582eff715bbSJunchao Zhang /*@C
583eff715bbSJunchao Zhang    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
584eff715bbSJunchao Zhang 
585eff715bbSJunchao Zhang    Input Parameter:
5868fcaa860SBarry Smith .  ctrl - a PETSc OMP controller
587eff715bbSJunchao Zhang 
588eff715bbSJunchao Zhang    Notes:
5898fcaa860SBarry Smith    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
590eff715bbSJunchao Zhang    This function restores the CPU binding of the master ranks and sets the nthreads-var of the OpenMP runtime to 1.
591eff715bbSJunchao Zhang 
592eff715bbSJunchao Zhang    Level: developer
593eff715bbSJunchao Zhang 
594eff715bbSJunchao Zhang .seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
595eff715bbSJunchao Zhang @*/
596a32e93adSJunchao Zhang PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
597a32e93adSJunchao Zhang {
598a32e93adSJunchao Zhang   PetscErrorCode ierr;
599a32e93adSJunchao Zhang 
600a32e93adSJunchao Zhang   PetscFunctionBegin;
601a32e93adSJunchao Zhang   ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
602eff715bbSJunchao Zhang   omp_set_num_threads(1);
603a32e93adSJunchao Zhang   PetscFunctionReturn(0);
604a32e93adSJunchao Zhang }
605a32e93adSJunchao Zhang 
6064df5c2c7SJunchao Zhang #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
60720b3346cSJunchao Zhang #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */
608