xref: /petsc/src/sys/utils/openmp/mpmpishm.c (revision d8f7746b020311b8d34d94008ae0987ac909cf22)
1ce78bad3SBarry Smith #include <petscsys.h> /*I  "petscsys.h"  I*/
2ce78bad3SBarry Smith #include <petsc/private/petscimpl.h>
3ce78bad3SBarry Smith #include <pthread.h>
4ce78bad3SBarry Smith #include <hwloc.h>
5ce78bad3SBarry Smith #include <omp.h>
6ce78bad3SBarry Smith 
7ce78bad3SBarry Smith /* Use mmap() to allocate shared mmeory (for the pthread_barrier_t object) if it is available,
8ce78bad3SBarry Smith    otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
9ce78bad3SBarry Smith    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
10ce78bad3SBarry Smith    by 50%. Until the reason is found out, we use mmap() instead.
11ce78bad3SBarry Smith */
12ce78bad3SBarry Smith #define USE_MMAP_ALLOCATE_SHARED_MEMORY
13ce78bad3SBarry Smith 
14ce78bad3SBarry Smith #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
15ce78bad3SBarry Smith   #include <sys/mman.h>
16ce78bad3SBarry Smith   #include <sys/types.h>
17ce78bad3SBarry Smith   #include <sys/stat.h>
18ce78bad3SBarry Smith   #include <fcntl.h>
19ce78bad3SBarry Smith #endif
20ce78bad3SBarry Smith 
21ce78bad3SBarry Smith struct _n_PetscOmpCtrl {
22ce78bad3SBarry Smith   MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
23ce78bad3SBarry Smith   MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
24ce78bad3SBarry Smith   PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
25ce78bad3SBarry Smith   PetscBool          is_omp_master;   /* rank 0's in omp_comm */
26ce78bad3SBarry Smith   MPI_Win            omp_win;         /* a shared memory window containing a barrier */
27ce78bad3SBarry Smith   pthread_barrier_t *barrier;         /* pointer to the barrier */
28ce78bad3SBarry Smith   hwloc_topology_t   topology;
29ce78bad3SBarry Smith   hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
30ce78bad3SBarry Smith   hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
31ce78bad3SBarry Smith };
32ce78bad3SBarry Smith 
33ce78bad3SBarry Smith /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
34ce78bad3SBarry Smith    contained by the controller.
35ce78bad3SBarry Smith 
36ce78bad3SBarry Smith    PETSc OpenMP controller users do not call this function directly. This function exists
37ce78bad3SBarry Smith    only because we want to separate shared memory allocation methods from other code.
38ce78bad3SBarry Smith  */
PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)39ce78bad3SBarry Smith static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
40ce78bad3SBarry Smith {
41ce78bad3SBarry Smith   MPI_Aint              size;
42ce78bad3SBarry Smith   void                 *baseptr;
43ce78bad3SBarry Smith   pthread_barrierattr_t attr;
44ce78bad3SBarry Smith 
45ce78bad3SBarry Smith #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
46ce78bad3SBarry Smith   int  fd;
47ce78bad3SBarry Smith   char pathname[PETSC_MAX_PATH_LEN];
48ce78bad3SBarry Smith #else
49ce78bad3SBarry Smith   PetscMPIInt disp_unit;
50ce78bad3SBarry Smith #endif
51ce78bad3SBarry Smith 
52ce78bad3SBarry Smith   PetscFunctionBegin;
53ce78bad3SBarry Smith #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
54ce78bad3SBarry Smith   size = sizeof(pthread_barrier_t);
55ce78bad3SBarry Smith   if (ctrl->is_omp_master) {
56ce78bad3SBarry Smith     /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
57ce78bad3SBarry Smith     PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
58ce78bad3SBarry Smith     PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
59ce78bad3SBarry Smith     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
60ce78bad3SBarry Smith     fd = mkstemp(pathname);
61ce78bad3SBarry Smith     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
62ce78bad3SBarry Smith     PetscCallExternal(ftruncate, fd, size);
63ce78bad3SBarry Smith     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
64ce78bad3SBarry Smith     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
65ce78bad3SBarry Smith     PetscCallExternal(close, fd);
66ce78bad3SBarry Smith     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
67ce78bad3SBarry Smith     /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
68ce78bad3SBarry Smith     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
69ce78bad3SBarry Smith     PetscCallExternal(unlink, pathname);
70ce78bad3SBarry Smith   } else {
71ce78bad3SBarry Smith     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
72ce78bad3SBarry Smith     fd = open(pathname, O_RDWR);
73ce78bad3SBarry Smith     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
74ce78bad3SBarry Smith     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
75ce78bad3SBarry Smith     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
76ce78bad3SBarry Smith     PetscCallExternal(close, fd);
77ce78bad3SBarry Smith     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
78ce78bad3SBarry Smith   }
79ce78bad3SBarry Smith #else
80ce78bad3SBarry Smith   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
81ce78bad3SBarry Smith   PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
82ce78bad3SBarry Smith   PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
83ce78bad3SBarry Smith #endif
84ce78bad3SBarry Smith   ctrl->barrier = (pthread_barrier_t *)baseptr;
85ce78bad3SBarry Smith 
86ce78bad3SBarry Smith   /* omp master initializes the barrier */
87ce78bad3SBarry Smith   if (ctrl->is_omp_master) {
88ce78bad3SBarry Smith     PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
89ce78bad3SBarry Smith     PetscCallExternal(pthread_barrierattr_init, &attr);
90ce78bad3SBarry Smith     PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
91ce78bad3SBarry Smith     PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
92ce78bad3SBarry Smith     PetscCallExternal(pthread_barrierattr_destroy, &attr);
93ce78bad3SBarry Smith   }
94ce78bad3SBarry Smith 
95ce78bad3SBarry Smith   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
96ce78bad3SBarry Smith   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
97ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
98ce78bad3SBarry Smith }
99ce78bad3SBarry Smith 
100ce78bad3SBarry Smith /* Destroy the pthread barrier in the PETSc OpenMP controller */
PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)101ce78bad3SBarry Smith static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
102ce78bad3SBarry Smith {
103ce78bad3SBarry Smith   PetscFunctionBegin;
104ce78bad3SBarry Smith   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
105ce78bad3SBarry Smith   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
106ce78bad3SBarry Smith   if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);
107ce78bad3SBarry Smith 
108ce78bad3SBarry Smith #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
109ce78bad3SBarry Smith   PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
110ce78bad3SBarry Smith #else
111ce78bad3SBarry Smith   PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
112ce78bad3SBarry Smith #endif
113ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
114ce78bad3SBarry Smith }
115ce78bad3SBarry Smith 
116ce78bad3SBarry Smith /*@C
117ce78bad3SBarry Smith   PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP
118ce78bad3SBarry Smith 
119ce78bad3SBarry Smith   Input Parameters:
120ce78bad3SBarry Smith + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
121ce78bad3SBarry Smith - nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
122ce78bad3SBarry Smith 
123ce78bad3SBarry Smith   Output Parameter:
124ce78bad3SBarry Smith . pctrl - a PETSc OpenMP controller
125ce78bad3SBarry Smith 
126ce78bad3SBarry Smith   Level: developer
127ce78bad3SBarry Smith 
128ce78bad3SBarry Smith   Developer Note:
129ce78bad3SBarry Smith   Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use
130ce78bad3SBarry Smith 
131ce78bad3SBarry Smith .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
132ce78bad3SBarry Smith @*/
PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl * pctrl)133ce78bad3SBarry Smith PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
134ce78bad3SBarry Smith {
135ce78bad3SBarry Smith   PetscOmpCtrl   ctrl;
136ce78bad3SBarry Smith   unsigned long *cpu_ulongs = NULL;
137ce78bad3SBarry Smith   PetscShmComm   pshmcomm;
138ce78bad3SBarry Smith   MPI_Comm       shm_comm;
139*d83f41f0SPierre Jolivet   PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color, nr_cpu_ulongs;
140ce78bad3SBarry Smith   PetscInt       num_packages, num_cores;
141ce78bad3SBarry Smith 
142ce78bad3SBarry Smith   PetscFunctionBegin;
143ce78bad3SBarry Smith   PetscCall(PetscNew(&ctrl));
144ce78bad3SBarry Smith 
145ce78bad3SBarry Smith   /*=================================================================================
146ce78bad3SBarry Smith     Init hwloc
147ce78bad3SBarry Smith    ==================================================================================*/
148ce78bad3SBarry Smith   PetscCallExternal(hwloc_topology_init, &ctrl->topology);
149ce78bad3SBarry Smith #if HWLOC_API_VERSION >= 0x00020000
150ce78bad3SBarry Smith   /* to filter out unneeded info and have faster hwloc_topology_load */
151ce78bad3SBarry Smith   PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
152ce78bad3SBarry Smith   PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
153ce78bad3SBarry Smith #endif
154ce78bad3SBarry Smith   PetscCallExternal(hwloc_topology_load, ctrl->topology);
155ce78bad3SBarry Smith 
156ce78bad3SBarry Smith   /*=================================================================================
157ce78bad3SBarry Smith     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
158ce78bad3SBarry Smith     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
159ce78bad3SBarry Smith     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
160ce78bad3SBarry Smith     which is usually passed to third party libraries.
161ce78bad3SBarry Smith    ==================================================================================*/
162ce78bad3SBarry Smith 
163ce78bad3SBarry Smith   /* fetch the stored shared memory communicator */
164ce78bad3SBarry Smith   PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
165ce78bad3SBarry Smith   PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));
166ce78bad3SBarry Smith 
167ce78bad3SBarry Smith   PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
168ce78bad3SBarry Smith   PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));
169ce78bad3SBarry Smith 
170ce78bad3SBarry Smith   /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
171ce78bad3SBarry Smith   if (nthreads == -1) {
172ce78bad3SBarry Smith     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
173ce78bad3SBarry Smith     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
174ce78bad3SBarry Smith     nthreads     = num_cores / num_packages;
175ce78bad3SBarry Smith     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
176ce78bad3SBarry Smith   }
177ce78bad3SBarry Smith 
178ce78bad3SBarry Smith   PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
179ce78bad3SBarry Smith   if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));
180ce78bad3SBarry Smith 
181ce78bad3SBarry Smith   /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
182ce78bad3SBarry Smith      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
183ce78bad3SBarry Smith      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
184ce78bad3SBarry Smith      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
185ce78bad3SBarry Smith      Use 0 as key so that rank ordering wont change in new comm.
186ce78bad3SBarry Smith    */
187ce78bad3SBarry Smith   color = shm_rank / nthreads;
188ce78bad3SBarry Smith   PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));
189ce78bad3SBarry Smith 
190ce78bad3SBarry Smith   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
191ce78bad3SBarry Smith   PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
192ce78bad3SBarry Smith   if (!omp_rank) {
193ce78bad3SBarry Smith     ctrl->is_omp_master = PETSC_TRUE; /* master */
194ce78bad3SBarry Smith     color               = 0;
195ce78bad3SBarry Smith   } else {
196ce78bad3SBarry Smith     ctrl->is_omp_master = PETSC_FALSE;   /* slave */
197ce78bad3SBarry Smith     color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
198ce78bad3SBarry Smith   }
199ce78bad3SBarry Smith   PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));
200ce78bad3SBarry Smith 
201ce78bad3SBarry Smith   /*=================================================================================
202ce78bad3SBarry Smith     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
203ce78bad3SBarry Smith     slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
204ce78bad3SBarry Smith     and run them on the idle CPUs.
205ce78bad3SBarry Smith    ==================================================================================*/
206ce78bad3SBarry Smith   PetscCall(PetscOmpCtrlCreateBarrier(ctrl));
207ce78bad3SBarry Smith 
208ce78bad3SBarry Smith   /*=================================================================================
209ce78bad3SBarry Smith     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
210ce78bad3SBarry Smith     is the union of the bindings of all ranks in the omp_comm
211ce78bad3SBarry Smith     =================================================================================*/
212ce78bad3SBarry Smith 
213ce78bad3SBarry Smith   ctrl->cpuset = hwloc_bitmap_alloc();
214ce78bad3SBarry Smith   PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
215ce78bad3SBarry Smith   PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
216ce78bad3SBarry Smith 
217ce78bad3SBarry Smith   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
218ce78bad3SBarry Smith   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
219ce78bad3SBarry Smith   PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
220ce78bad3SBarry Smith   if (nr_cpu_ulongs == 1) {
221ce78bad3SBarry Smith     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
222ce78bad3SBarry Smith   } else {
223*d83f41f0SPierre Jolivet     for (PetscMPIInt i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
224ce78bad3SBarry Smith   }
225ce78bad3SBarry Smith 
226ce78bad3SBarry Smith   PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));
227ce78bad3SBarry Smith 
228ce78bad3SBarry Smith   if (ctrl->is_omp_master) {
229ce78bad3SBarry Smith     ctrl->omp_cpuset = hwloc_bitmap_alloc();
230ce78bad3SBarry Smith     PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
231ce78bad3SBarry Smith     if (nr_cpu_ulongs == 1) {
232ce78bad3SBarry Smith #if HWLOC_API_VERSION >= 0x00020000
233ce78bad3SBarry Smith       PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
234ce78bad3SBarry Smith #else
235ce78bad3SBarry Smith       hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
236ce78bad3SBarry Smith #endif
237ce78bad3SBarry Smith     } else {
238*d83f41f0SPierre Jolivet       for (PetscMPIInt i = 0; i < nr_cpu_ulongs; i++) {
239ce78bad3SBarry Smith #if HWLOC_API_VERSION >= 0x00020000
240ce78bad3SBarry Smith         PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
241ce78bad3SBarry Smith #else
242ce78bad3SBarry Smith         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
243ce78bad3SBarry Smith #endif
244ce78bad3SBarry Smith       }
245ce78bad3SBarry Smith     }
246ce78bad3SBarry Smith   }
247ce78bad3SBarry Smith   PetscCall(PetscFree(cpu_ulongs));
248ce78bad3SBarry Smith   *pctrl = ctrl;
249ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
250ce78bad3SBarry Smith }
251ce78bad3SBarry Smith 
252ce78bad3SBarry Smith /*@C
253ce78bad3SBarry Smith   PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
254ce78bad3SBarry Smith 
255ce78bad3SBarry Smith   Input Parameter:
256ce78bad3SBarry Smith . pctrl - a PETSc OpenMP controller
257ce78bad3SBarry Smith 
258ce78bad3SBarry Smith   Level: developer
259ce78bad3SBarry Smith 
260ce78bad3SBarry Smith .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
261ce78bad3SBarry Smith @*/
PetscOmpCtrlDestroy(PetscOmpCtrl * pctrl)262ce78bad3SBarry Smith PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
263ce78bad3SBarry Smith {
264ce78bad3SBarry Smith   PetscOmpCtrl ctrl = *pctrl;
265ce78bad3SBarry Smith 
266ce78bad3SBarry Smith   PetscFunctionBegin;
267ce78bad3SBarry Smith   hwloc_bitmap_free(ctrl->cpuset);
268ce78bad3SBarry Smith   hwloc_topology_destroy(ctrl->topology);
269ce78bad3SBarry Smith   PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
270ce78bad3SBarry Smith   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
271ce78bad3SBarry Smith   if (ctrl->is_omp_master) {
272ce78bad3SBarry Smith     hwloc_bitmap_free(ctrl->omp_cpuset);
273ce78bad3SBarry Smith     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
274ce78bad3SBarry Smith   }
275ce78bad3SBarry Smith   PetscCall(PetscFree(ctrl));
276ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
277ce78bad3SBarry Smith }
278ce78bad3SBarry Smith 
279ce78bad3SBarry Smith /*@C
280ce78bad3SBarry Smith   PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
281ce78bad3SBarry Smith 
282ce78bad3SBarry Smith   Input Parameter:
283ce78bad3SBarry Smith . ctrl - a PETSc OMP controller
284ce78bad3SBarry Smith 
285ce78bad3SBarry Smith   Output Parameters:
286ce78bad3SBarry Smith + omp_comm        - a communicator that includes a master rank and slave ranks where master spawns threads
287ce78bad3SBarry Smith . omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm;
288ce78bad3SBarry Smith                     on slave ranks, `MPI_COMM_NULL` will be return in reality.
289ce78bad3SBarry Smith - is_omp_master   - true if the calling process is an OMP master rank.
290ce78bad3SBarry Smith 
291ce78bad3SBarry Smith   Note:
292ce78bad3SBarry Smith   Any output parameter can be `NULL`. The parameter is just ignored.
293ce78bad3SBarry Smith 
294ce78bad3SBarry Smith   Level: developer
295ce78bad3SBarry Smith 
296ce78bad3SBarry Smith .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
297ce78bad3SBarry Smith @*/
PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm * omp_comm,MPI_Comm * omp_master_comm,PetscBool * is_omp_master)298ce78bad3SBarry Smith PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
299ce78bad3SBarry Smith {
300ce78bad3SBarry Smith   PetscFunctionBegin;
301ce78bad3SBarry Smith   if (omp_comm) *omp_comm = ctrl->omp_comm;
302ce78bad3SBarry Smith   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
303ce78bad3SBarry Smith   if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
304ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
305ce78bad3SBarry Smith }
306ce78bad3SBarry Smith 
307ce78bad3SBarry Smith /*@C
308ce78bad3SBarry Smith   PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)
309ce78bad3SBarry Smith 
310ce78bad3SBarry Smith   Input Parameter:
311ce78bad3SBarry Smith . ctrl - a PETSc OMP controller
312ce78bad3SBarry Smith 
313ce78bad3SBarry Smith   Notes:
314ce78bad3SBarry Smith   This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not
315ce78bad3SBarry Smith   require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minilize latency,
316ce78bad3SBarry Smith   MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
317ce78bad3SBarry Smith 
318ce78bad3SBarry Smith   A code using `PetscOmpCtrlBarrier()` would be like this,
319ce78bad3SBarry Smith .vb
320ce78bad3SBarry Smith   if (is_omp_master) {
321ce78bad3SBarry Smith     PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
322ce78bad3SBarry Smith     Call the library using OpenMP
323ce78bad3SBarry Smith     PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
324ce78bad3SBarry Smith   }
325ce78bad3SBarry Smith   PetscOmpCtrlBarrier(ctrl);
326ce78bad3SBarry Smith .ve
327ce78bad3SBarry Smith 
328ce78bad3SBarry Smith   Level: developer
329ce78bad3SBarry Smith 
330ce78bad3SBarry Smith .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
331ce78bad3SBarry Smith @*/
PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)332ce78bad3SBarry Smith PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
333ce78bad3SBarry Smith {
334ce78bad3SBarry Smith   int err;
335ce78bad3SBarry Smith 
336ce78bad3SBarry Smith   PetscFunctionBegin;
337ce78bad3SBarry Smith   err = pthread_barrier_wait(ctrl->barrier);
338ce78bad3SBarry Smith   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
339ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
340ce78bad3SBarry Smith }
341ce78bad3SBarry Smith 
342ce78bad3SBarry Smith /*@C
343ce78bad3SBarry Smith   PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
344ce78bad3SBarry Smith 
345ce78bad3SBarry Smith   Input Parameter:
346ce78bad3SBarry Smith . ctrl - a PETSc OMP controller
347ce78bad3SBarry Smith 
348ce78bad3SBarry Smith   Note:
349ce78bad3SBarry Smith   Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
350ce78bad3SBarry Smith   This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime
351ce78bad3SBarry Smith 
352ce78bad3SBarry Smith   Level: developer
353ce78bad3SBarry Smith 
354ce78bad3SBarry Smith .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
355ce78bad3SBarry Smith @*/
PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)356ce78bad3SBarry Smith PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
357ce78bad3SBarry Smith {
358ce78bad3SBarry Smith   PetscFunctionBegin;
359ce78bad3SBarry Smith   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
360ce78bad3SBarry Smith   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
361ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
362ce78bad3SBarry Smith }
363ce78bad3SBarry Smith 
364ce78bad3SBarry Smith /*@C
365ce78bad3SBarry Smith   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
366ce78bad3SBarry Smith 
367ce78bad3SBarry Smith   Input Parameter:
368ce78bad3SBarry Smith . ctrl - a PETSc OMP controller
369ce78bad3SBarry Smith 
370ce78bad3SBarry Smith   Note:
371ce78bad3SBarry Smith   Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
372ce78bad3SBarry Smith   This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.
373ce78bad3SBarry Smith 
374ce78bad3SBarry Smith   Level: developer
375ce78bad3SBarry Smith 
376ce78bad3SBarry Smith .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
377ce78bad3SBarry Smith @*/
PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)378ce78bad3SBarry Smith PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
379ce78bad3SBarry Smith {
380ce78bad3SBarry Smith   PetscFunctionBegin;
381ce78bad3SBarry Smith   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
382ce78bad3SBarry Smith   omp_set_num_threads(1);
383ce78bad3SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
384ce78bad3SBarry Smith }
385ce78bad3SBarry Smith 
386ce78bad3SBarry Smith #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
387