xref: /petsc/src/sys/utils/mpishm.c (revision 2d30e087755efd99e28fdfe792ffbeb2ee1ea928)
#include <petscsys.h> /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. It is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state) {
  PetscShmComm p = (PetscShmComm)val;

  PetscFunctionBegin;
  PetscCallMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in an MPI_Comm %ld\n", (long)comm));
  PetscCallMPI(MPI_Comm_free(&p->shmcomm));
  PetscCallMPI(PetscFree(p->globranks));
  PetscCallMPI(PetscFree(val));
  PetscFunctionReturn(MPI_SUCCESS);
}
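
/* For context: a delete callback like the one above is attached to communicators via an MPI
   keyval. A minimal sketch of how such a registration typically looks (the actual call site
   for Petsc_ShmComm_keyval lives elsewhere in PETSc, so this is illustrative only):

     PetscCallMPI(MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN, Petsc_ShmComm_Attr_Delete_Fn, &Petsc_ShmComm_keyval, NULL));

   Afterwards, MPI_Comm_free() on any communicator carrying the attribute invokes
   Petsc_ShmComm_Attr_Delete_Fn automatically. */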

#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
/* Data structures to support freeing comms created in PetscShmCommGet().
  Since we expect communicators passed to PetscShmCommGet() to be either a
  petsc inner communicator or an MPI communicator with a linked petsc inner
  communicator, we use a simple static array to store dupped communicators
  in the rare remaining cases.
 */
#define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt       num_dupped_comms = 0;
static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
static PetscErrorCode PetscShmCommDestroyDuppedComms(void) {
  PetscInt i;
  PetscFunctionBegin;
  for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(0);
}
#endif

/*@C
    PetscShmCommGet - Returns a sub-communicator of all ranks that share common memory

    Collective.

    Input Parameter:
.   globcomm - `MPI_Comm`, which can be a user `MPI_Comm` or a PETSc inner `MPI_Comm`

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Note:
    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

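    Example Usage:
    A minimal sketch; the returned PetscShmComm is cached by PETSc and must not be freed by the caller:
.vb
    PetscShmComm shmcomm;
    MPI_Comm     comm;

    PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm));
    PetscCall(PetscShmCommGetMpiShmComm(shmcomm, &comm)); /* all ranks of comm share memory */
.ve
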
.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm) {
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  MPI_Group         globgroup, shmgroup;
  PetscMPIInt      *shmranks, i, flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
  if (!flg) { /* globcomm is not a petsc comm */
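    /* MPI stores attribute values as void*, while an MPI_Comm handle may be a plain int; the union below converts between the two without casts */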
    union
    {
      MPI_Comm comm;
      void    *ptr;
    } ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
      PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
      /* the first time a comm is dupped, register a function to free the dupped petsc comms at PetscFinalize() */
      if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If not, create one */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
  if (flg) PetscFunctionReturn(0);

  PetscCall(PetscNew(pshmcomm));
  (*pshmcomm)->globcomm = globcomm;

  PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));

  PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
  PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
  PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
  /* map the local ranks 0..shmsize-1 of shmcomm to their global ranks in globcomm */
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
  for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
  PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
  PetscCall(PetscFree(shmranks));
  PetscCallMPI(MPI_Group_free(&globgroup));
  PetscCallMPI(MPI_Group_free(&shmgroup));

  for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
  PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or `MPI_PROC_NULL` if it does not exist

    Level: developer

    Developer Notes:
    Assumes pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local ranks in the same function call

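    Example Usage:
    A minimal sketch, assuming shmcomm was obtained with `PetscShmCommGet()` and grank is a rank in its global communicator:
.vb
    PetscMPIInt lrank;

    PetscCall(PetscShmCommGlobalToLocal(shmcomm, grank, &lrank));
    if (lrank == MPI_PROC_NULL) { /* grank does not share memory with this process */ }
.ve
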
.seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank) {
  PetscMPIInt low, high, t, i;
  PetscBool   flg = PETSC_FALSE;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidIntPointer(lrank, 3);
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(0);
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
  if (flg) PetscFunctionReturn(0);
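  /* binary search on the sorted globranks[] narrows [low, high) to at most 5 entries, then the linear scan below finishes the lookup */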
  low  = 0;
  high = pshmcomm->shmsize;
  while (high - low > 5) {
    t = (low + high) / 2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i = low; i < high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator where the shared memory communicator is built

    Level: developer

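    Example Usage:
    A minimal sketch, assuming shmcomm was obtained with `PetscShmCommGet()`:
.vb
    PetscMPIInt grank;

    PetscCall(PetscShmCommLocalToGlobal(shmcomm, 0, &grank)); /* global rank of local rank 0 */
.ve
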
.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank) {
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidIntPointer(grank, 3);
  PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with `PetscShmCommGet()`

    Output Parameter:
.   comm     - the MPI communicator

    Level: developer

.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm) {
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidPointer(comm, 2);
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared(). They should have the same effect, except that MPI-3 is
   much simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's
   performance by 50%. Until the reason is found, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool          is_omp_master;   /* true on rank 0 of omp_comm */
  MPI_Win            omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;         /* pointer to the barrier */
  hwloc_topology_t   topology;
  hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
  hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by the processes
   in the controller's omp_comm.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
 */
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl) {
  MPI_Aint              size;
  void                 *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt  fd;
  PetscChar pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp(), since it is a collective call; using omp_comm would bcast the not-yet-populated pathname to slaves */
    PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
    PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
    PetscCall(ftruncate(fd, size));
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCall(close(fd));
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    /* this MPI_Barrier waits for slaves to open the file before the master unlinks it */
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
    PetscCall(unlink(pathname));
  } else {
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    fd = open(pathname, O_RDWR);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCall(close(fd));
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  }
#else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
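  /* only the master contributes memory to the shared window; slaves pass size 0 and query the master's segment below */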
  PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
  PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
#endif
  ctrl->barrier = (pthread_barrier_t *)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
    PetscCall(pthread_barrierattr_init(&attr));
    PetscCall(pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)); /* make the barrier also work for processes, not just threads */
    PetscCall(pthread_barrier_init(ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size));
    PetscCall(pthread_barrierattr_destroy(&attr));
  }

  /* this MPI_Barrier makes sure the omp barrier is initialized before slaves use it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl) {
  PetscFunctionBegin;
  /* this MPI_Barrier makes sure slaves have finished using the omp barrier before the master destroys it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  if (ctrl->is_omp_master) PetscCall(pthread_barrier_destroy(ctrl->barrier));

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscCall(munmap(ctrl->barrier, sizeof(pthread_barrier_t)));
#else
  PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
#endif
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

    Input Parameters:
+   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

    Output Parameter:
.   pctrl      - a PETSc OpenMP controller

    Level: developer

    Developer Note:
    Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use

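    Example Usage:
    A minimal sketch of a controller's life cycle; the actual OpenMP library call is elided (see `PetscOmpCtrlBarrier()` for the full calling pattern):
.vb
    PetscOmpCtrl ctrl;

    PetscCall(PetscOmpCtrlCreate(PETSC_COMM_WORLD, -1, &ctrl)); /* nthreads = -1: let PETSc choose */
    /* ... hand the controller's omp_master_comm to the OpenMP library ... */
    PetscCall(PetscOmpCtrlDestroy(&ctrl));
.ve
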
.seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl) {
  PetscOmpCtrl   ctrl;
  unsigned long *cpu_ulongs = NULL;
  PetscInt       i, nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
  PetscInt       num_packages, num_cores;

  PetscFunctionBegin;
  PetscCall(PetscNew(&ctrl));

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  PetscCall(hwloc_topology_init(&ctrl->topology));
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have a faster hwloc_topology_load */
  PetscCall(hwloc_topology_set_all_types_filter(ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE));
  PetscCall(hwloc_topology_set_type_filter(ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL));
#endif
  PetscCall(hwloc_topology_load(ctrl->topology));

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. OMP masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

  PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
  PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

  /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
    nthreads     = num_cores / num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " cannot be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
  if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load imbalance!\n", nthreads, shm_comm_size));

  /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
     shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that the rank ordering won't change in the new comm.
   */
  color = shm_rank / nthreads;
  PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

  /* put rank 0's of the omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
  PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE; /* master */
    color               = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE;   /* slave */
    color               = MPI_UNDEFINED; /* so that slaves get omp_master_comm = MPI_COMM_NULL from MPI_Comm_split */
  }
  PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

  /*=================================================================================
    The omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc();
  PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
  PetscCall(hwloc_get_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));

  /* hwloc's main developer said they would add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help simplify the following bitmap pack/unpack code */
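  /* compute how many unsigned longs are needed to hold all bits (0..hwloc_bitmap_last()) of the topology cpuset */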
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
  PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
  }

  PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc();
    PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      PetscCall(hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]));
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
#endif
    } else {
      for (i = 0; i < nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        PetscCall(hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]));
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
#endif
      }
    }
  }
  PetscCall(PetscFree(cpu_ulongs));
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

    Input Parameter:
.   pctrl  - a PETSc OpenMP controller

    Level: developer

.seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl) {
  PetscOmpCtrl ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
  PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
  }
  PetscCall(PetscFree(ctrl));
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Output Parameters:
+   omp_comm         - a communicator that includes a master rank and slave ranks where the master spawns threads
.   omp_master_comm  - on master ranks, a communicator that includes the master ranks of all omp_comms;
                       on slave ranks, `MPI_COMM_NULL` is returned
-   is_omp_master    - true if the calling process is an OMP master rank

    Note:
    Any output parameter can be NULL; it is then ignored.

    Level: developer

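    Example Usage:
    A minimal sketch, assuming ctrl was obtained with `PetscOmpCtrlCreate()`:
.vb
    MPI_Comm  omp_comm, omp_master_comm;
    PetscBool is_master;

    PetscCall(PetscOmpCtrlGetOmpComms(ctrl, &omp_comm, &omp_master_comm, &is_master));
    if (is_master) { /* omp_master_comm is valid here and can be given to the OpenMP library */ }
.ve
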
.seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master) {
  PetscFunctionBegin;
  if (omp_comm) *omp_comm = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm of the PETSc OMP controller (to let slave ranks free their CPUs)

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead would be conceptually correct, but the MPI standard does not
    require processes blocked by `MPI_Barrier()` to free their CPUs to let other processes progress. In practice, to minimize latency,
    MPI ranks stuck in `MPI_Barrier()` keep polling and do not free their CPUs. In contrast, processes waiting in a pthread barrier
    are put to sleep and do free their CPUs.

    Code using `PetscOmpCtrlBarrier()` looks like this:
.vb
    if (is_omp_master) {
      PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
      Call the library using OpenMP
      PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
    }
    PetscOmpCtrlBarrier(ctrl);
.ve

    Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl) {
  int err;

  PetscFunctionBegin;
  err = pthread_barrier_wait(ctrl->barrier);
  PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Note:
    Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
    This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime.

    Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl) {
  PetscFunctionBegin;
  PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS));
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Note:
   Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
   This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

   Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl) {
  PetscFunctionBegin;
  PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */