xref: /petsc/src/sys/utils/mpishm.c (revision df4cd43f92eaa320656440c40edb1046daee8f75)
1 #include <petscsys.h> /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
4 struct _n_PetscShmComm {
5   PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
6   PetscMPIInt  shmsize;           /* size of the shared memory communicator */
7   MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
8 };
9 
10 /*
11    Private routine to delete internal shared memory communicator when a communicator is freed.
12 
13    This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this  data as an attribute is freed.
14 
15    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
16 
17 */
18 PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
19 {
20   PetscShmComm p = (PetscShmComm)val;
21 
22   PetscFunctionBegin;
23   PetscCallMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm));
24   PetscCallMPI(MPI_Comm_free(&p->shmcomm));
25   PetscCallMPI(PetscFree(p->globranks));
26   PetscCallMPI(PetscFree(val));
27   PetscFunctionReturn(MPI_SUCCESS);
28 }
29 
30 #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
31   /* Data structures to support freeing comms created in PetscShmCommGet().
32   Since we predict communicators passed to PetscShmCommGet() are very likely
33   either a petsc inner communicator or an MPI communicator with a linked petsc
34   inner communicator, we use a simple static array to store dupped communicators
35   on rare cases otherwise.
36  */
37   #define MAX_SHMCOMM_DUPPED_COMMS 16
38 static PetscInt       num_dupped_comms = 0;
39 static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
40 static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
41 {
42   PetscInt i;
43   PetscFunctionBegin;
44   for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
45   num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
46   PetscFunctionReturn(PETSC_SUCCESS);
47 }
48 #endif
49 
50 /*@C
51     PetscShmCommGet - Returns a sub-communicator of all ranks that share a common memory
52 
53     Collective.
54 
55     Input Parameter:
56 .   globcomm - `MPI_Comm`, which can be a user MPI_Comm or a PETSc inner MPI_Comm
57 
58     Output Parameter:
59 .   pshmcomm - the PETSc shared memory communicator object
60 
61     Level: developer
62 
63     Note:
64        When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
65 
66 .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
67 @*/
68 PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
69 {
70 #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
71   MPI_Group         globgroup, shmgroup;
72   PetscMPIInt      *shmranks, i, flg;
73   PetscCommCounter *counter;
74 
75   PetscFunctionBegin;
76   PetscValidPointer(pshmcomm, 2);
77   /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
78   PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
79   if (!flg) { /* globcomm is not a petsc comm */
80     union
81     {
82       MPI_Comm comm;
83       void    *ptr;
84     } ucomm;
85     /* check if globcomm already has a linked petsc inner comm */
86     PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
87     if (!flg) {
88       /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
89       PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
90       PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
91       /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
92       if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
93       shmcomm_dupped_comms[num_dupped_comms] = globcomm;
94       num_dupped_comms++;
95     } else {
96       /* otherwise, we pull out the inner comm and use it as globcomm */
97       globcomm = ucomm.comm;
98     }
99   }
100 
101   /* Check if globcomm already has an attached pshmcomm. If no, create one */
102   PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
103   if (flg) PetscFunctionReturn(PETSC_SUCCESS);
104 
105   PetscCall(PetscNew(pshmcomm));
106   (*pshmcomm)->globcomm = globcomm;
107 
108   PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));
109 
110   PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
111   PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
112   PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
113   PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
114   PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
115   for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
116   PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
117   PetscCall(PetscFree(shmranks));
118   PetscCallMPI(MPI_Group_free(&globgroup));
119   PetscCallMPI(MPI_Group_free(&shmgroup));
120 
121   for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
122   PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
123   PetscFunctionReturn(PETSC_SUCCESS);
124 #else
125   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
126 #endif
127 }
128 
129 /*@C
130     PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator
131 
132     Input Parameters:
133 +   pshmcomm - the shared memory communicator object
134 -   grank    - the global rank
135 
136     Output Parameter:
137 .   lrank - the local rank, or `MPI_PROC_NULL` if it does not exist
138 
139     Level: developer
140 
141     Developer Notes:
142     Assumes the pshmcomm->globranks[] is sorted
143 
144     It may be better to rewrite this to map multiple global ranks to local in the same function call
145 
146 .seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
147 @*/
148 PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
149 {
150   PetscMPIInt low, high, t, i;
151   PetscBool   flg = PETSC_FALSE;
152 
153   PetscFunctionBegin;
154   PetscValidPointer(pshmcomm, 1);
155   PetscValidIntPointer(lrank, 3);
156   *lrank = MPI_PROC_NULL;
157   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
158   if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
159   PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
160   if (flg) PetscFunctionReturn(PETSC_SUCCESS);
161   low  = 0;
162   high = pshmcomm->shmsize;
163   while (high - low > 5) {
164     t = (low + high) / 2;
165     if (pshmcomm->globranks[t] > grank) high = t;
166     else low = t;
167   }
168   for (i = low; i < high; i++) {
169     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS);
170     if (pshmcomm->globranks[i] == grank) {
171       *lrank = i;
172       PetscFunctionReturn(PETSC_SUCCESS);
173     }
174   }
175   PetscFunctionReturn(PETSC_SUCCESS);
176 }
177 
178 /*@C
179     PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank
180 
181     Input Parameters:
182 +   pshmcomm - the shared memory communicator object
183 -   lrank    - the local rank in the shared memory communicator
184 
185     Output Parameter:
186 .   grank - the global rank in the global communicator where the shared memory communicator is built
187 
188     Level: developer
189 
190 .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
191 @*/
192 PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
193 {
194   PetscFunctionBegin;
195   PetscValidPointer(pshmcomm, 1);
196   PetscValidIntPointer(grank, 3);
197   PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
198   *grank = pshmcomm->globranks[lrank];
199   PetscFunctionReturn(PETSC_SUCCESS);
200 }
201 
202 /*@C
203     PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
204 
205     Input Parameter:
206 .   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
207 
208     Output Parameter:
209 .   comm     - the MPI communicator
210 
211     Level: developer
212 
213 .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
214 @*/
215 PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
216 {
217   PetscFunctionBegin;
218   PetscValidPointer(pshmcomm, 1);
219   PetscValidPointer(comm, 2);
220   *comm = pshmcomm->shmcomm;
221   PetscFunctionReturn(PETSC_SUCCESS);
222 }
223 
224 #if defined(PETSC_HAVE_OPENMP_SUPPORT)
225   #include <pthread.h>
226   #include <hwloc.h>
227   #include <omp.h>
228 
229   /* Use mmap() to allocate shared mmeory (for the pthread_barrier_t object) if it is available,
230    otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
231    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
232    by 50%. Until the reason is found out, we use mmap() instead.
233 */
234   #define USE_MMAP_ALLOCATE_SHARED_MEMORY
235 
236   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
237     #include <sys/mman.h>
238     #include <sys/types.h>
239     #include <sys/stat.h>
240     #include <fcntl.h>
241   #endif
242 
243 struct _n_PetscOmpCtrl {
244   MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
245   MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
246   PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
247   PetscBool          is_omp_master;   /* rank 0's in omp_comm */
248   MPI_Win            omp_win;         /* a shared memory window containing a barrier */
249   pthread_barrier_t *barrier;         /* pointer to the barrier */
250   hwloc_topology_t   topology;
251   hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
252   hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
253 };
254 
255 /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
256    contained by the controller.
257 
258    PETSc OpenMP controller users do not call this function directly. This function exists
259    only because we want to separate shared memory allocation methods from other code.
260  */
261 static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
262 {
263   MPI_Aint              size;
264   void                 *baseptr;
265   pthread_barrierattr_t attr;
266 
267   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
268   PetscInt  fd;
269   PetscChar pathname[PETSC_MAX_PATH_LEN];
270   #else
271   PetscMPIInt disp_unit;
272   #endif
273 
274   PetscFunctionBegin;
275   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
276   size = sizeof(pthread_barrier_t);
277   if (ctrl->is_omp_master) {
278     /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
279     PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
280     PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
281     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
282     fd = mkstemp(pathname);
283     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
284     PetscCall(ftruncate(fd, size));
285     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
286     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
287     PetscCall(close(fd));
288     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
289     /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
290     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
291     PetscCall(unlink(pathname));
292   } else {
293     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
294     fd = open(pathname, O_RDWR);
295     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
296     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
297     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
298     PetscCall(close(fd));
299     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
300   }
301   #else
302   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
303   PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
304   PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
305   #endif
306   ctrl->barrier = (pthread_barrier_t *)baseptr;
307 
308   /* omp master initializes the barrier */
309   if (ctrl->is_omp_master) {
310     PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
311     PetscCall(pthread_barrierattr_init(&attr));
312     PetscCall(pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)); /* make the barrier also work for processes */
313     PetscCall(pthread_barrier_init(ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size));
314     PetscCall(pthread_barrierattr_destroy(&attr));
315   }
316 
317   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
318   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
319   PetscFunctionReturn(PETSC_SUCCESS);
320 }
321 
322 /* Destroy the pthread barrier in the PETSc OpenMP controller */
323 static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
324 {
325   PetscFunctionBegin;
326   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
327   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
328   if (ctrl->is_omp_master) PetscCall(pthread_barrier_destroy(ctrl->barrier));
329 
330   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
331   PetscCall(munmap(ctrl->barrier, sizeof(pthread_barrier_t)));
332   #else
333   PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
334   #endif
335   PetscFunctionReturn(PETSC_SUCCESS);
336 }
337 
338 /*@C
339     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP
340 
341     Input Parameters:
342 +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
343 -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
344 
345     Output Parameter:
346 .   pctrl      - a PETSc OpenMP controller
347 
348     Level: developer
349 
350     Developer Note:
351     Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use
352 
353 .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
354 @*/
355 PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
356 {
357   PetscOmpCtrl   ctrl;
358   unsigned long *cpu_ulongs = NULL;
359   PetscInt       i, nr_cpu_ulongs;
360   PetscShmComm   pshmcomm;
361   MPI_Comm       shm_comm;
362   PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
363   PetscInt       num_packages, num_cores;
364 
365   PetscFunctionBegin;
366   PetscCall(PetscNew(&ctrl));
367 
368   /*=================================================================================
369     Init hwloc
370    ==================================================================================*/
371   PetscCall(hwloc_topology_init(&ctrl->topology));
372   #if HWLOC_API_VERSION >= 0x00020000
373   /* to filter out unneeded info and have faster hwloc_topology_load */
374   PetscCall(hwloc_topology_set_all_types_filter(ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE));
375   PetscCall(hwloc_topology_set_type_filter(ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL));
376   #endif
377   PetscCall(hwloc_topology_load(ctrl->topology));
378 
379   /*=================================================================================
380     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
381     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
382     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
383     which is usually passed to third party libraries.
384    ==================================================================================*/
385 
386   /* fetch the stored shared memory communicator */
387   PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
388   PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));
389 
390   PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
391   PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));
392 
393   /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
394   if (nthreads == -1) {
395     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
396     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
397     nthreads     = num_cores / num_packages;
398     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
399   }
400 
401   PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
402   if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));
403 
404   /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
405      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
406      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
407      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
408      Use 0 as key so that rank ordering wont change in new comm.
409    */
410   color = shm_rank / nthreads;
411   PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));
412 
413   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
414   PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
415   if (!omp_rank) {
416     ctrl->is_omp_master = PETSC_TRUE; /* master */
417     color               = 0;
418   } else {
419     ctrl->is_omp_master = PETSC_FALSE;   /* slave */
420     color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
421   }
422   PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));
423 
424   /*=================================================================================
425     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
426     slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
427     and run them on the idle CPUs.
428    ==================================================================================*/
429   PetscCall(PetscOmpCtrlCreateBarrier(ctrl));
430 
431   /*=================================================================================
432     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
433     is the union of the bindings of all ranks in the omp_comm
434     =================================================================================*/
435 
436   ctrl->cpuset = hwloc_bitmap_alloc();
437   PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
438   PetscCall(hwloc_get_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));
439 
440   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
441   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
442   PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
443   if (nr_cpu_ulongs == 1) {
444     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
445   } else {
446     for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
447   }
448 
449   PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));
450 
451   if (ctrl->is_omp_master) {
452     ctrl->omp_cpuset = hwloc_bitmap_alloc();
453     PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
454     if (nr_cpu_ulongs == 1) {
455   #if HWLOC_API_VERSION >= 0x00020000
456       PetscCall(hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]));
457   #else
458       hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
459   #endif
460     } else {
461       for (i = 0; i < nr_cpu_ulongs; i++) {
462   #if HWLOC_API_VERSION >= 0x00020000
463         PetscCall(hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]));
464   #else
465         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
466   #endif
467       }
468     }
469   }
470   PetscCall(PetscFree(cpu_ulongs));
471   *pctrl = ctrl;
472   PetscFunctionReturn(PETSC_SUCCESS);
473 }
474 
475 /*@C
476     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
477 
478     Input Parameter:
479 .   pctrl  - a PETSc OpenMP controller
480 
481     Level: developer
482 
483 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
484 @*/
485 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
486 {
487   PetscOmpCtrl ctrl = *pctrl;
488 
489   PetscFunctionBegin;
490   hwloc_bitmap_free(ctrl->cpuset);
491   hwloc_topology_destroy(ctrl->topology);
492   PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
493   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
494   if (ctrl->is_omp_master) {
495     hwloc_bitmap_free(ctrl->omp_cpuset);
496     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
497   }
498   PetscCall(PetscFree(ctrl));
499   PetscFunctionReturn(PETSC_SUCCESS);
500 }
501 
502 /*@C
503     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
504 
505     Input Parameter:
506 .   ctrl - a PETSc OMP controller
507 
508     Output Parameters:
509 +   omp_comm         - a communicator that includes a master rank and slave ranks where master spawns threads
510 .   omp_master_comm  - on master ranks, return a communicator that include master ranks of each omp_comm;
511                        on slave ranks, `MPI_COMM_NULL` will be return in reality.
512 -   is_omp_master    - true if the calling process is an OMP master rank.
513 
514     Note:
515     Any output parameter can be NULL. The parameter is just ignored.
516 
517     Level: developer
518 
519 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
520 @*/
521 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
522 {
523   PetscFunctionBegin;
524   if (omp_comm) *omp_comm = ctrl->omp_comm;
525   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
526   if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
527   PetscFunctionReturn(PETSC_SUCCESS);
528 }
529 
530 /*@C
531     PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)
532 
533     Input Parameter:
534 .   ctrl - a PETSc OMP controller
535 
536     Notes:
537     this is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not
538     require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minilize latency,
539     MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
540 
541     A code using `PetscOmpCtrlBarrier()` would be like this,
542 .vb
543     if (is_omp_master) {
544       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
545       Call the library using OpenMP
546       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
547     }
548     PetscOmpCtrlBarrier(ctrl);
549 .ve
550 
551     Level: developer
552 
553 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
554 @*/
555 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
556 {
557   int err;
558 
559   PetscFunctionBegin;
560   err = pthread_barrier_wait(ctrl->barrier);
561   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
562   PetscFunctionReturn(PETSC_SUCCESS);
563 }
564 
565 /*@C
566     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
567 
568     Input Parameter:
569 .   ctrl - a PETSc OMP controller
570 
571     Note:
572     Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
573     This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime
574 
575     Level: developer
576 
577 .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
578 @*/
579 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
580 {
581   PetscFunctionBegin;
582   PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS));
583   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
584   PetscFunctionReturn(PETSC_SUCCESS);
585 }
586 
587 /*@C
588    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
589 
590    Input Parameter:
591 .  ctrl - a PETSc OMP controller
592 
593    Note:
594    Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
595    This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.
596 
597    Level: developer
598 
599 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
600 @*/
601 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
602 {
603   PetscFunctionBegin;
604   PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));
605   omp_set_num_threads(1);
606   PetscFunctionReturn(PETSC_SUCCESS);
607 }
608 
609   #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
610 #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */
611