#include <petscsys.h> /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. MPI_Comm_free() invokes it when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_DeleteFn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
{
  PetscShmComm p = (PetscShmComm)val;

  PetscFunctionBegin;
  PetscCallMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in an MPI_Comm %ld\n", (long)comm));
  PetscCallMPI(MPI_Comm_free(&p->shmcomm));
  PetscCallMPI(PetscFree(p->globranks));
  PetscCallMPI(PetscFree(val));
  PetscFunctionReturn(MPI_SUCCESS);
}
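
/* For illustration only: a keyval with the above delete callback is registered along these
   lines. PETSc creates Petsc_ShmComm_keyval elsewhere (during initialization), so treat this
   as a sketch of the standard MPI registration pattern, not the exact PETSc call site:

     PetscMPIInt Petsc_ShmComm_keyval = MPI_KEYVAL_INVALID;
     PetscCallMPI(MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN, Petsc_ShmComm_Attr_DeleteFn, &Petsc_ShmComm_keyval, NULL));
*/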

#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  /* Data structures to support freeing comms created in PetscShmCommGet().
   Since communicators passed to PetscShmCommGet() are very likely either a petsc
   inner communicator or an MPI communicator with a linked petsc inner communicator,
   we use a simple static array to store the dupped communicators created in the
   rare remaining cases.
 */
  #define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt       num_dupped_comms = 0;
static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
{
  PetscInt i;

  PetscFunctionBegin;
  for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*@C
  PetscShmCommGet - Returns a sub-communicator of all ranks that share common memory

  Collective.

  Input Parameter:
. globcomm - `MPI_Comm`, which can be a user `MPI_Comm` or a PETSc inner `MPI_Comm`

  Output Parameter:
. pshmcomm - the PETSc shared memory communicator object

  Level: developer

  Note:
  When used with MPICH, MPICH must be configured with `--download-mpich-device=ch3:nemesis`

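  Example Usage:
  A minimal sketch (shown with `PETSC_COMM_WORLD`; any communicator works):
.vb
  PetscShmComm shmcomm;
  MPI_Comm     mpicomm;
  PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(shmcomm, &mpicomm)); /* the raw MPI shared memory communicator */
.ve
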
.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  MPI_Group         globgroup, shmgroup;
  PetscMPIInt      *shmranks, i, flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscAssertPointer(pshmcomm, 2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
  if (!flg) { /* globcomm is not a petsc comm */
    union
    {
      MPI_Comm comm;
      void    *ptr;
    } ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
      PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
      /* the first time through, register a finalizer to free the dupped petsc comms at PetscFinalize() */
      if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If not, create one */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
  if (flg) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(PetscNew(pshmcomm));
  (*pshmcomm)->globcomm = globcomm;

  PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));

  PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
  PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
  PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
  for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
  PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
  PetscCall(PetscFree(shmranks));
  PetscCallMPI(MPI_Group_free(&globgroup));
  PetscCallMPI(MPI_Group_free(&shmgroup));

  for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
  PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
  PetscFunctionReturn(PETSC_SUCCESS);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

/*@C
  PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

  Input Parameters:
+ pshmcomm - the shared memory communicator object
- grank    - the global rank

  Output Parameter:
. lrank - the local rank, or `MPI_PROC_NULL` if it does not exist

  Level: developer

  Developer Notes:
  Assumes pshmcomm->globranks[] is sorted

  It may be better to rewrite this to map multiple global ranks to local in the same function call

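  Example Usage:
  A minimal sketch of mapping a global rank and back (`pshmcomm` and `grank` are assumed to be set up already):
.vb
  PetscMPIInt lrank, grank2;
  PetscCall(PetscShmCommGlobalToLocal(pshmcomm, grank, &lrank));
  if (lrank != MPI_PROC_NULL) { /* grank lives on our shared memory node */
    PetscCall(PetscShmCommLocalToGlobal(pshmcomm, lrank, &grank2)); /* round trip: grank2 == grank */
  }
.ve
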
.seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
{
  PetscMPIInt low, high, t, i;
  PetscBool   flg = PETSC_FALSE;

  PetscFunctionBegin;
  PetscAssertPointer(pshmcomm, 1);
  PetscAssertPointer(lrank, 3);
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
  if (flg) PetscFunctionReturn(PETSC_SUCCESS);
  /* binary search on the sorted globranks[] down to a window of 5 entries, then finish with a linear scan */
  low  = 0;
  high = pshmcomm->shmsize;
  while (high - low > 5) {
    t = (low + high) / 2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i = low; i < high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

  Input Parameters:
+ pshmcomm - the shared memory communicator object
- lrank    - the local rank in the shared memory communicator

  Output Parameter:
. grank - the global rank in the global communicator where the shared memory communicator is built

  Level: developer

.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
{
  PetscFunctionBegin;
  PetscAssertPointer(pshmcomm, 1);
  PetscAssertPointer(grank, 3);
  PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

  Input Parameter:
. pshmcomm - `PetscShmComm` object obtained with `PetscShmCommGet()`

  Output Parameter:
. comm - the MPI communicator

  Level: developer

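  Example Usage:
  A minimal sketch; the returned communicator is owned by the `PetscShmComm` and must not be freed by the caller:
.vb
  MPI_Comm    shmcomm;
  PetscMPIInt noderank;
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shmcomm));
  PetscCallMPI(MPI_Comm_rank(shmcomm, &noderank)); /* rank within the shared memory node */
.ve
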
.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
{
  PetscFunctionBegin;
  PetscAssertPointer(pshmcomm, 1);
  PetscAssertPointer(comm, 2);
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
  #include <pthread.h>
  #include <hwloc.h>
  #include <omp.h>

  /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared(). They should have the same effect, except that MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is understood, we use mmap() instead.
*/
  #define USE_MMAP_ALLOCATE_SHARED_MEMORY

  #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
    #include <sys/mman.h>
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <fcntl.h>
  #endif

struct _n_PetscOmpCtrl {
  MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool          is_omp_master;   /* true on rank 0 of omp_comm */
  MPI_Win            omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;         /* pointer to the barrier */
  hwloc_topology_t   topology;
  hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
  hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in the omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
 */
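/* A summary sketch of the mmap() rendezvous performed below when USE_MMAP_ALLOCATE_SHARED_MEMORY
   is in effect (master = rank 0 of omp_comm):
     master: mkstemp() a unique temporary file, ftruncate() it to the barrier size, mmap() it,
             then broadcast the pathname on omp_comm and unlink() the file once everyone has it open;
     slaves: receive the pathname, open() and mmap() the same file;
     all:    an MPI_Barrier() on omp_comm orders the slaves' open()s before the master's unlink().
 */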
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  MPI_Aint              size;
  void                 *baseptr;
  pthread_barrierattr_t attr;

  #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  int       fd;
  PetscChar pathname[PETSC_MAX_PATH_LEN];
  #else
  PetscMPIInt disp_unit;
  #endif

  PetscFunctionBegin;
  #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
    PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
    PetscCallExternal(ftruncate, fd, size);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCallExternal(close, fd);
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    /* this MPI_Barrier waits for slaves to open the file before the master unlinks it */
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
    PetscCallExternal(unlink, pathname);
  } else {
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    fd = open(pathname, O_RDWR);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCallExternal(close, fd);
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  }
  #else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
  PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
  #endif
  ctrl->barrier = (pthread_barrier_t *)baseptr;

  /* the omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
    PetscCallExternal(pthread_barrierattr_init, &attr);
    PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
    PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
    PetscCallExternal(pthread_barrierattr_destroy, &attr);
  }

  /* this MPI_Barrier makes sure the omp barrier is initialized before slaves use it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* this MPI_Barrier makes sure slaves have finished using the omp barrier before the master destroys it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);

  #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
  #else
  PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
  #endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
    PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

    Input Parameters:
+   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

    Output Parameter:
.   pctrl      - a PETSc OpenMP controller

    Level: developer

    Developer Note:
    Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use

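    Example Usage:
    A minimal sketch of a controller's lifetime; the library call in the middle is hypothetical:
.vb
    PetscOmpCtrl ctrl;
    PetscBool    is_master;
    PetscCall(PetscOmpCtrlCreate(PETSC_COMM_WORLD, -1, &ctrl)); /* -1: let PETSc pick nthreads */
    PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, NULL, &is_master));
    if (is_master) {
      PetscCall(PetscOmpCtrlOmpRegionOnMasterBegin(ctrl));
      /* ... call a third party library that uses OpenMP ... */
      PetscCall(PetscOmpCtrlOmpRegionOnMasterEnd(ctrl));
    }
    PetscCall(PetscOmpCtrlBarrier(ctrl));
    PetscCall(PetscOmpCtrlDestroy(&ctrl));
.ve
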
.seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
{
  PetscOmpCtrl   ctrl;
  unsigned long *cpu_ulongs = NULL;
  PetscInt       i, nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
  PetscInt       num_packages, num_cores;

  PetscFunctionBegin;
  PetscCall(PetscNew(&ctrl));

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  PetscCallExternal(hwloc_topology_init, &ctrl->topology);
  #if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have a faster hwloc_topology_load */
  PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
  PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
  #endif
  PetscCallExternal(hwloc_topology_load, ctrl->topology);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. OMP masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

  PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
  PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

  /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
    nthreads     = num_cores / num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " cannot be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
  if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load imbalance!\n", nthreads, shm_comm_size));

  /* split shm_comm into a set of omp_comms, each of size nthreads. For example, if
     shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that the rank ordering does not change in the new comm.
   */
  color = shm_rank / nthreads;
  PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

  /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
  PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE; /* master */
    color               = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE;   /* slave */
    color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

  /*=================================================================================
    The omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc();
  PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
  PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);

  /* hwloc's main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  /* number of ulongs needed to hold the topology cpuset, i.e., ceil((index of last set bit + 1) / bits per ulong) */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
  PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
  }

  /* bitwise-OR the packed bindings onto the master; MPI_IN_PLACE keeps the master's own contribution */
  PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc();
    PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
  #if HWLOC_API_VERSION >= 0x00020000
      PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
  #else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
  #endif
    } else {
      for (i = 0; i < nr_cpu_ulongs; i++) {
  #if HWLOC_API_VERSION >= 0x00020000
        PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
  #else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
  #endif
      }
    }
  }
  PetscCall(PetscFree(cpu_ulongs));
  *pctrl = ctrl;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
    PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

    Input Parameter:
.   pctrl  - a PETSc OpenMP controller

    Level: developer

.seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscOmpCtrl ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
  PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
  }
  PetscCall(PetscFree(ctrl));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Output Parameters:
+   omp_comm         - a communicator that includes a master rank and slave ranks where the master spawns threads
.   omp_master_comm  - on master ranks, a communicator that includes the master ranks of all omp_comms;
                       on slave ranks, `MPI_COMM_NULL` is returned
-   is_omp_master    - true if the calling process is an OMP master rank

    Note:
    Any output parameter can be `NULL`; such a parameter is simply ignored.

    Level: developer

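    Example Usage:
    A minimal sketch (`ctrl` obtained from `PetscOmpCtrlCreate()`):
.vb
    MPI_Comm  omp_comm, master_comm;
    PetscBool is_master;
    PetscCall(PetscOmpCtrlGetOmpComms(ctrl, &omp_comm, &master_comm, &is_master));
    if (is_master) { /* master_comm is valid only here; pass it to the OpenMP-based library */ }
.ve
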
.seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm) *omp_comm = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
    PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    This is a pthread barrier over MPI ranks. Using `MPI_Barrier()` instead would be conceptually correct, but the MPI standard does not
    require processes blocked by `MPI_Barrier()` to free their CPUs to let other processes progress. In practice, to minimize latency,
    MPI ranks stuck in `MPI_Barrier()` keep polling and do not free their CPUs. In contrast, a pthread barrier does have this requirement.

    Code using `PetscOmpCtrlBarrier()` typically looks like this:
.vb
    if (is_omp_master) {
      PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
      Call the library using OpenMP
      PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
    }
    PetscOmpCtrlBarrier(ctrl);
.ve

    Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  int err;

  PetscFunctionBegin;
  err = pthread_barrier_wait(ctrl->barrier);
  PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
    PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Note:
    Only master ranks may call this function. Call `PetscOmpCtrlGetOmpComms()` to find out if this is a master rank.
    This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime.

    Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Note:
   Only master ranks may call this function. Call `PetscOmpCtrlGetOmpComms()` to find out if this is a master rank.
   This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

   Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
  omp_set_num_threads(1);
  PetscFunctionReturn(PETSC_SUCCESS);
}

  #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */