xref: /petsc/src/sys/utils/mpishm.c (revision e91c04dfc8a52dee1965211bb1cc8e5bf775178f)
1 #include <petscsys.h> /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
struct _n_PetscShmComm {
  PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator; globranks[lrank] gives the rank in globcomm; assumed sorted (see PetscShmCommGlobalToLocal()) */
  PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};
9 
/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

   The keyval and extra_state arguments are required by the MPI attribute delete-callback signature and are unused here.
*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_DeleteFn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
{
  PetscShmComm p = (PetscShmComm)val; /* the attribute value is the PetscShmComm stashed by PetscShmCommGet() */

  PetscFunctionBegin;
  PetscCallReturnMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm));
  PetscCallMPIReturnMPI(MPI_Comm_free(&p->shmcomm)); /* free the shared memory sub-communicator */
  PetscCallReturnMPI(PetscFree(p->globranks));       /* free the local-to-global rank map */
  PetscCallReturnMPI(PetscFree(val));                /* free the PetscShmComm struct itself */
  PetscFunctionReturn(MPI_SUCCESS);
}
29 
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  /* Data structures to support freeing comms created in PetscShmCommGet().
  Since we predict communicators passed to PetscShmCommGet() are very likely
  either a petsc inner communicator or an MPI communicator with a linked petsc
  inner communicator, we use a simple static array to store dupped communicators
  in the rare cases otherwise.
 */
  #define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt       num_dupped_comms = 0; /* number of entries currently stored in shmcomm_dupped_comms[] */
static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
/* Destroy all communicators dupped by PetscShmCommGet(); registered via PetscRegisterFinalize() so it runs at PetscFinalize() */
static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
{
  PetscFunctionBegin;
  for (PetscInt i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
48 
49 /*@C
50   PetscShmCommGet - Returns a sub-communicator of all ranks that share a common memory
51 
52   Collective.
53 
54   Input Parameter:
55 . globcomm - `MPI_Comm`, which can be a user `MPI_Comm` or a PETSc inner `MPI_Comm`
56 
57   Output Parameter:
58 . pshmcomm - the PETSc shared memory communicator object
59 
60   Level: developer
61 
62   Note:
63   When used with MPICH, MPICH must be configured with `--download-mpich-device=ch3:nemesis`
64 
65 .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
66 @*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  MPI_Group         globgroup, shmgroup;
  PetscMPIInt      *shmranks, i, flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscAssertPointer(pshmcomm, 2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
  if (!flg) { /* globcomm is not a petsc comm */
    union
    {
      MPI_Comm comm;
      void    *ptr;
    } ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
      PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
      /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
      if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If no, create one */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
  if (flg) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(PetscNew(pshmcomm));
  (*pshmcomm)->globcomm = globcomm;

  /* MPI-3: split globcomm into sub-communicators whose ranks can share memory */
  PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));

  PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
  PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
  PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
  /* build globranks[]: for each local rank i in shmcomm, its corresponding rank in globcomm */
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
  for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
  PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
  PetscCall(PetscFree(shmranks));
  PetscCallMPI(MPI_Group_free(&globgroup));
  PetscCallMPI(MPI_Group_free(&shmgroup));

  for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
  /* stash the new object on globcomm; Petsc_ShmComm_Attr_DeleteFn() frees it when globcomm is freed */
  PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
  PetscFunctionReturn(PETSC_SUCCESS);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
127 
128 /*@C
129   PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator
130 
131   Input Parameters:
132 + pshmcomm - the shared memory communicator object
133 - grank    - the global rank
134 
135   Output Parameter:
136 . lrank - the local rank, or `MPI_PROC_NULL` if it does not exist
137 
138   Level: developer
139 
140   Developer Notes:
141   Assumes the pshmcomm->globranks[] is sorted
142 
143   It may be better to rewrite this to map multiple global ranks to local in the same function call
144 
145 .seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
146 @*/
147 PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
148 {
149   PetscMPIInt low, high, t, i;
150   PetscBool   flg = PETSC_FALSE;
151 
152   PetscFunctionBegin;
153   PetscAssertPointer(pshmcomm, 1);
154   PetscAssertPointer(lrank, 3);
155   *lrank = MPI_PROC_NULL;
156   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
157   if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
158   PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
159   if (flg) PetscFunctionReturn(PETSC_SUCCESS);
160   low  = 0;
161   high = pshmcomm->shmsize;
162   while (high - low > 5) {
163     t = (low + high) / 2;
164     if (pshmcomm->globranks[t] > grank) high = t;
165     else low = t;
166   }
167   for (i = low; i < high; i++) {
168     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS);
169     if (pshmcomm->globranks[i] == grank) {
170       *lrank = i;
171       PetscFunctionReturn(PETSC_SUCCESS);
172     }
173   }
174   PetscFunctionReturn(PETSC_SUCCESS);
175 }
176 
177 /*@C
178   PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank
179 
180   Input Parameters:
181 + pshmcomm - the shared memory communicator object
182 - lrank    - the local rank in the shared memory communicator
183 
184   Output Parameter:
185 . grank - the global rank in the global communicator where the shared memory communicator is built
186 
187   Level: developer
188 
189 .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
190 @*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
{
  PetscFunctionBegin;
  PetscAssertPointer(pshmcomm, 1);
  PetscAssertPointer(grank, 3);
  /* lrank indexes directly into the local-to-global map built in PetscShmCommGet() */
  PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(PETSC_SUCCESS);
}
200 
201 /*@C
202   PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
203 
204   Input Parameter:
205 . pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
206 
207   Output Parameter:
208 . comm - the MPI communicator
209 
210   Level: developer
211 
212 .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
213 @*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
{
  PetscFunctionBegin;
  PetscAssertPointer(pshmcomm, 1);
  PetscAssertPointer(comm, 2);
  /* the returned communicator is owned by pshmcomm and freed by Petsc_ShmComm_Attr_DeleteFn(); the caller must not free it */
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(PETSC_SUCCESS);
}
222 
223 #if defined(PETSC_HAVE_OPENMP_SUPPORT)
224   #include <pthread.h>
225   #include <hwloc.h>
226   #include <omp.h>
227 
  /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
229    otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
230    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
231    by 50%. Until the reason is found out, we use mmap() instead.
232 */
233   #define USE_MMAP_ALLOCATE_SHARED_MEMORY
234 
235   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
236     #include <sys/mman.h>
237     #include <sys/types.h>
238     #include <sys/stat.h>
239     #include <fcntl.h>
240   #endif
241 
struct _n_PetscOmpCtrl {
  MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm           omp_master_comm; /* a communicator of the rank-0's of all omp_comms, to give to third party libraries */
  PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool          is_omp_master;   /* true on rank 0 of omp_comm (the "master"); other ranks are "slaves" */
  MPI_Win            omp_win;         /* a shared memory window containing a barrier (only used in the MPI_Win_allocate_shared path) */
  pthread_barrier_t *barrier;         /* pointer to the barrier, located in memory shared by the ranks of omp_comm */
  hwloc_topology_t   topology;        /* hwloc view of the hardware this process runs on */
  hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
  hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
};
253 
/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
 */
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  MPI_Aint              size;
  void                 *baseptr;
  pthread_barrierattr_t attr;

  #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  int       fd;
  PetscChar pathname[PETSC_MAX_PATH_LEN];
  #else
  PetscMPIInt disp_unit;
  #endif

  PetscFunctionBegin;
  #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
    PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
    /* grow the file to the barrier's size before mapping it */
    PetscCallExternal(ftruncate, fd, size);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCallExternal(close, fd);
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    /* this MPI_Barrier is to wait for slaves to open the file before master unlinks it */
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
    PetscCallExternal(unlink, pathname);
  } else {
    /* slaves receive the path from master and map the same file, so all ranks share the barrier */
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    fd = open(pathname, O_RDWR);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCallExternal(close, fd);
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  }
  #else
  /* MPI-3 path: only the master contributes memory; all ranks then query the master's base address */
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
  PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
  #endif
  ctrl->barrier = (pthread_barrier_t *)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
    PetscCallExternal(pthread_barrierattr_init, &attr);
    PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
    PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
    PetscCallExternal(pthread_barrierattr_destroy, &attr);
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
320 
/* Destroy the pthread barrier in the PETSc OpenMP controller */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);

  #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  /* every rank mapped the barrier in PetscOmpCtrlCreateBarrier(), so every rank unmaps it */
  PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
  #else
  PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
  #endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
336 
337 /*@C
338     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP
339 
340     Input Parameters:
341 +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
342 -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
343 
344     Output Parameter:
345 .   pctrl      - a PETSc OpenMP controller
346 
347     Level: developer
348 
349     Developer Note:
350     Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use
351 
352 .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
353 @*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
{
  PetscOmpCtrl   ctrl;
  unsigned long *cpu_ulongs = NULL; /* packed representation of the master's cpuset, reduced over omp_comm */
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color, nr_cpu_ulongs;
  PetscInt       num_packages, num_cores;

  PetscFunctionBegin;
  PetscCall(PetscNew(&ctrl));

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  PetscCallExternal(hwloc_topology_init, &ctrl->topology);
  #if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have faster hwloc_topology_load */
  PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
  PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
  #endif
  PetscCallExternal(hwloc_topology_load, ctrl->topology);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    others are called slaves. OMP Masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

  PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
  PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

  /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
  if (nthreads == -1) {
    /* clamp hwloc counts to at least 1 in case the topology query fails or returns 0 */
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
    nthreads     = num_cores / num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
  if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));

  /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
     shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as key so that rank ordering wont change in new comm.
   */
  color = shm_rank / nthreads;
  PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

  /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
  PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE; /* master */
    color               = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE;   /* slave */
    color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

  /*=================================================================================
    omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc();
  PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
  PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);

  /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
  PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (PetscInt i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
  }

  /* bitwise-OR the packed cpusets onto the master (rank 0 of omp_comm); master reduces in place */
  PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc();
    PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
  #if HWLOC_API_VERSION >= 0x00020000
      PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
  #else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
  #endif
    } else {
      for (PetscInt i = 0; i < nr_cpu_ulongs; i++) {
  #if HWLOC_API_VERSION >= 0x00020000
        PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
  #else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
  #endif
      }
    }
  }
  PetscCall(PetscFree(cpu_ulongs));
  *pctrl = ctrl;
  PetscFunctionReturn(PETSC_SUCCESS);
}
472 
473 /*@C
474     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
475 
476     Input Parameter:
477 .   pctrl  - a PETSc OpenMP controller
478 
479     Level: developer
480 
481 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
482 @*/
483 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
484 {
485   PetscOmpCtrl ctrl = *pctrl;
486 
487   PetscFunctionBegin;
488   hwloc_bitmap_free(ctrl->cpuset);
489   hwloc_topology_destroy(ctrl->topology);
490   PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
491   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
492   if (ctrl->is_omp_master) {
493     hwloc_bitmap_free(ctrl->omp_cpuset);
494     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
495   }
496   PetscCall(PetscFree(ctrl));
497   PetscFunctionReturn(PETSC_SUCCESS);
498 }
499 
500 /*@C
501     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
502 
503     Input Parameter:
504 .   ctrl - a PETSc OMP controller
505 
506     Output Parameters:
507 +   omp_comm         - a communicator that includes a master rank and slave ranks where master spawns threads
508 .   omp_master_comm  - on master ranks, return a communicator that include master ranks of each omp_comm;
                       on slave ranks, `MPI_COMM_NULL` will be returned.
510 -   is_omp_master    - true if the calling process is an OMP master rank.
511 
512     Note:
513     Any output parameter can be `NULL`. The parameter is just ignored.
514 
515     Level: developer
516 
517 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
518 @*/
519 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
520 {
521   PetscFunctionBegin;
522   if (omp_comm) *omp_comm = ctrl->omp_comm;
523   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
524   if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
525   PetscFunctionReturn(PETSC_SUCCESS);
526 }
527 
528 /*@C
529     PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)
530 
531     Input Parameter:
532 .   ctrl - a PETSc OMP controller
533 
534     Notes:
535     this is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not
    require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minimize latency,
537     MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
538 
539     A code using `PetscOmpCtrlBarrier()` would be like this,
540 .vb
541     if (is_omp_master) {
542       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
543       Call the library using OpenMP
544       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
545     }
546     PetscOmpCtrlBarrier(ctrl);
547 .ve
548 
549     Level: developer
550 
551 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
552 @*/
553 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
554 {
555   int err;
556 
557   PetscFunctionBegin;
558   err = pthread_barrier_wait(ctrl->barrier);
559   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
560   PetscFunctionReturn(PETSC_SUCCESS);
561 }
562 
563 /*@C
564     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
565 
566     Input Parameter:
567 .   ctrl - a PETSc OMP controller
568 
569     Note:
570     Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
571     This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime
572 
573     Level: developer
574 
575 .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
576 @*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* rebind the master to the union of cpus of all ranks in its omp_comm, so spawned OMP threads can run on the slaves' idle cpus */
  PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
  PetscFunctionReturn(PETSC_SUCCESS);
}
584 
585 /*@C
586    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
587 
588    Input Parameter:
589 .  ctrl - a PETSc OMP controller
590 
591    Note:
592    Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
593    This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.
594 
595    Level: developer
596 
597 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
598 @*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* restore the master's original cpu binding (saved in PetscOmpCtrlCreate()) and return the OpenMP runtime to single-threaded */
  PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
  omp_set_num_threads(1);
  PetscFunctionReturn(PETSC_SUCCESS);
}
606 
607   #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
608 #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */
609