xref: /petsc/src/sys/utils/mpishm.c (revision 9371c9d470a9602b6d10a8bf50c9b2280a79e45a)
1 #include <petscsys.h> /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
/* Implementation of PetscShmComm: records, for the processes that share physical memory
   with this one, the mapping between their ranks in the global communicator and their
   ranks in the shared-memory sub-communicator. */
struct _n_PetscShmComm {
  PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator; assumed sorted ascending (see PetscShmCommGlobalToLocal()) */
  PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};
9 
/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. MPI_Comm_free() invokes it when the communicator
   carrying this data as an attribute (set in PetscShmCommGet()) is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state) {
  PetscShmComm p = (PetscShmComm)val; /* the attribute value is the PetscShmComm stashed by PetscShmCommGet() */

  PetscFunctionBegin;
  /* NOTE(review): PetscInfo()/PetscFree() return PetscErrorCode, not an MPI error code;
     wrapping them in PetscCallMPI() works only because both success codes are 0 -- confirm intended */
  PetscCallMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm));
  PetscCallMPI(MPI_Comm_free(&p->shmcomm));
  PetscCallMPI(PetscFree(p->globranks));
  PetscCallMPI(PetscFree(val));
  PetscFunctionReturn(MPI_SUCCESS); /* MPI callbacks must return an MPI error class */
}
28 
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
/* Data structures to support freeing comms created in PetscShmCommGet().
   Since we predict communicators passed to PetscShmCommGet() are very likely
   either a petsc inner communicator or an MPI communicator with a linked petsc
   inner communicator, we use a simple static array to store dupped communicators
   in the rare remaining cases.
 */
#define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt       num_dupped_comms = 0; /* number of entries currently stored in shmcomm_dupped_comms[] */
static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
/* Free every communicator dupped by PetscShmCommGet(); registered with PetscRegisterFinalize()
   the first time a comm is dupped, so the array is emptied at PetscFinalize(). */
static PetscErrorCode PetscShmCommDestroyDuppedComms(void) {
  PetscInt i;
  PetscFunctionBegin;
  for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(0);
}
#endif
47 
/*@C
    PetscShmCommGet - Given a communicator returns a sub-communicator of all ranks that share a common memory

    Collective.

    Input Parameter:
.   globcomm - MPI_Comm, which can be a user MPI_Comm or a PETSc inner MPI_Comm

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Notes:
       When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

       The result is cached as an attribute on the PETSc inner communicator, so repeated
       calls with the same communicator return the cached object.

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm) {
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  MPI_Group         globgroup, shmgroup;
  PetscMPIInt      *shmranks, i, flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
  if (!flg) { /* globcomm is not a petsc comm */
    union
    {
      MPI_Comm comm;
      void    *ptr;
    } ucomm; /* union because MPI_Comm_get_attr() returns the attribute through a void* */
    /* check if globcomm already has a linked petsc inner comm */
    PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
      PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
      /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
      if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If no, create one */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
  if (flg) PetscFunctionReturn(0);

  PetscCall(PetscNew(pshmcomm));
  (*pshmcomm)->globcomm = globcomm;

  /* carve out the sub-communicator of ranks that can share memory with this one */
  PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));

  PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
  PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
  PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
  /* translate local ranks 0..shmsize-1 into their corresponding ranks in globcomm */
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
  for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
  PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
  PetscCall(PetscFree(shmranks));
  PetscCallMPI(MPI_Group_free(&globgroup));
  PetscCallMPI(MPI_Group_free(&shmgroup));

  for (i = 0; i < (*pshmcomm)->shmsize; i++) { PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i])); }
  /* cache the object on globcomm; Petsc_ShmComm_Attr_Delete_Fn() frees it when globcomm is freed */
  PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
124 
125 /*@C
126     PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator
127 
128     Input Parameters:
129 +   pshmcomm - the shared memory communicator object
130 -   grank    - the global rank
131 
132     Output Parameter:
133 .   lrank - the local rank, or MPI_PROC_NULL if it does not exist
134 
135     Level: developer
136 
137     Developer Notes:
138     Assumes the pshmcomm->globranks[] is sorted
139 
140     It may be better to rewrite this to map multiple global ranks to local in the same function call
141 
142 @*/
143 PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank) {
144   PetscMPIInt low, high, t, i;
145   PetscBool   flg = PETSC_FALSE;
146 
147   PetscFunctionBegin;
148   PetscValidPointer(pshmcomm, 1);
149   PetscValidIntPointer(lrank, 3);
150   *lrank = MPI_PROC_NULL;
151   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
152   if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(0);
153   PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
154   if (flg) PetscFunctionReturn(0);
155   low  = 0;
156   high = pshmcomm->shmsize;
157   while (high - low > 5) {
158     t = (low + high) / 2;
159     if (pshmcomm->globranks[t] > grank) high = t;
160     else low = t;
161   }
162   for (i = low; i < high; i++) {
163     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
164     if (pshmcomm->globranks[i] == grank) {
165       *lrank = i;
166       PetscFunctionReturn(0);
167     }
168   }
169   PetscFunctionReturn(0);
170 }
171 
/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator where the shared memory communicator is built

    Level: developer

@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank) {
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidIntPointer(grank, 3);
  /* lrank indexes directly into the translation table built by PetscShmCommGet() */
  PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}
193 
/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm     - the MPI communicator

    Level: developer

    Note:
    The communicator is owned by pshmcomm; the caller must not free it.

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm) {
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidPointer(comm, 2);
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}
213 
214 #if defined(PETSC_HAVE_OPENMP_SUPPORT)
215 #include <pthread.h>
216 #include <hwloc.h>
217 #include <omp.h>
/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found out, we use mmap() instead.
*/
224 #define USE_MMAP_ALLOCATE_SHARED_MEMORY
225 
226 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
227 #include <sys/mman.h>
228 #include <sys/types.h>
229 #include <sys/stat.h>
230 #include <fcntl.h>
231 #endif
232 
/* Implementation of PetscOmpCtrl: partitions MPI ranks on a shared-memory node into
   groups (omp_comms); rank 0 of each group (the "master") spawns OpenMP threads on the
   CPUs of its idle groupmates while they sleep on a process-shared pthread barrier. */
struct _n_PetscOmpCtrl {
  MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool          is_omp_master;   /* rank 0's in omp_comm */
  MPI_Win            omp_win;         /* a shared memory window containing a barrier; only used when allocating via MPI-3 instead of mmap() */
  pthread_barrier_t *barrier;         /* pointer to the barrier, which lives in memory shared by all ranks of omp_comm */
  hwloc_topology_t   topology;        /* hardware topology loaded at creation */
  hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
  hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
};
244 
/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
 */
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl) {
  MPI_Aint              size;
  void                 *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt  fd; /* NOTE(review): holds a POSIX file descriptor; plain int would be conventional -- confirm PetscInt is intentional */
  PetscChar pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
    PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
    /* NOTE(review): ftruncate()/close() return -1 and set errno on failure; PetscCall() treats
       any nonzero return as a PETSc error code, so failures are caught but misreported */
    PetscCall(ftruncate(fd, size));
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCall(close(fd));
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
    PetscCall(unlink(pathname));
  } else {
    /* slaves receive the file name from the master, map the same file, then signal via the barrier that it may be unlinked */
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    fd = open(pathname, O_RDWR);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCall(close(fd));
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  }
#else
  /* MPI-3 path: only the master contributes memory; everyone queries the master's base address */
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
  PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
#endif
  ctrl->barrier = (pthread_barrier_t *)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
    PetscCall(pthread_barrierattr_init(&attr));
    PetscCall(pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)); /* make the barrier also work for processes */
    PetscCall(pthread_barrier_init(ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size));
    PetscCall(pthread_barrierattr_destroy(&attr));
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  PetscFunctionReturn(0);
}
310 
/* Destroy the pthread barrier in the PETSc OpenMP controller and release the shared
   memory that backs it (mmap'ed region or MPI-3 window, matching PetscOmpCtrlCreateBarrier()). */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl) {
  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  if (ctrl->is_omp_master) PetscCall(pthread_barrier_destroy(ctrl->barrier));

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  /* every rank unmaps its own mapping; NOTE(review): munmap() returns -1/errno, which PetscCall() treats as a generic PETSc error code */
  PetscCall(munmap(ctrl->barrier, sizeof(pthread_barrier_t)));
#else
  PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
#endif
  PetscFunctionReturn(0);
}
325 
/*@C
    PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries using OpenMP

    Input Parameters:
+   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

    Output Parameter:
.   pctrl      - a PETSc OpenMP controller

    Level: developer

    TODO: Possibly use the variable PetscNumOMPThreads to determine the number for threads to use

.seealso `PetscOmpCtrlDestroy()`
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl) {
  PetscOmpCtrl   ctrl;
  unsigned long *cpu_ulongs = NULL;
  PetscInt       i, nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
  PetscInt       num_packages, num_cores;

  PetscFunctionBegin;
  PetscCall(PetscNew(&ctrl));

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  PetscCall(hwloc_topology_init(&ctrl->topology));
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have faster hwloc_topology_load */
  PetscCall(hwloc_topology_set_all_types_filter(ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE));
  PetscCall(hwloc_topology_set_type_filter(ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL));
#endif
  PetscCall(hwloc_topology_load(ctrl->topology));

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    others are called slaves. OMP Masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

  PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
  PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

  /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
  if (nthreads == -1) {
    /* hwloc_get_nbobjs_by_type() may return 0 or -1 when the object type is unknown; fall back to 1 */
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
    nthreads     = num_cores / num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
  if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));

  /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
     shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as key so that rank ordering won't change in new comm.
   */
  color = shm_rank / nthreads;
  PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

  /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
  PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE; /* master */
    color               = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE;   /* slave */
    color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

  /*=================================================================================
    omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc();
  PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
  PetscCall(hwloc_get_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));

  /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  /* number of unsigned longs needed to hold all bits of the topology-wide cpuset */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
  PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
  }

  /* bitwise-OR all ranks' packed cpusets onto the master (rank 0 of omp_comm) */
  PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc();
    PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
/* hwloc-2.x bitmap setters return an error code; hwloc-1.x ones return void, so they cannot be wrapped */
#if HWLOC_API_VERSION >= 0x00020000
      PetscCall(hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]));
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
#endif
    } else {
      for (i = 0; i < nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        PetscCall(hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]));
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
#endif
      }
    }
  }

  PetscCall(PetscFree(cpu_ulongs));
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}
461 
462 /*@C
463     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
464 
465     Input Parameter:
466 .   pctrl  - a PETSc OpenMP controller
467 
468     Level: developer
469 
470 .seealso `PetscOmpCtrlCreate()`
471 @*/
472 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl) {
473   PetscOmpCtrl ctrl = *pctrl;
474 
475   PetscFunctionBegin;
476   hwloc_bitmap_free(ctrl->cpuset);
477   hwloc_topology_destroy(ctrl->topology);
478   PetscOmpCtrlDestroyBarrier(ctrl);
479   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
480   if (ctrl->is_omp_master) {
481     hwloc_bitmap_free(ctrl->omp_cpuset);
482     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
483   }
484   PetscCall(PetscFree(ctrl));
485   PetscFunctionReturn(0);
486 }
487 
/*@C
    PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Output Parameters:
+   omp_comm         - a communicator that includes a master rank and slave ranks where master spawns threads
.   omp_master_comm  - on master ranks, a communicator that includes the master ranks of each omp_comm;
                       on slave ranks, MPI_COMM_NULL is returned
-   is_omp_master    - true if the calling process is an OMP master rank

    Notes: any output parameter can be NULL, in which case it is simply ignored.

    Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master) {
  PetscFunctionBegin;
  if (omp_comm) *omp_comm = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}
511 
512 /*@C
513     PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)
514 
515     Input Parameter:
516 .   ctrl - a PETSc OMP controller
517 
518     Notes:
519     this is a pthread barrier on MPI processes. Using MPI_Barrier instead is conceptually correct. But MPI standard does not
520     require processes blocked by MPI_Barrier free their CPUs to let other processes progress. In practice, to minilize latency,
521     MPI processes stuck in MPI_Barrier keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
522 
523     A code using PetscOmpCtrlBarrier() would be like this,
524 
525     if (is_omp_master) {
526       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
527       Call the library using OpenMP
528       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
529     }
530     PetscOmpCtrlBarrier(ctrl);
531 
532     Level: developer
533 
534 .seealso `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
535 @*/
536 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl) {
537   int err;
538 
539   PetscFunctionBegin;
540   err = pthread_barrier_wait(ctrl->barrier);
541   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %" PetscInt_FMT, err);
542   PetscFunctionReturn(0);
543 }
544 
/*@C
    PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
    This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime

    Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl) {
  PetscFunctionBegin;
  /* widen the master's CPU binding to the union of its omp_comm's bindings so spawned threads may run on the slaves' CPUs */
  PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS));
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
  PetscFunctionReturn(0);
}
565 
/*@C
   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Notes:
   Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
   This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

   Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl) {
  PetscFunctionBegin;
  /* restore the binding saved at controller creation, undoing PetscOmpCtrlOmpRegionOnMasterBegin() */
  PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}
586 
587 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
588 #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */
589