xref: /petsc/src/sys/utils/mpishm.c (revision c4762a1b19cd2af06abeed90e8f9d34fb975dd94)
1 #include <petscsys.h>        /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
4 struct _n_PetscShmComm {
5   PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
6   PetscMPIInt shmsize;          /* size of the shared memory communicator */
7   MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
8 };
9 
10 /*
11    Private routine to delete the internal shared memory communicator attribute when a communicator is freed.
12 
13    This is called by MPI, not by users. It is invoked by MPI_Comm_free() when the communicator that has this data as an attribute is freed.
14 
15    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
16 
17 */
18 PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
19 {
20   PetscErrorCode  ierr;
21   PetscShmComm p = (PetscShmComm)val;
22 
23   PetscFunctionBegin;
24   ierr = PetscInfo1(NULL,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
25   ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
26   ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
27   ierr = PetscFree(val);CHKERRMPI(ierr);
28   PetscFunctionReturn(MPI_SUCCESS);
29 }
30 
31 /*@C
32     PetscShmCommGet - Given a PETSc communicator, returns an object containing the communicator of all ranks that share common memory with this rank
33 
34 
35     Collective.
36 
37     Input Parameter:
38 .   globcomm - the global MPI communicator (must be a PETSc communicator, see Notes)
39 
40     Output Parameter:
41 .   pshmcomm - the PETSc shared memory communicator object
42 
43     Level: developer
44 
45     Notes:
46     This should be called only with a communicator obtained from PetscCommDuplicate()
47 
48     When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
49 
50 @*/
51 PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
52 {
53 #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
54   PetscErrorCode   ierr;
55   MPI_Group        globgroup,shmgroup;
56   PetscMPIInt      *shmranks,i,flg;
57   PetscCommCounter *counter;
58 
59   PetscFunctionBegin;
60   ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
61   if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");
62 
63   ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
64   if (flg) PetscFunctionReturn(0);
65 
66   ierr        = PetscNew(pshmcomm);CHKERRQ(ierr);
67   (*pshmcomm)->globcomm = globcomm;
68 
69   ierr = MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);
70 
71   ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
72   ierr = MPI_Comm_group(globcomm, &globgroup);CHKERRQ(ierr);
73   ierr = MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);CHKERRQ(ierr);
74   ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
75   ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
76   for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
77   ierr = MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);CHKERRQ(ierr);
78   ierr = PetscFree(shmranks);CHKERRQ(ierr);
79   ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
80   ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);
81 
82   for (i=0; i<(*pshmcomm)->shmsize; i++) {
83     ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
84   }
85   ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
86   PetscFunctionReturn(0);
87 #else
88   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
89 #endif
90 }
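/*
   A minimal usage sketch (hypothetical caller, not part of this file): obtain the PetscShmComm
   attached to a PETSc communicator, then the underlying MPI shared memory communicator. Here A is
   an assumed Mat; PetscObjectComm() returns its (PETSc) communicator.

     PetscShmComm shmctx;
     MPI_Comm     shmcomm;
     PetscMPIInt  shmrank;

     ierr = PetscShmCommGet(PetscObjectComm((PetscObject)A),&shmctx);CHKERRQ(ierr);
     ierr = PetscShmCommGetMpiShmComm(shmctx,&shmcomm);CHKERRQ(ierr);
     ierr = MPI_Comm_rank(shmcomm,&shmrank);CHKERRQ(ierr);

   The PetscShmComm is cached as an attribute on the communicator, so the caller does not free it.
*/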
91 
92 /*@C
93     PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator
94 
95     Input Parameters:
96 +   pshmcomm - the shared memory communicator object
97 -   grank    - the global rank
98 
99     Output Parameter:
100 .   lrank - the local rank, or MPI_PROC_NULL if it does not exist
101 
102     Level: developer
103 
104     Developer Notes:
105     Assumes that pshmcomm->globranks[] is sorted
106 
107     It may be better to rewrite this to map multiple global ranks to local ranks in the same function call
108 
109 @*/
110 PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
111 {
112   PetscMPIInt    low,high,t,i;
113   PetscBool      flg = PETSC_FALSE;
114   PetscErrorCode ierr;
115 
116   PetscFunctionBegin;
117   *lrank = MPI_PROC_NULL;
118   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
119   if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
120   ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
121   if (flg) PetscFunctionReturn(0);
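  /* binary search on the sorted globranks[]: narrow [low,high) down to a few entries, then finish with the linear scan below */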
122   low  = 0;
123   high = pshmcomm->shmsize;
124   while (high-low > 5) {
125     t = (low+high)/2;
126     if (pshmcomm->globranks[t] > grank) high = t;
127     else low = t;
128   }
129   for (i=low; i<high; i++) {
130     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
131     if (pshmcomm->globranks[i] == grank) {
132       *lrank = i;
133       PetscFunctionReturn(0);
134     }
135   }
136   PetscFunctionReturn(0);
137 }
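/*
   A minimal usage sketch (hypothetical caller, not part of this file): for each global rank of an
   assumed PETSc communicator comm, report whether it lives on the same shared memory node as this
   process. Ranks that do not share memory map to MPI_PROC_NULL.

     PetscShmComm shmctx;
     PetscMPIInt  size,r,lrank;

     ierr = PetscShmCommGet(comm,&shmctx);CHKERRQ(ierr);
     ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
     for (r=0; r<size; r++) {
       ierr = PetscShmCommGlobalToLocal(shmctx,r,&lrank);CHKERRQ(ierr);
       if (lrank != MPI_PROC_NULL) {
         ierr = PetscPrintf(PETSC_COMM_SELF,"global rank %d is local rank %d on my node\n",r,lrank);CHKERRQ(ierr);
       }
     }
*/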
138 
139 /*@C
140     PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank
141 
142     Input Parameters:
143 +   pshmcomm - the shared memory communicator object
144 -   lrank    - the local rank in the shared memory communicator
145 
146     Output Parameter:
147 .   grank - the global rank in the global communicator where the shared memory communicator is built
148 
149     Level: developer
150 
151 @*/
152 PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
153 {
154   PetscFunctionBegin;
155 #ifdef PETSC_USE_DEBUG
156   if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
157 #endif
161   *grank = pshmcomm->globranks[lrank];
162   PetscFunctionReturn(0);
163 }
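/*
   A minimal usage sketch (hypothetical caller, not part of this file): list the global ranks of all
   processes on this shared memory node, assuming shmctx was obtained with PetscShmCommGet().

     MPI_Comm    shmcomm;
     PetscMPIInt shmsize,l,grank;

     ierr = PetscShmCommGetMpiShmComm(shmctx,&shmcomm);CHKERRQ(ierr);
     ierr = MPI_Comm_size(shmcomm,&shmsize);CHKERRQ(ierr);
     for (l=0; l<shmsize; l++) {
       ierr = PetscShmCommLocalToGlobal(shmctx,l,&grank);CHKERRQ(ierr);
       ierr = PetscPrintf(PETSC_COMM_SELF,"local rank %d is global rank %d\n",l,grank);CHKERRQ(ierr);
     }
*/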
164 
165 /*@C
166     PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
167 
168     Input Parameter:
169 .   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
170 
171     Output Parameter:
172 .   comm     - the MPI communicator
173 
174     Level: developer
175 
176 @*/
177 PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
178 {
179   PetscFunctionBegin;
180   *comm = pshmcomm->shmcomm;
181   PetscFunctionReturn(0);
182 }
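/*
   A minimal sketch (hypothetical caller, not part of this file) of a typical use of the returned
   communicator: allocate an MPI-3 shared memory window among the processes of one node.

     MPI_Comm shmcomm;
     MPI_Win  win;
     double   *base;

     ierr = PetscShmCommGetMpiShmComm(shmctx,&shmcomm);CHKERRQ(ierr);
     ierr = MPI_Win_allocate_shared(100*sizeof(double),sizeof(double),MPI_INFO_NULL,shmcomm,&base,&win);CHKERRQ(ierr);
     ...  use base[]; other ranks' segments are reachable via MPI_Win_shared_query()  ...
     ierr = MPI_Win_free(&win);CHKERRQ(ierr);
*/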
183 
184 #if defined(PETSC_HAVE_OPENMP_SUPPORT)
185 #include <pthread.h>
186 #include <hwloc.h>
187 #include <omp.h>
188 
189 /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
190    otherwise use MPI_Win_allocate_shared. They should have the same effect, except that MPI-3 is much
191    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
192    by 50%. Until the reason is found, we use mmap() instead.
193 */
194 #define USE_MMAP_ALLOCATE_SHARED_MEMORY
195 
196 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
197 #include <sys/mman.h>
198 #include <sys/types.h>
199 #include <sys/stat.h>
200 #include <fcntl.h>
201 #endif
202 
203 struct _n_PetscOmpCtrl {
204   MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
205   MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
206   PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
207   PetscBool         is_omp_master;   /* true on rank 0 of omp_comm, i.e., the OMP master */
208   MPI_Win           omp_win;         /* a shared memory window containing a barrier */
209   pthread_barrier_t *barrier;        /* pointer to the barrier */
210   hwloc_topology_t  topology;
211   hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
212   hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
213 };
214 
215 
216 /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
217    contained by the controller.
218 
219    PETSc OpenMP controller users do not call this function directly. This function exists
220    only because we want to separate shared memory allocation methods from other code.
221  */
222 PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
223 {
224   PetscErrorCode        ierr;
225   MPI_Aint              size;
226   void                  *baseptr;
227   pthread_barrierattr_t  attr;
228 
229 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
230   PetscInt              fd;
231   PetscChar             pathname[PETSC_MAX_PATH_LEN];
232 #else
233   PetscMPIInt           disp_unit;
234 #endif
235 
236   PetscFunctionBegin;
237 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
238   size = sizeof(pthread_barrier_t);
239   if (ctrl->is_omp_master) {
240     /* use PETSC_COMM_SELF in PetscGetTmp() since it is a collective call; using omp_comm would broadcast the not-yet-completed pathname to the slaves */
241     ierr    = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
242     ierr    = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
243     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
244     fd      = mkstemp(pathname); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n", pathname);
245     ierr    = ftruncate(fd,size);CHKERRQ(ierr);
246     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
247     ierr    = close(fd);CHKERRQ(ierr);
248     ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
249     /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
250     ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
251     ierr    = unlink(pathname);CHKERRQ(ierr);
252   } else {
253     ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
254     fd      = open(pathname,O_RDWR); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n", pathname);
255     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
256     ierr    = close(fd);CHKERRQ(ierr);
257     ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
258   }
259 #else
260   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
261   ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
262   ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
263 #endif
264   ctrl->barrier = (pthread_barrier_t*)baseptr;
265 
266   /* omp master initializes the barrier */
267   if (ctrl->is_omp_master) {
268     ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
269     ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
270     ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
271     ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
272     ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
273   }
274 
275   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
276   ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
277   PetscFunctionReturn(0);
278 }
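/*
   For reference, the mmap() branch above follows the standard POSIX pattern for sharing memory
   through a temporary file (a minimal sketch, not a PETSc API):

     char  path[] = "/tmp/petsc-shm-XXXXXX";
     int   fd     = mkstemp(path);                                      <- mkstemp() fills in XXXXXX and opens the file
     ftruncate(fd,size);                                                <- give the file the desired length
     void *p = mmap(NULL,size,PROT_READ|PROT_WRITE,MAP_SHARED,fd,0);    <- map it into this process
     close(fd);                                                         <- the mapping survives close()

   The other processes open() the same path (received via MPI_Bcast) and mmap() it; only after every
   process has opened the file does the creator unlink() it, which is what the MPI_Barrier enforces.
*/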
279 
280 /* Destroy the pthread barrier in the PETSc OpenMP controller */
281 PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
282 {
283   PetscErrorCode ierr;
284 
285   PetscFunctionBegin;
286   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
287   ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
288   if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }
289 
290 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
291   ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
292 #else
293   ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
294 #endif
295   PetscFunctionReturn(0);
296 }
297 
298 /*@C
299     PetscOmpCtrlCreate - Create a PETSc OpenMP controller, which manages PETSc's interaction with third-party libraries that use OpenMP
300 
301     Input Parameters:
302 +   petsc_comm - a communicator that some PETSc object (for example, a matrix) lives on
303 -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
304 
305     Output Parameter:
306 .   pctrl      - a PETSc OpenMP controller
307 
308     Level: developer
309 
310     TODO: Possibly use the variable PetscNumOMPThreads to determine the number of threads to use
311 
312 .seealso: PetscOmpCtrlDestroy()
313 @*/
314 PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
315 {
316   PetscErrorCode        ierr;
317   PetscOmpCtrl          ctrl;
318   unsigned long         *cpu_ulongs=NULL;
319   PetscInt              i,nr_cpu_ulongs;
320   PetscShmComm          pshmcomm;
321   MPI_Comm              shm_comm;
322   PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;
323   PetscInt              num_packages,num_cores;
324 
325   PetscFunctionBegin;
326   ierr = PetscNew(&ctrl);CHKERRQ(ierr);
327 
328   /*=================================================================================
329     Init hwloc
330    ==================================================================================*/
331   ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
332 #if HWLOC_API_VERSION >= 0x00020000
333   /* to filter out unneeded info and have faster hwloc_topology_load */
334   ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
335   ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
336 #endif
337   ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);
338 
339   /*=================================================================================
340     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
341     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
342     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
343     which is usually passed to third party libraries.
344    ==================================================================================*/
345 
346   /* fetch the stored shared memory communicator */
347   ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
348   ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);
349 
350   ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
351   ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);
352 
353   /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
354   if (nthreads == -1) {
355     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
356     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 :  hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
357     nthreads     = num_cores/num_packages;
358     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
359   }
360 
361   if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %D cannot be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
362   if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }
363 
364   /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
365      shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
366      color 1; they are put in two omp_comms. Note that petsc_ranks may or may not
367      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
368      Use 0 as the key so that the rank ordering won't change in the new comm.
369    */
370   color = shm_rank / nthreads;
371   ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);
372 
373   /* put the rank 0 of each omp_comm (i.e., the master ranks) into a new comm, omp_master_comm */
374   ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
375   if (!omp_rank) {
376     ctrl->is_omp_master = PETSC_TRUE;  /* master */
377     color = 0;
378   } else {
379     ctrl->is_omp_master = PETSC_FALSE; /* slave */
380     color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
381   }
382   ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */
383 
384   /*=================================================================================
385     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
386     slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
387     and run them on the idle CPUs.
388    ==================================================================================*/
389   ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);
390 
391   /*=================================================================================
392     the omp master records its cpu binding (i.e., cpu set) and computes a new binding that
393     is the union of the bindings of all ranks in the omp_comm
394     =================================================================================*/
395 
396   ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
397   ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
398 
399   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
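  /* e.g., with 64-bit unsigned long and the last possible cpu bit at index 255, nr_cpu_ulongs = (255+64)/64 = 4 */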
400   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
401   ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
402   if (nr_cpu_ulongs == 1) {
403     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
404   } else {
405     for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
406   }
407 
408   ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);
409 
410   if (ctrl->is_omp_master) {
411     ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
412     if (nr_cpu_ulongs == 1) {
413 #if HWLOC_API_VERSION >= 0x00020000
414       ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
415 #else
416       hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
417 #endif
418     } else {
419       for (i=0; i<nr_cpu_ulongs; i++)  {
420 #if HWLOC_API_VERSION >= 0x00020000
421         ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
422 #else
423         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
424 #endif
425       }
426     }
427   }
428 
429   ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
430   *pctrl = ctrl;
431   PetscFunctionReturn(0);
432 }
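/*
   A minimal usage sketch of the controller (hypothetical caller, not part of this file). petsc_comm is
   an assumed PETSc communicator and ThirdPartyLibrarySolve() a placeholder for a call into an
   OpenMP-enabled library made through omp_master_comm.

     PetscOmpCtrl ctrl;
     MPI_Comm     omp_comm,omp_master_comm;
     PetscBool    is_master;

     ierr = PetscOmpCtrlCreate(petsc_comm,-1,&ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlGetOmpComms(ctrl,&omp_comm,&omp_master_comm,&is_master);CHKERRQ(ierr);
     if (is_master) {
       ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);
       ThirdPartyLibrarySolve(omp_master_comm);
       ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);
     }
     ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);

   While the master runs OpenMP threads on the union of CPU bindings, the slaves sleep in
   PetscOmpCtrlBarrier(), freeing their cores.
*/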
433 
434 /*@C
435     PetscOmpCtrlDestroy - Destroy the PETSc OpenMP controller
436 
437     Input Parameter:
438 .   pctrl  - a PETSc OpenMP controller
439 
440     Level: developer
441 
442 .seealso: PetscOmpCtrlCreate()
443 @*/
444 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
445 {
446   PetscErrorCode  ierr;
447   PetscOmpCtrl    ctrl = *pctrl;
448 
449   PetscFunctionBegin;
450   hwloc_bitmap_free(ctrl->cpuset);
451   hwloc_topology_destroy(ctrl->topology);
452   ierr = PetscOmpCtrlDestroyBarrier(ctrl);CHKERRQ(ierr);
453   ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
454   if (ctrl->is_omp_master) {
455     hwloc_bitmap_free(ctrl->omp_cpuset);
456     ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
457   }
458   ierr = PetscFree(ctrl);CHKERRQ(ierr);
459   PetscFunctionReturn(0);
460 }
461 
462 /*@C
463     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
464 
465     Input Parameter:
466 .   ctrl - a PETSc OMP controller
467 
468     Output Parameters:
469 +   omp_comm         - a communicator that includes a master rank and slave ranks, on which the master spawns threads
470 .   omp_master_comm  - on master ranks, a communicator that includes the master ranks of all omp_comms;
471                        on slave ranks, MPI_COMM_NULL is returned
472 -   is_omp_master    - true if the calling process is an OMP master rank
473 
474     Notes: Any output parameter can be NULL; it is then simply ignored.
475 
476     Level: developer
477 @*/
478 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
479 {
480   PetscFunctionBegin;
481   if (omp_comm)        *omp_comm        = ctrl->omp_comm;
482   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
483   if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
484   PetscFunctionReturn(0);
485 }
486 
487 /*@C
488     PetscOmpCtrlBarrier - Perform a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)
489 
490     Input Parameter:
491 .   ctrl - a PETSc OMP controller
492 
493     Notes:
494     This is a pthread barrier on MPI processes. Using MPI_Barrier instead would be conceptually correct, but the MPI standard does not
495     require processes blocked by MPI_Barrier to free their CPUs to let other processes progress. In practice, to minimize latency,
496     MPI processes stuck in MPI_Barrier keep polling and do not free their CPUs. In contrast, pthread_barrier does have this requirement.
497 
498     Code using PetscOmpCtrlBarrier() would look like this,
499 
500     if (is_omp_master) {
501       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
502       Call the library using OpenMP
503       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
504     }
505     PetscOmpCtrlBarrier(ctrl);
506 
507     Level: developer
508 
509 .seealso: PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
510 @*/
511 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
512 {
513   PetscErrorCode ierr;
514 
515   PetscFunctionBegin;
516   ierr = pthread_barrier_wait(ctrl->barrier);
517   if (ierr && ierr != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n", ierr);
518   PetscFunctionReturn(0);
519 }
520 
521 /*@C
522     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
523 
524     Input Parameter:
525 .   ctrl - a PETSc OMP controller
526 
527     Notes:
528     Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to find out whether this is a master rank.
529     This function changes the CPU binding of the master rank and the nthreads-var of the OpenMP runtime
530 
531     Level: developer
532 
533 .seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
534 @*/
535 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
536 {
537   PetscErrorCode ierr;
538 
539   PetscFunctionBegin;
540   ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
541   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
542   PetscFunctionReturn(0);
543 }
544 
545 /*@C
546    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
547 
548    Input Parameter:
549 .  ctrl - a PETSc OMP controller
550 
551    Notes:
552    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to find out whether this is a master rank.
553    This function restores the CPU binding of the master rank and sets the nthreads-var of the OpenMP runtime to 1.
554 
555    Level: developer
556 
557 .seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
558 @*/
559 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
560 {
561   PetscErrorCode ierr;
562 
563   PetscFunctionBegin;
564   ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
565   omp_set_num_threads(1);
566   PetscFunctionReturn(0);
567 }
568 
569 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
570 #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */
571