xref: /petsc/src/sys/utils/mpishm.c (revision 1e1ea65d8de51fde77ce8a787efbef25e407badc)
#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users; specifically, MPI_Comm_free() invokes it when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode ierr;
  PetscShmComm   p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo1(NULL,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}
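
/* For reference: a delete callback like the one above is attached when the keyval is created,
   normally during PETSc initialization. A minimal sketch (the declaration here is illustrative;
   the actual registration lives elsewhere in PETSc):

       static PetscMPIInt Petsc_ShmComm_keyval = MPI_KEYVAL_INVALID;
       ierr = MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,Petsc_ShmComm_Attr_Delete_Fn,&Petsc_ShmComm_keyval,NULL);CHKERRMPI(ierr);

   MPI_COMM_NULL_COPY_FN means the attribute is not propagated by MPI_Comm_dup(); MPI then invokes
   Petsc_ShmComm_Attr_Delete_Fn with the stored attribute value when the communicator is freed. */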

/*@C
    PetscShmCommGet - Given a PETSc communicator, returns a communicator of all ranks that share common memory

    Collective.

    Input Parameter:
.   globcomm - MPI_Comm

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Notes:
    This should be called only with a communicator obtained with PetscCommDuplicate()

    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRMPI(ierr);
  if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");

  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRMPI(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  ierr = MPI_Comm_split_type(globcomm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRMPI(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRMPI(ierr);
  ierr = MPI_Comm_group(globcomm,&globgroup);CHKERRMPI(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm,&shmgroup);CHKERRMPI(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup,(*pshmcomm)->shmsize,shmranks,globgroup,(*pshmcomm)->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRMPI(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRMPI(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRMPI(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm,PETSC_ERR_SUP,"Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
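
/* A minimal usage sketch (hypothetical caller; obj stands for any PETSc object, so its
   communicator came from PetscCommDuplicate()):

       PetscShmComm shm;
       MPI_Comm     shmcomm;
       ierr = PetscShmCommGet(PetscObjectComm((PetscObject)obj),&shm);CHKERRQ(ierr);
       ierr = PetscShmCommGetMpiShmComm(shm,&shmcomm);CHKERRQ(ierr);

   The returned PetscShmComm is cached on the communicator as an attribute and must not be
   freed by the caller. */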

/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or MPI_PROC_NULL if it does not exist

    Level: developer

    Developer Notes:
    Assumes pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local ranks in the same function call

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
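  /* globranks[] is sorted in ascending order; narrow the search window with a coarse binary search, then finish with a short linear scan */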
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator where the shared memory communicator is built

    Level: developer

@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
  if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}
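
/* A round-trip sketch of the two mappings above (shm and grank are hypothetical):

       PetscMPIInt lrank,grank2;
       ierr = PetscShmCommGlobalToLocal(shm,grank,&lrank);CHKERRQ(ierr);
       if (lrank != MPI_PROC_NULL) {
         ierr = PetscShmCommLocalToGlobal(shm,lrank,&grank2);CHKERRQ(ierr);
       }

   Whenever lrank is valid (grank shares memory with this process), grank2 equals grank. */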

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm     - the MPI communicator

    Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect, except that MPI-3 is
   much simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's
   performance by 50%. Until the cause is understood, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* true on rank 0 of omp_comm (the OMP master) */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
 */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt              fd;
  PetscChar             pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt           disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp(), since it is a collective call; using omp_comm would bcast the partially populated pathname to slaves */
    ierr    = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    ierr    = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd      = mkstemp(pathname); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n",pathname);
    ierr    = ftruncate(fd,size);CHKERRQ(ierr);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRMPI(ierr);
    /* this MPI_Barrier waits for slaves to open the file before the master unlinks it */
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
    ierr    = unlink(pathname);CHKERRQ(ierr);
  } else {
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRMPI(ierr);
    fd      = open(pathname,O_RDWR); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n",pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  }
#else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRMPI(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRMPI(ierr);
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRMPI(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier makes sure the omp barrier is initialized before slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier makes sure slaves have finished using the omp barrier before the master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
#else
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRMPI(ierr);
#endif
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

    Input Parameters:
+   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

    Output Parameter:
.   pctrl      - a PETSc OpenMP controller

    Level: developer

    TODO: Possibly use the variable PetscNumOMPThreads to determine the number of threads to use

.seealso PetscOmpCtrlDestroy()
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode        ierr;
  PetscOmpCtrl          ctrl;
  unsigned long         *cpu_ulongs=NULL;
  PetscInt              i,nr_cpu_ulongs;
  PetscShmComm          pshmcomm;
  MPI_Comm              shm_comm;
  PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;
  PetscInt              num_packages,num_cores;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have a faster hwloc_topology_load */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. OMP masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRMPI(ierr);

  /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
    nthreads     = num_cores/num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %D cannot be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
     shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that rank ordering won't change in the new comm.
   */
  color = shm_rank / nthreads;
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRMPI(ierr);

  /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRMPI(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRMPI(ierr);

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    The omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* hwloc's main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
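  /* i.e., the number of unsigned longs needed to hold the topology cpuset, ceil((last_set_bit+1)/bits_per_ulong);
     e.g., with 64-bit unsigned longs, a last set bit of 255 gives 4 ulongs */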
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

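  /* bitwise-OR the packed cpusets onto the master, i.e., the union of the cpu bindings of all ranks in omp_comm */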
  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs,cpu_ulongs,nr_cpu_ulongs,MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRMPI(ierr);

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
#endif
    } else {
      for (i=0; i<nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
#endif
      }
    }
  }

  ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}
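
/* A sketch of the intended lifecycle (obj is a hypothetical PETSc object; the OpenMP-region
   calls are defined below in this file):

       PetscOmpCtrl ctrl;
       PetscBool    master;
       MPI_Comm     omp_comm,omp_master_comm;
       ierr = PetscOmpCtrlCreate(PetscObjectComm((PetscObject)obj),-1,&ctrl);CHKERRQ(ierr);
       ierr = PetscOmpCtrlGetOmpComms(ctrl,&omp_comm,&omp_master_comm,&master);CHKERRQ(ierr);
       if (master) {
         ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);
         ... call a third party library with OpenMP, handing it omp_master_comm ...
         ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);
       }
       ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);
       ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);
*/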

/*@C
    PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

    Input Parameter:
.   pctrl  - a PETSc OpenMP controller

    Level: developer

.seealso PetscOmpCtrlCreate()
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  ierr = PetscOmpCtrlDestroyBarrier(ctrl);CHKERRQ(ierr);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRMPI(ierr);
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRMPI(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Output Parameters:
+   omp_comm         - a communicator that includes a master rank and slave ranks, where the master spawns threads
.   omp_master_comm  - on master ranks, a communicator that includes the master ranks of all omp_comms;
                       on slave ranks, MPI_COMM_NULL
-   is_omp_master    - true if the calling process is an OMP master rank

    Notes: any output parameter can be NULL; such parameters are simply ignored

    Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    This is a pthread barrier on MPI processes. Using MPI_Barrier instead would be conceptually correct, but the MPI
    standard does not require processes blocked in MPI_Barrier to free their CPUs to let other processes progress.
    In practice, to minimize latency, MPI processes stuck in MPI_Barrier keep polling and do not free their CPUs.
    In contrast, pthread_barrier has this requirement.

    Code using PetscOmpCtrlBarrier() looks like this:

    if (is_omp_master) {
      PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
      Call the library using OpenMP
      PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
    }
    PetscOmpCtrlBarrier(ctrl);

    Level: developer

.seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  int err;

  PetscFunctionBegin;
  err = pthread_barrier_wait(ctrl->barrier);
  if (err && err != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n",err);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
    This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Notes:
   Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
   This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

   Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */