xref: /petsc/src/sys/utils/mpishm.c (revision dec1416f15364d8a66cef6f4b2a5a2aba5192d13)
#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. It is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode  ierr;
  PetscShmComm p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo1(0,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}

/*@C
    PetscShmCommGet - Given a PETSc communicator, returns a communicator of all ranks that share common memory

    Collective.

    Input Parameter:
.   globcomm - MPI_Comm

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Notes:
    This should be called only with a PetscCommDuplicate() communicator

    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");

  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  ierr = MPI_Comm_split_type(globcomm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
  ierr = MPI_Comm_group(globcomm,&globgroup);CHKERRQ(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm,&shmgroup);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup,(*pshmcomm)->shmsize,shmranks,globgroup,(*pshmcomm)->globranks);CHKERRQ(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm,PETSC_ERR_SUP,"Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
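
/* A minimal usage sketch (hypothetical caller code, not part of this file); "obj" is a placeholder
   for any PETSc object, whose communicator has been through PetscCommDuplicate() as required above:

     PetscShmComm   shm;
     MPI_Comm       shmcomm;
     PetscMPIInt    nlocal;
     PetscErrorCode ierr;

     ierr = PetscShmCommGet(PetscObjectComm((PetscObject)obj),&shm);CHKERRQ(ierr);
     ierr = PetscShmCommGetMpiShmComm(shm,&shmcomm);CHKERRQ(ierr);
     ierr = MPI_Comm_size(shmcomm,&nlocal);CHKERRQ(ierr);

   nlocal is then the number of ranks of the original communicator that share memory with this rank.
   The PetscShmComm is cached as an attribute of globcomm and is freed by Petsc_DelComm_Shm() when the
   communicator is freed, so the caller must not destroy it. */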

/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or MPI_PROC_NULL if it does not exist

    Level: developer

    Developer Notes:
    Assumes pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local ranks in the same function call

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  low  = 0;
  high = pshmcomm->shmsize;
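  /* binary search: narrow [low,high) to a window of at most 5 entries, then the linear scan below finds grank if it is present */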
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator where the shared memory communicator is built

    Level: developer

@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
#ifdef PETSC_USE_DEBUG
  if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
#endif
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}
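
/* A minimal translation sketch (hypothetical caller code), assuming pshmcomm was obtained with
   PetscShmCommGet() on the global communicator and grank is a valid rank of that communicator:

     PetscMPIInt    lrank,back;
     PetscErrorCode ierr;

     ierr = PetscShmCommGlobalToLocal(pshmcomm,grank,&lrank);CHKERRQ(ierr);
     if (lrank != MPI_PROC_NULL) {
       ierr = PetscShmCommLocalToGlobal(pshmcomm,lrank,&back);CHKERRQ(ierr);
     }

   When lrank is not MPI_PROC_NULL the round trip gives back == grank and the two processes share
   memory; otherwise grank does not belong to this rank's node (or -noshared was given, see
   PetscShmCommGlobalToLocal() above). */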

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm     - the MPI communicator

    Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}
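
/* A sketch of what a caller might do with the returned communicator (hypothetical code, assuming an
   MPI-3 implementation): allocate a node-wide shared array in which each on-node rank owns n doubles.

     MPI_Comm       shmcomm;
     MPI_Win        win;
     MPI_Aint       sz;
     int            disp_unit;
     double         *myslice,*base;
     PetscErrorCode ierr;

     ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);CHKERRQ(ierr);
     ierr = MPI_Win_allocate_shared(n*sizeof(double),sizeof(double),MPI_INFO_NULL,shmcomm,&myslice,&win);CHKERRQ(ierr);
     ierr = MPI_Win_shared_query(win,0,&sz,&disp_unit,&base);CHKERRQ(ierr);

   base then points to rank 0's slice and, with the default (contiguous) allocation, the slices of the
   on-node ranks follow one another in memory. The window is freed with MPI_Win_free(); the communicator
   itself is owned by the PetscShmComm (it is freed by Petsc_DelComm_Shm()) and must not be freed here. */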

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared(). They should have the same effect except MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* true on rank 0 of omp_comm */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};
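
/* An illustrative layout (not taken from the code below, just an example): with 8 MPI ranks on one
   node and nthreads = 4, PetscOmpCtrlCreate() produces

     omp_comm #0     = {0,1,2,3}   master = rank 0
     omp_comm #1     = {4,5,6,7}   master = rank 4
     omp_master_comm = {0,4}       (MPI_COMM_NULL on the slave ranks 1-3 and 5-7)

   Each master later rebinds itself to the union of the CPUs of its omp_comm and spawns that many
   OpenMP threads while its slaves sleep in the pthread barrier. */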

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
 */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt              fd;
  PetscChar             pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt           disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp(), since it is a collective call; using omp_comm would bcast the partially populated pathname to the slaves */
    ierr    = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    ierr    = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd      = mkstemp(pathname); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n",pathname);
    ierr    = ftruncate(fd,size);CHKERRQ(ierr);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
    ierr    = unlink(pathname);CHKERRQ(ierr);
  } else {
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    fd      = open(pathname,O_RDWR); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n",pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  }
#else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* the omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes, not just threads */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier makes sure the pthread barrier is initialized before the slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier makes sure the slaves have finished using the pthread barrier before the master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
#else
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third-party libraries that use OpenMP

    Input Parameters:
+   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

    Output Parameter:
.   pctrl      - a PETSc OpenMP controller

    Level: developer

.seealso: PetscOmpCtrlDestroy()
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode        ierr;
  PetscOmpCtrl          ctrl;
  unsigned long         *cpu_ulongs=NULL;
  PetscInt              i,nr_cpu_ulongs;
  PetscShmComm          pshmcomm;
  MPI_Comm              shm_comm;
  PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;
  PetscInt              num_packages,num_cores;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have a faster hwloc_topology_load() */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. OMP masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);

  /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE); if (num_packages <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not determine number of sockets(packages) per compute node\n");
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);    if (num_cores    <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not determine number of cores per compute node\n");
    nthreads     = num_cores/num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %d cannot be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %d is not a factor of the MPI shared memory communicator size %d, which may cause load imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms, each of size nthreads. For example, if
     shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1, so they are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that the rank ordering does not change in the new comm.
   */
  color = shm_rank / nthreads;
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);

  /* put the rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED; /* to make the slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    the slave ranks to sleep and idle their CPUs, so that the master can fork OMP
    threads and run them on the idle CPUs.
   ==================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    The omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* the hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
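  /* nr_cpu_ulongs = number of unsigned longs needed to hold all bits of the topology cpuset,
     i.e., ceil((last_set_bit+1)/(8*sizeof(unsigned long))) written with integer arithmetic */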
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

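  /* OR together the bindings of all ranks in omp_comm; the result lands on the master (rank 0 of omp_comm) */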
  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs,cpu_ulongs,nr_cpu_ulongs,MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
#endif
    } else {
      for (i=0; i<nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
#endif
      }
    }
  }

  ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}
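
/* A sketch of the intended life cycle of a controller (hypothetical caller code; "CallOpenMPLibrary"
   stands for whatever third-party OpenMP routine the master ranks hand their work to):

     PetscOmpCtrl   ctrl;
     MPI_Comm       omp_master_comm;
     PetscBool      is_master;
     PetscErrorCode ierr;

     ierr = PetscOmpCtrlCreate(petsc_comm,-1,&ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlGetOmpComms(ctrl,NULL,&omp_master_comm,&is_master);CHKERRQ(ierr);
     if (is_master) {
       ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);
       CallOpenMPLibrary(omp_master_comm,...);
       ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);
     }
     ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);

   Slaves skip the OpenMP region and wait in PetscOmpCtrlBarrier(), sleeping in the pthread barrier so
   their CPUs are free for the master's OpenMP threads. */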

/*@C
    PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

    Input Parameter:
.   pctrl  - a PETSc OpenMP controller

    Level: developer

.seealso: PetscOmpCtrlCreate()
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode  ierr;
  PetscOmpCtrl    ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  ierr = PetscOmpCtrlDestroyBarrier(ctrl);CHKERRQ(ierr);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Output Parameters:
+   omp_comm         - a communicator that includes a master rank and slave ranks where the master spawns threads
.   omp_master_comm  - on master ranks, a communicator that includes the master ranks of all omp_comms;
                       on slave ranks, MPI_COMM_NULL is returned
-   is_omp_master    - true if the calling process is an OMP master rank

    Notes:
    Any output parameter can be NULL; it is then simply ignored.

    Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    This is a pthread barrier on MPI processes. Using MPI_Barrier() instead would be conceptually correct, but the MPI standard does not
    require processes blocked by MPI_Barrier() to free their CPUs to let other processes progress. In practice, to minimize latency,
    MPI processes stuck in MPI_Barrier() keep polling and do not free their CPUs. In contrast, the pthread barrier does have this requirement.

    Code using PetscOmpCtrlBarrier() would look like this:

    if (is_omp_master) {
      PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
      Call the library using OpenMP
      PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
    }
    PetscOmpCtrlBarrier(ctrl);

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = pthread_barrier_wait(ctrl->barrier);
  if (ierr && ierr != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n",ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to find out whether the calling rank is a master rank.
    This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime.

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Notes:
   Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to find out whether the calling rank is a master rank.
   This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

   Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */
569