#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users; MPI_Comm_free() calls it when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode ierr;
  PetscShmComm   p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo1(NULL,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}
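
/*
   A minimal sketch (not part of this file) of how a delete callback like Petsc_DelComm_Shm
   is typically registered: PETSc does this once during initialization, and MPI then invokes
   the callback from MPI_Comm_free().

     PetscMPIInt ierr;
     ierr = MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,Petsc_DelComm_Shm,&Petsc_ShmComm_keyval,(void*)0);
*/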

/*@C
    PetscShmCommGet - Given a PETSc communicator, returns a communicator of all ranks that share a common memory

    Collective on comm.

    Input Parameter:
.   globcomm - MPI_Comm

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Notes:
    This should be called only with a PetscCommDuplicate() communicator

    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

    Concepts: MPI subcomm^numbering

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");

  /* return the cached shared memory communicator if globcomm already has one attached */
  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  ierr = MPI_Comm_split_type(globcomm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);

  /* record the global rank of each rank in the shared memory communicator */
  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
  ierr = MPI_Comm_group(globcomm,&globgroup);CHKERRQ(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm,&shmgroup);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup,(*pshmcomm)->shmsize,shmranks,globgroup,(*pshmcomm)->globranks);CHKERRQ(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm,PETSC_ERR_SUP,"Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
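
/*
   Example usage (a sketch, not part of this file): obtain the shared memory communicator
   object for a PETSc communicator. PetscCommDuplicate() is used first because, per the
   notes above, PetscShmCommGet() expects a PETSc communicator.

     MPI_Comm     comm;
     PetscShmComm pshmcomm;
     ierr = PetscCommDuplicate(PETSC_COMM_WORLD,&comm,NULL);CHKERRQ(ierr);
     ierr = PetscShmCommGet(comm,&pshmcomm);CHKERRQ(ierr);
*/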

/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or MPI_PROC_NULL if it does not exist

    Level: developer

    Developer Notes:
    Assumes pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local ranks in the same function call

    Concepts: MPI subcomm^numbering

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  /* binary search on the sorted globranks[], finishing with a short linear scan */
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}
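
/*
   Example usage (a sketch): test whether an arbitrary global rank, here 3, lives on the
   same node as the calling process.

     PetscMPIInt lrank;
     ierr = PetscShmCommGlobalToLocal(pshmcomm,3,&lrank);CHKERRQ(ierr);
     if (lrank == MPI_PROC_NULL) { ... global rank 3 does not share memory with us ... }
*/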

/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator where the shared memory communicator is built

    Level: developer

    Concepts: MPI subcomm^numbering
@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
#ifdef PETSC_USE_DEBUG
  if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
#endif
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}
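
/*
   A sketch of the round trip with PetscShmCommGlobalToLocal(): translating a valid local
   rank to its global rank and back yields the original local rank.

     PetscMPIInt grank,lrank;
     ierr = PetscShmCommLocalToGlobal(pshmcomm,0,&grank);CHKERRQ(ierr);
     ierr = PetscShmCommGlobalToLocal(pshmcomm,grank,&lrank);CHKERRQ(ierr);  (now lrank == 0)
*/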

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm - the MPI communicator

    Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}
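
/*
   Example usage (a sketch): the returned communicator can be passed directly to the MPI-3
   shared memory routines, e.g. to allocate a window visible to all ranks on the node.

     MPI_Comm shmcomm;
     MPI_Win  win;
     double   *base;
     ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);CHKERRQ(ierr);
     ierr = MPI_Win_allocate_shared(100*sizeof(double),sizeof(double),MPI_INFO_NULL,shmcomm,&base,&win);CHKERRQ(ierr);
*/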

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared(). They should have the same effect, and MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the cause is understood, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* true on rank 0 of omp_comm */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in the omp_comm
   contained in the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
 */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt              fd;
  PetscChar             pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt           disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp(), since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    ierr    = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    ierr    = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd      = mkstemp(pathname); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n",pathname);
    ierr    = ftruncate(fd,size);CHKERRQ(ierr);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    /* this MPI_Barrier waits for slaves to open the file before master unlinks it */
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
    ierr    = unlink(pathname);CHKERRQ(ierr);
  } else {
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    fd      = open(pathname,O_RDWR); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n",pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  }
#else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier makes sure the omp barrier is initialized before slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier makes sure slaves have finished using the omp barrier before master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
#else
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

    Input Parameters:
+   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

    Output Parameter:
.   pctrl      - a PETSc OpenMP controller

    Level: developer

.seealso PetscOmpCtrlDestroy()
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode        ierr;
  PetscOmpCtrl          ctrl;
  unsigned long         *cpu_ulongs=NULL;
  PetscInt              i,nr_cpu_ulongs;
  PetscShmComm          pshmcomm;
  MPI_Comm              shm_comm;
  PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;
  PetscInt              num_packages,num_cores;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have a faster hwloc_topology_load */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. OMP masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);

  /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE); if (num_packages <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not determine number of sockets (packages) per compute node\n");
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);    if (num_cores    <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not determine number of cores per compute node\n");
    nthreads     = num_cores/num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %D can not be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
     shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0, ranks 8~15 get
     color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that rank ordering won't change in the new comm.
   */
  color = shm_rank / nthreads;
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);

  /* put the rank 0s of the omp_comms (i.e., the master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    The omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* hwloc's main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs,cpu_ulongs,nr_cpu_ulongs,MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
#endif
    } else {
      for (i=0; i<nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
#endif
      }
    }
  }

  ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

    Input Parameter:
.   pctrl  - a PETSc OpenMP controller

    Level: developer

.seealso PetscOmpCtrlCreate()
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode  ierr;
  PetscOmpCtrl    ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  PetscOmpCtrlDestroyBarrier(ctrl);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get the MPI communicators from a PETSc OMP controller

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Output Parameters:
+   omp_comm         - a communicator that includes a master rank and slave ranks where the master spawns threads
.   omp_master_comm  - on master ranks, a communicator that includes the master ranks of each omp_comm;
                       on slave ranks, MPI_COMM_NULL is returned
-   is_omp_master    - true if the calling process is an OMP master rank

    Notes: any output parameter can be NULL; such a parameter is simply ignored

    Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlBarrier - Barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    This is a pthread barrier on MPI processes. Using MPI_Barrier() instead would be conceptually correct, but the MPI standard does not
    require processes blocked by MPI_Barrier() to free their CPUs to let other processes progress. In practice, to minimize latency,
    MPI processes stuck in MPI_Barrier() keep polling and do not free their CPUs. In contrast, processes blocked in a pthread barrier
    do free their CPUs.

    A code using PetscOmpCtrlBarrier() would look like this,
.vb
    if (is_omp_master) {
      PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
      Call the library using OpenMP
      PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
    }
    PetscOmpCtrlBarrier(ctrl);
.ve

    Level: developer

.seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = pthread_barrier_wait(ctrl->barrier);
  if (ierr && ierr != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n",ierr);
  PetscFunctionReturn(0);
}
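
/*
   A fuller sketch of the pattern from the notes above; the OpenMP library call itself is
   hypothetical, everything else uses the controller API defined in this file.

     PetscOmpCtrl ctrl;
     PetscBool    is_omp_master;
     ierr = PetscOmpCtrlCreate(petsc_comm,-1,&ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlGetOmpComms(ctrl,NULL,NULL,&is_omp_master);CHKERRQ(ierr);
     if (is_omp_master) {
       ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);
       ... call the third party library using OpenMP ...
       ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);
     }
     ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);
*/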

/*@C
    PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to determine whether the calling rank is a master rank.
    This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Notes:
   Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to determine whether the calling rank is a master rank.
   This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime back to 1.

   Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */