xref: /petsc/src/sys/utils/mpishm.c (revision eff715bbf1a92b57a39590b123ea1f6a46dc3c94)
1 #include <petscsys.h>        /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
4 struct _n_PetscShmComm {
5   PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
6   PetscMPIInt shmsize;          /* size of the shared memory communicator */
7   MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
8 };
9 
10 /*
11    Private routine to delete internal tag/name shared memory communicator when a communicator is freed.
12 
13    This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this  data as an attribute is freed.
14 
15    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
16 
17 */
18 PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
19 {
20   PetscErrorCode  ierr;
21   PetscShmComm p = (PetscShmComm)val;
22 
23   PetscFunctionBegin;
24   ierr = PetscInfo1(0,"Deleting shared memory subcommunicator in a MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
25   ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
26   ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
27   ierr = PetscFree(val);CHKERRMPI(ierr);
28   PetscFunctionReturn(MPI_SUCCESS);
29 }
30 
31 /*@C
32     PetscShmCommGet - Given a PETSc communicator returns a communicator of all ranks that share a common memory
33 
34 
35     Collective on comm.
36 
37     Input Parameter:
38 .   globcomm - MPI_Comm
39 
40     Output Parameter:
41 .   pshmcomm - the PETSc shared memory communicator object
42 
43     Level: developer
44 
45     Notes:
46     This should be called only with an PetscCommDuplicate() communictor
47 
48            When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
49 
50     Concepts: MPI subcomm^numbering
51 
52 @*/
53 PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
54 {
55 #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
56   PetscErrorCode   ierr;
57   MPI_Group        globgroup,shmgroup;
58   PetscMPIInt      *shmranks,i,flg;
59   PetscCommCounter *counter;
60 
61   PetscFunctionBegin;
62   ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
63   if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");
64 
65   ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
66   if (flg) PetscFunctionReturn(0);
67 
68   ierr        = PetscNew(pshmcomm);CHKERRQ(ierr);
69   (*pshmcomm)->globcomm = globcomm;
70 
71   ierr = MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);
72 
73   ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
74   ierr = MPI_Comm_group(globcomm, &globgroup);CHKERRQ(ierr);
75   ierr = MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);CHKERRQ(ierr);
76   ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
77   ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
78   for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
79   ierr = MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);CHKERRQ(ierr);
80   ierr = PetscFree(shmranks);CHKERRQ(ierr);
81   ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
82   ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);
83 
84   for (i=0; i<(*pshmcomm)->shmsize; i++) {
85     ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
86   }
87   ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
88   PetscFunctionReturn(0);
89 #else
90   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
91 #endif
92 }
93 
94 /*@C
95     PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator
96 
97     Input Parameters:
98 +   pshmcomm - the shared memory communicator object
99 -   grank    - the global rank
100 
101     Output Parameter:
102 .   lrank - the local rank, or MPI_PROC_NULL if it does not exist
103 
104     Level: developer
105 
106     Developer Notes:
107     Assumes the pshmcomm->globranks[] is sorted
108 
109     It may be better to rewrite this to map multiple global ranks to local in the same function call
110 
111     Concepts: MPI subcomm^numbering
112 
113 @*/
114 PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
115 {
116   PetscMPIInt    low,high,t,i;
117   PetscBool      flg = PETSC_FALSE;
118   PetscErrorCode ierr;
119 
120   PetscFunctionBegin;
121   *lrank = MPI_PROC_NULL;
122   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
123   if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
124   ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
125   if (flg) PetscFunctionReturn(0);
126   low  = 0;
127   high = pshmcomm->shmsize;
128   while (high-low > 5) {
129     t = (low+high)/2;
130     if (pshmcomm->globranks[t] > grank) high = t;
131     else low = t;
132   }
133   for (i=low; i<high; i++) {
134     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
135     if (pshmcomm->globranks[i] == grank) {
136       *lrank = i;
137       PetscFunctionReturn(0);
138     }
139   }
140   PetscFunctionReturn(0);
141 }
142 
143 /*@C
144     PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank
145 
146     Input Parameters:
147 +   pshmcomm - the shared memory communicator object
148 -   lrank    - the local rank in the shared memory communicator
149 
150     Output Parameter:
151 .   grank - the global rank in the global communicator where the shared memory communicator is built
152 
153     Level: developer
154 
155     Concepts: MPI subcomm^numbering
156 @*/
157 PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
158 {
159   PetscFunctionBegin;
160 #ifdef PETSC_USE_DEBUG
161   {
162     PetscErrorCode ierr;
163     if (lrank < 0 || lrank >= pshmcomm->shmsize) { SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %D in the shared memory communicator",lrank);CHKERRQ(ierr); }
164   }
165 #endif
166   *grank = pshmcomm->globranks[lrank];
167   PetscFunctionReturn(0);
168 }
169 
170 /*@C
171     PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
172 
173     Input Parameter:
174 .   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
175 
176     Output Parameter:
177 .   comm     - the MPI communicator
178 
179     Level: developer
180 
181 @*/
182 PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
183 {
184   PetscFunctionBegin;
185   *comm = pshmcomm->shmcomm;
186   PetscFunctionReturn(0);
187 }
188 
189 #if defined(PETSC_HAVE_OPENMP) && defined(PETSC_HAVE_PTHREAD) && defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY) && defined(PETSC_HAVE_HWLOC)
190 #include <pthread.h>
191 #include <hwloc.h>
192 #include <omp.h>
193 
194 /* Use mmap() to allocate shared mmeory (for the pthread_barrier_t object) if it is available,
195    otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
196    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
197    by 50%. Until the reason is found out, we use mmap() instead.
198 */
199 #define USE_MMAP_ALLOCATE_SHARED_MEMORY
200 
201 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
202 #include <sys/mman.h>
203 #include <sys/types.h>
204 #include <sys/stat.h>
205 #include <fcntl.h>
206 #endif
207 
208 struct _n_PetscOmpCtrl {
209   MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
210   MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
211   PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
212   PetscBool         is_omp_master;   /* rank 0's in omp_comm */
213   MPI_Win           omp_win;         /* a shared memory window containing a barrier */
214   pthread_barrier_t *barrier;        /* pointer to the barrier */
215   hwloc_topology_t  topology;
216   hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
217   hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
218 };
219 
220 
221 /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
222    contained by the controler.
223 
224    PETSc OpenMP controler users do not call this function directly. This function exists
225    only because we want to separate shared memory allocation methods from other code.
226  */
227 PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
228 {
229   PetscErrorCode        ierr;
230   MPI_Aint              size;
231   void                  *baseptr;
232   pthread_barrierattr_t  attr;
233 
234 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
235   PetscInt              fd;
236   PetscChar             pathname[PETSC_MAX_PATH_LEN];
237 #else
238   PetscMPIInt           disp_unit;
239 #endif
240 
241   PetscFunctionBegin;
242 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
243   size = sizeof(pthread_barrier_t);
244   if (ctrl->is_omp_master) {
245     /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
246     ierr    = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
247     ierr    = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
248     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
249     fd      = mkstemp(pathname); if(fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n", pathname);
250     ierr    = ftruncate(fd,size);CHKERRQ(ierr);
251     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
252     ierr    = close(fd);CHKERRQ(ierr);
253     ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
254     /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
255     ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
256     ierr    = unlink(pathname);CHKERRQ(ierr);
257   } else {
258     ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
259     fd      = open(pathname,O_RDWR); if(fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n", pathname);
260     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
261     ierr    = close(fd);CHKERRQ(ierr);
262     ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
263   }
264 #else
265   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
266   ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
267   ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
268 #endif
269   ctrl->barrier = (pthread_barrier_t*)baseptr;
270 
271   /* omp master initializes the barrier */
272   if (ctrl->is_omp_master) {
273     ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
274     ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
275     ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
276     ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
277     ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
278   }
279 
280   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
281   ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
282   PetscFunctionReturn(0);
283 }
284 
285 /* Destroy the pthread barrier in the PETSc OpenMP controler */
286 PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
287 {
288   PetscErrorCode ierr;
289 
290   PetscFunctionBegin;
291   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
292   ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
293   if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }
294 
295 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
296   ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
297 #else
298   ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
299 #endif
300   PetscFunctionReturn(0);
301 }
302 
303 /*@C
304     PetscOmpCtrlCreate - create a PETSc OpenMP controler, which manages PETSc's interaction with third party libraries using OpenMP
305 
306     Input Parameter:
307 +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
308 .   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP
309 
310     Output Parameter:
311 .   pctrl      - a PETSc OpenMP controler
312 
313     Level: developer
314 
315 .seealso PetscOmpCtrlDestroy()
316 @*/
317 PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
318 {
319   PetscErrorCode        ierr;
320   PetscOmpCtrl          ctrl;
321   unsigned long         *cpu_ulongs=NULL;
322   PetscInt              i,nr_cpu_ulongs;
323   PetscShmComm          pshmcomm;
324   MPI_Comm              shm_comm;
325   PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;
326 
327   PetscFunctionBegin;
328   ierr = PetscNew(&ctrl);CHKERRQ(ierr);
329 
330   /*=================================================================================
331     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
332     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
333     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
334     which is usually passed to third party libraries.
335    ==================================================================================*/
336 
337   /* fetch the stored shared memory communicator */
338   ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
339   ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);
340 
341   ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
342   ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);
343 
344   if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %d can not be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
345   if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %d is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }
346 
347   /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
348      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
349      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
350      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
351      Use 0 as key so that rank ordering wont change in new comm.
352    */
353   color = shm_rank / nthreads;
354   ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);
355 
356   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
357   ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
358   if (!omp_rank) {
359     ctrl->is_omp_master = PETSC_TRUE;  /* master */
360     color = 0;
361   } else {
362     ctrl->is_omp_master = PETSC_FALSE; /* slave */
363     color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
364   }
365   ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */
366 
367   /*=================================================================================
368     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
369     slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
370     and run them on the idle CPUs.
371    ==================================================================================*/
372   ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);
373 
374   /*=================================================================================
375     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
376     is the union of the bindings of all ranks in the omp_comm
377     =================================================================================*/
378   ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
379 #if HWLOC_API_VERSION >= 0x00020000
380   /* to filter out unneeded info and have faster hwloc_topology_load */
381   ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
382   ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
383 #endif
384   ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);
385 
386   ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
387   ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
388 
389   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
390   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
391   ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
392   if (nr_cpu_ulongs == 1) {
393     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
394   } else {
395     for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
396   }
397 
398   ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);
399 
400   if (ctrl->is_omp_master) {
401     ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
402     if (nr_cpu_ulongs == 1) {
403 #if HWLOC_API_VERSION >= 0x00020000
404       ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
405 #else
406       hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
407 #endif
408     } else {
409       for (i=0; i<nr_cpu_ulongs; i++)  {
410 #if HWLOC_API_VERSION >= 0x00020000
411         ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
412 #else
413         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
414 #endif
415       }
416     }
417   }
418 
419   ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
420   *pctrl = ctrl;
421   PetscFunctionReturn(0);
422 }
423 
424 /*@C
425     PetscOmpCtrlDestroy - destory the PETSc OpenMP controler
426 
427     Input Parameter:
428 .   pctrl  - a PETSc OpenMP controler
429 
430     Level: developer
431 
432 .seealso PetscOmpCtrlCreate()
433 @*/
434 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
435 {
436   PetscErrorCode  ierr;
437   PetscOmpCtrl    ctrl = *pctrl;
438 
439   PetscFunctionBegin;
440   hwloc_bitmap_free(ctrl->cpuset);
441   hwloc_topology_destroy(ctrl->topology);
442   PetscOmpCtrlDestroyBarrier(ctrl);
443   ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
444   if (ctrl->is_omp_master) {
445     hwloc_bitmap_free(ctrl->omp_cpuset);
446     ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
447   }
448   ierr = PetscFree(ctrl);CHKERRQ(ierr);
449   PetscFunctionReturn(0);
450 }
451 
452 /*@C
453     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controler
454 
455     Input Parameter:
456 .   ctrl - a PETSc OMP controler
457 
458     Output Parameter:
459 +   omp_comm         - a communicator that includes a master rank and slave ranks where master spawns threads
460 .   omp_master_comm  - on master ranks, return a communicator that include master ranks of each omp_comm;
461                        on slave ranks, MPI_COMM_NULL will be return in reality.
462 -   is_omp_master    - true if the calling process is an OMP master rank.
463 
464     Notes: any output parameter can be NULL. The parameter is just ignored.
465 
466     Level: developer
467 @*/
468 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
469 {
470   PetscFunctionBegin;
471   if (omp_comm)        *omp_comm        = ctrl->omp_comm;
472   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
473   if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
474   PetscFunctionReturn(0);
475 }
476 
477 /*@C
478     PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controler (to let slave ranks free their CPU)
479 
480     Input Parameter:
481 .   ctrl - a PETSc OMP controler
482 
483     Notes:
484     this is a pthread barrier on MPI processes. Using MPI_Barrier instead is conceptually correct. But MPI standard does not
485     require processes blocked by MPI_Barrier free their CPUs to let other processes progress. In practice, to minilize latency,
486     MPI processes stuck in MPI_Barrier keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
487 
488     A code using PetscOmpCtrlBarrier() would be like this,
489 
490     if (is_omp_master) {
491       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
492       Call the library using OpenMP
493       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
494     }
495     PetscOmpCtrlBarrier(ctrl);
496 
497     Level: developer
498 
499 .seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
500 @*/
501 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
502 {
503   PetscErrorCode ierr;
504 
505   PetscFunctionBegin;
506   ierr = pthread_barrier_wait(ctrl->barrier);
507   if (ierr && ierr != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %D\n", ierr);
508   PetscFunctionReturn(0);
509 }
510 
511 /*@C
512     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
513 
514     Input Parameter:
515 .   ctrl - a PETSc OMP controler
516 
517     Notes:
518     Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms to know if this is a master rank.
519     This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime
520 
521     Level: developer
522 
523 .seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
524 @*/
525 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
526 {
527   PetscErrorCode ierr;
528 
529   PetscFunctionBegin;
530   ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
531   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
532   PetscFunctionReturn(0);
533 }
534 
535 /*@C
536    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
537 
538    Input Parameter:
539 .  ctrl - a PETSc OMP controler
540 
541    Notes:
542    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms to know if this is a master rank.
543    This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.
544 
545    Level: developer
546 
547 .seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
548 @*/
549 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
550 {
551   PetscErrorCode ierr;
552 
553   PetscFunctionBegin;
554   ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
555   omp_set_num_threads(1);
556   PetscFunctionReturn(0);
557 }
558 
559 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
560 #endif /* defined(PETSC_HAVE_PTHREAD) && .. */
561