xref: /petsc/src/sys/utils/mpishm.c (revision 5f80ce2ab25dff0f4601e710601cbbcecf323266)
1 #include <petscsys.h>        /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
/* A shared-memory sub-communicator cached as an attribute on a PETSc inner communicator;
   created/stashed by PetscShmCommGet() and freed by Petsc_ShmComm_Attr_Delete_Fn() */
struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};
9 
10 /*
11    Private routine to delete internal shared memory communicator when a communicator is freed.
12 
   This is called by MPI, not by users; specifically, MPI_Comm_free() invokes it when the communicator that has this data stashed as an attribute is freed.
14 
15    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
16 
17 */
18 PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
19 {
20   PetscShmComm p = (PetscShmComm)val;
21 
22   PetscFunctionBegin;
23   CHKERRMPI(PetscInfo(NULL,"Deleting shared memory subcommunicator in a MPI_Comm %ld\n",(long)comm));
24   CHKERRMPI(MPI_Comm_free(&p->shmcomm));
25   CHKERRMPI(PetscFree(p->globranks));
26   CHKERRMPI(PetscFree(val));
27   PetscFunctionReturn(MPI_SUCCESS);
28 }
29 
30 #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
31 /* Data structures to support freeing comms created in PetscShmCommGet().
32   Since we predict communicators passed to PetscShmCommGet() are very likely
33   either a petsc inner communicator or an MPI communicator with a linked petsc
34   inner communicator, we use a simple static array to store dupped communicators
  in rare cases otherwise.
36  */
37 #define MAX_SHMCOMM_DUPPED_COMMS 16
38 static PetscInt       num_dupped_comms=0;
39 static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
40 static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
41 {
42   PetscInt         i;
43   PetscFunctionBegin;
44   for (i=0; i<num_dupped_comms; i++) CHKERRQ(PetscCommDestroy(&shmcomm_dupped_comms[i]));
45   num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
46   PetscFunctionReturn(0);
47 }
48 #endif
49 
50 /*@C
51     PetscShmCommGet - Given a communicator returns a sub-communicator of all ranks that share a common memory
52 
53     Collective.
54 
55     Input Parameter:
56 .   globcomm - MPI_Comm, which can be a user MPI_Comm or a PETSc inner MPI_Comm
57 
58     Output Parameter:
59 .   pshmcomm - the PETSc shared memory communicator object
60 
61     Level: developer
62 
63     Notes:
64        When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
65 
66 @*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  CHKERRMPI(MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg));
  if (!flg) { /* globcomm is not a petsc comm (it carries no counter attribute) */
    union {MPI_Comm comm; void *ptr;} ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    CHKERRMPI(MPI_Comm_get_attr(globcomm,Petsc_InnerComm_keyval,&ucomm,&flg));
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it;
         the dup is remembered in shmcomm_dupped_comms[] (bounded by MAX_SHMCOMM_DUPPED_COMMS) */
      PetscCheckFalse(num_dupped_comms >= MAX_SHMCOMM_DUPPED_COMMS,globcomm,PETSC_ERR_PLIB,"PetscShmCommGet() is trying to dup more than %d MPI_Comms",MAX_SHMCOMM_DUPPED_COMMS);
      CHKERRQ(PetscCommDuplicate(globcomm,&globcomm,NULL));
      /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
      if (num_dupped_comms == 0) CHKERRQ(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If no, create one */
  CHKERRMPI(MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg));
  if (flg) PetscFunctionReturn(0);

  CHKERRQ(PetscNew(pshmcomm));
  (*pshmcomm)->globcomm = globcomm;

  /* MPI_COMM_TYPE_SHARED splits globcomm into sub-communicators whose ranks can share memory */
  CHKERRMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm));

  CHKERRMPI(MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize));
  CHKERRMPI(MPI_Comm_group(globcomm, &globgroup));
  CHKERRMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
  /* build the local-rank -> global-rank table: translate shm ranks 0..shmsize-1 into globcomm ranks */
  CHKERRQ(PetscMalloc1((*pshmcomm)->shmsize,&shmranks));
  CHKERRQ(PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks));
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  CHKERRMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
  CHKERRQ(PetscFree(shmranks));
  CHKERRMPI(MPI_Group_free(&globgroup));
  CHKERRMPI(MPI_Group_free(&shmgroup));

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    CHKERRQ(PetscInfo(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]));
  }
  /* cache the result on globcomm so later calls return it immediately; presumably the keyval's
     delete callback Petsc_ShmComm_Attr_Delete_Fn() frees it — confirm where the keyval is created */
  CHKERRMPI(MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm));
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
125 
126 /*@C
127     PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator
128 
129     Input Parameters:
130 +   pshmcomm - the shared memory communicator object
131 -   grank    - the global rank
132 
133     Output Parameter:
134 .   lrank - the local rank, or MPI_PROC_NULL if it does not exist
135 
136     Level: developer
137 
138     Developer Notes:
139     Assumes the pshmcomm->globranks[] is sorted
140 
141     It may be better to rewrite this to map multiple global ranks to local in the same function call
142 
143 @*/
144 PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
145 {
146   PetscMPIInt    low,high,t,i;
147   PetscBool      flg = PETSC_FALSE;
148 
149   PetscFunctionBegin;
150   PetscValidPointer(pshmcomm,1);
151   PetscValidPointer(lrank,3);
152   *lrank = MPI_PROC_NULL;
153   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
154   if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
155   CHKERRQ(PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL));
156   if (flg) PetscFunctionReturn(0);
157   low  = 0;
158   high = pshmcomm->shmsize;
159   while (high-low > 5) {
160     t = (low+high)/2;
161     if (pshmcomm->globranks[t] > grank) high = t;
162     else low = t;
163   }
164   for (i=low; i<high; i++) {
165     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
166     if (pshmcomm->globranks[i] == grank) {
167       *lrank = i;
168       PetscFunctionReturn(0);
169     }
170   }
171   PetscFunctionReturn(0);
172 }
173 
174 /*@C
175     PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank
176 
177     Input Parameters:
178 +   pshmcomm - the shared memory communicator object
179 -   lrank    - the local rank in the shared memory communicator
180 
181     Output Parameter:
182 .   grank - the global rank in the global communicator where the shared memory communicator is built
183 
184     Level: developer
185 
186 @*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,1);
  PetscValidPointer(grank,3);
  /* lrank must be a valid index into globranks[], which holds shmsize entries */
  PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}
196 
197 /*@C
198     PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
199 
200     Input Parameter:
201 .   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
202 
203     Output Parameter:
204 .   comm     - the MPI communicator
205 
206     Level: developer
207 
208 @*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,1);
  PetscValidPointer(comm,2);
  /* hand back the cached shared-memory sub-communicator; ownership stays with pshmcomm */
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}
217 
218 #if defined(PETSC_HAVE_OPENMP_SUPPORT)
219 #include <pthread.h>
220 #include <hwloc.h>
221 #include <omp.h>
222 
/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
224    otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
225    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
226    by 50%. Until the reason is found out, we use mmap() instead.
227 */
228 #define USE_MMAP_ALLOCATE_SHARED_MEMORY
229 
230 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
231 #include <sys/mman.h>
232 #include <sys/types.h>
233 #include <sys/stat.h>
234 #include <fcntl.h>
235 #endif
236 
/* State of a PETSc OpenMP controller: the communicators produced by splitting petsc_comm,
   the process-shared pthread barrier used to idle slave ranks, and the hwloc data needed
   to rebind the master's CPUs around OpenMP regions */
struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* PETSC_TRUE on rank 0 of omp_comm (the "master"), PETSC_FALSE on slaves */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier (used only with the MPI-3 allocation path) */
  pthread_barrier_t *barrier;        /* pointer to the barrier, living in memory shared by all ranks of omp_comm */
  hwloc_topology_t  topology;        /* hwloc view of this node's hardware */
  hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};
248 
249 /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
250    contained by the controller.
251 
252    PETSc OpenMP controller users do not call this function directly. This function exists
253    only because we want to separate shared memory allocation methods from other code.
254  */
255 static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
256 {
257   MPI_Aint              size;
258   void                  *baseptr;
259   pthread_barrierattr_t  attr;
260 
261 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
262   PetscInt              fd;
263   PetscChar             pathname[PETSC_MAX_PATH_LEN];
264 #else
265   PetscMPIInt           disp_unit;
266 #endif
267 
268   PetscFunctionBegin;
269 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
270   size = sizeof(pthread_barrier_t);
271   if (ctrl->is_omp_master) {
272     /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
273     CHKERRQ(PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN));
274     CHKERRQ(PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN));
275     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
276     fd      = mkstemp(pathname); PetscCheckFalse(fd == -1,PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp", pathname);
277     CHKERRQ(ftruncate(fd,size));
278     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); PetscCheckFalse(baseptr == MAP_FAILED,PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed");
279     CHKERRQ(close(fd));
280     CHKERRMPI(MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm));
281     /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
282     CHKERRMPI(MPI_Barrier(ctrl->omp_comm));
283     CHKERRQ(unlink(pathname));
284   } else {
285     CHKERRMPI(MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm));
286     fd      = open(pathname,O_RDWR); PetscCheckFalse(fd == -1,PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s", pathname);
287     baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); PetscCheckFalse(baseptr == MAP_FAILED,PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed");
288     CHKERRQ(close(fd));
289     CHKERRMPI(MPI_Barrier(ctrl->omp_comm));
290   }
291 #else
292   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
293   CHKERRMPI(MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win));
294   CHKERRMPI(MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr));
295 #endif
296   ctrl->barrier = (pthread_barrier_t*)baseptr;
297 
298   /* omp master initializes the barrier */
299   if (ctrl->is_omp_master) {
300     CHKERRMPI(MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size));
301     CHKERRQ(pthread_barrierattr_init(&attr));
302     CHKERRQ(pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED)); /* make the barrier also work for processes */
303     CHKERRQ(pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size));
304     CHKERRQ(pthread_barrierattr_destroy(&attr));
305   }
306 
307   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
308   CHKERRMPI(MPI_Barrier(ctrl->omp_comm));
309   PetscFunctionReturn(0);
310 }
311 
312 /* Destroy the pthread barrier in the PETSc OpenMP controller */
313 static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
314 {
315   PetscFunctionBegin;
316   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
317   CHKERRMPI(MPI_Barrier(ctrl->omp_comm));
318   if (ctrl->is_omp_master) CHKERRQ(pthread_barrier_destroy(ctrl->barrier));
319 
320 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
321   CHKERRQ(munmap(ctrl->barrier,sizeof(pthread_barrier_t)));
322 #else
323   CHKERRMPI(MPI_Win_free(&ctrl->omp_win));
324 #endif
325   PetscFunctionReturn(0);
326 }
327 
328 /*@C
329     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries using OpenMP
330 
331     Input Parameters:
332 +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
333 -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
334 
335     Output Parameter:
336 .   pctrl      - a PETSc OpenMP controller
337 
338     Level: developer
339 
340     TODO: Possibly use the variable PetscNumOMPThreads to determine the number for threads to use
341 
342 .seealso PetscOmpCtrlDestroy()
343 @*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscOmpCtrl   ctrl;
  unsigned long *cpu_ulongs = NULL;
  PetscInt       i,nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank,shm_comm_size,omp_rank,color;
  PetscInt       num_packages,num_cores;

  PetscFunctionBegin;
  CHKERRQ(PetscNew(&ctrl));

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  /* NOTE(review): hwloc routines return -1 on failure rather than PETSc error codes;
     CHKERRQ() still triggers on failure here but reports a misleading code — confirm */
  CHKERRQ(hwloc_topology_init(&ctrl->topology));
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have faster hwloc_topology_load */
  CHKERRQ(hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE));
  CHKERRQ(hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL));
#endif
  CHKERRQ(hwloc_topology_load(ctrl->topology));

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    others are called slaves. OMP Masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  CHKERRQ(PetscShmCommGet(petsc_comm,&pshmcomm));
  CHKERRQ(PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm));

  CHKERRMPI(MPI_Comm_rank(shm_comm,&shm_rank));
  CHKERRMPI(MPI_Comm_size(shm_comm,&shm_comm_size));

  /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
  if (nthreads == -1) {
    /* hwloc_get_nbobjs_by_type() may report <= 0 when the object type is unknown; fall back to 1 */
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 :  hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
    nthreads     = num_cores/num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size,petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) CHKERRQ(PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size));

  /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
     shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as key so that rank ordering wont change in new comm.
   */
  color = shm_rank / nthreads;
  CHKERRMPI(MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm));

  /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
  CHKERRMPI(MPI_Comm_rank(ctrl->omp_comm,&omp_rank));
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  CHKERRMPI(MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm));

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  CHKERRQ(PetscOmpCtrlCreateBarrier(ctrl));

  /*=================================================================================
    omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc(); PetscCheckFalse(!ctrl->cpuset,PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed");
  CHKERRQ(hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS));

  /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  /* number of unsigned longs needed to hold the topology cpuset: ceil((last_set_bit+1)/bits_per_ulong) */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  CHKERRQ(PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs));
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

  /* bitwise-OR the packed cpusets of all ranks of omp_comm onto the master (rank 0) */
  CHKERRMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm));

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); PetscCheckFalse(!ctrl->omp_cpuset,PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      CHKERRQ(hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]));
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
#endif
    } else {
      for (i=0; i<nr_cpu_ulongs; i++)  {
#if HWLOC_API_VERSION >= 0x00020000
        CHKERRQ(hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]));
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
#endif
      }
    }
  }

  CHKERRQ(PetscFree(cpu_ulongs));
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}
462 
463 /*@C
464     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
465 
466     Input Parameter:
467 .   pctrl  - a PETSc OpenMP controller
468 
469     Level: developer
470 
471 .seealso PetscOmpCtrlCreate()
472 @*/
473 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
474 {
475   PetscOmpCtrl ctrl = *pctrl;
476 
477   PetscFunctionBegin;
478   hwloc_bitmap_free(ctrl->cpuset);
479   hwloc_topology_destroy(ctrl->topology);
480   PetscOmpCtrlDestroyBarrier(ctrl);
481   CHKERRMPI(MPI_Comm_free(&ctrl->omp_comm));
482   if (ctrl->is_omp_master) {
483     hwloc_bitmap_free(ctrl->omp_cpuset);
484     CHKERRMPI(MPI_Comm_free(&ctrl->omp_master_comm));
485   }
486   CHKERRQ(PetscFree(ctrl));
487   PetscFunctionReturn(0);
488 }
489 
490 /*@C
491     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
492 
493     Input Parameter:
494 .   ctrl - a PETSc OMP controller
495 
496     Output Parameters:
497 +   omp_comm         - a communicator that includes a master rank and slave ranks where master spawns threads
498 .   omp_master_comm  - on master ranks, return a communicator that include master ranks of each omp_comm;
                       on slave ranks, MPI_COMM_NULL will be returned.
500 -   is_omp_master    - true if the calling process is an OMP master rank.
501 
502     Notes: any output parameter can be NULL. The parameter is just ignored.
503 
504     Level: developer
505 @*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  /* each output is optional: a NULL pointer means the caller does not want that value */
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}
514 
515 /*@C
516     PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)
517 
518     Input Parameter:
519 .   ctrl - a PETSc OMP controller
520 
521     Notes:
522     this is a pthread barrier on MPI processes. Using MPI_Barrier instead is conceptually correct. But MPI standard does not
    require that processes blocked by MPI_Barrier free their CPUs to let other processes progress. In practice, to minimize latency,
524     MPI processes stuck in MPI_Barrier keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
525 
526     A code using PetscOmpCtrlBarrier() would be like this,
527 
528     if (is_omp_master) {
529       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
530       Call the library using OpenMP
531       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
532     }
533     PetscOmpCtrlBarrier(ctrl);
534 
535     Level: developer
536 
537 .seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
538 @*/
539 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
540 {
541   int err;
542 
543   PetscFunctionBegin;
544   err = pthread_barrier_wait(ctrl->barrier);
545   PetscCheckFalse(err && err != PTHREAD_BARRIER_SERIAL_THREAD,PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %" PetscInt_FMT, err);
546   PetscFunctionReturn(0);
547 }
548 
549 /*@C
550     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
551 
552     Input Parameter:
553 .   ctrl - a PETSc OMP controller
554 
555     Notes:
556     Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
557     This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime
558 
559     Level: developer
560 
561 .seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
562 @*/
563 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
564 {
565   PetscFunctionBegin;
566   CHKERRQ(hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS));
567   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
568   PetscFunctionReturn(0);
569 }
570 
571 /*@C
572    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
573 
574    Input Parameter:
575 .  ctrl - a PETSc OMP controller
576 
577    Notes:
578    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
579    This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.
580 
581    Level: developer
582 
583 .seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
584 @*/
585 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
586 {
587   PetscFunctionBegin;
588   CHKERRQ(hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS));
589   omp_set_num_threads(1);
590   PetscFunctionReturn(0);
591 }
592 
593 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
594 #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */
595