#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users; MPI_Comm_free() invokes it when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode ierr;
  PetscShmComm   p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo1(NULL,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}

#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
/* Data structures to support freeing comms created in PetscShmCommGet().
   Since communicators passed to PetscShmCommGet() are very likely either a
   petsc inner communicator or an MPI communicator with a linked petsc inner
   communicator, we use a simple static array to store dupped communicators
   in the rare cases otherwise.
 */
#define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt       num_dupped_comms=0;
static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  for (i=0; i<num_dupped_comms; i++) {ierr = PetscCommDestroy(&shmcomm_dupped_comms[i]);CHKERRQ(ierr);}
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(0);
}
#endif

/*@C
    PetscShmCommGet - Given a communicator, returns a sub-communicator of all the ranks that share common memory

    Collective.

    Input Parameter:
.   globcomm - MPI_Comm, which can be a user MPI_Comm or a PETSc inner MPI_Comm

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Notes:
    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
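
    Example Usage:
    A minimal sketch, assuming ierr is declared as PetscErrorCode; the communicator returned by PetscShmCommGetMpiShmComm() contains only the ranks of globcomm that share memory with the calling rank
.vb
      PetscShmComm pshmcomm;
      MPI_Comm     shmcomm;
      ierr = PetscShmCommGet(PETSC_COMM_WORLD,&pshmcomm);CHKERRQ(ierr);
      ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);CHKERRQ(ierr);
.ve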

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRMPI(ierr);
  if (!flg) { /* globcomm is not a petsc comm */
    union {MPI_Comm comm; void *ptr;} ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    ierr = MPI_Comm_get_attr(globcomm,Petsc_InnerComm_keyval,&ucomm,&flg);CHKERRMPI(ierr);
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      if (num_dupped_comms >= MAX_SHMCOMM_DUPPED_COMMS) SETERRQ1(globcomm,PETSC_ERR_PLIB,"PetscShmCommGet() is trying to dup more than %d MPI_Comms",MAX_SHMCOMM_DUPPED_COMMS);
      ierr = PetscCommDuplicate(globcomm,&globcomm,NULL);CHKERRQ(ierr);
      /* the first time a comm is dupped, register a function to free the dupped petsc comms at PetscFinalize() */
      if (num_dupped_comms == 0) {ierr = PetscRegisterFinalize(PetscShmCommDestroyDuppedComms);CHKERRQ(ierr);}
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If not, create one */
  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRMPI(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  ierr = MPI_Comm_split_type(globcomm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRMPI(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRMPI(ierr);
  ierr = MPI_Comm_group(globcomm,&globgroup);CHKERRMPI(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm,&shmgroup);CHKERRMPI(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  /* map local ranks 0..shmsize-1 in shmgroup to their ranks in globgroup */
  ierr = MPI_Group_translate_ranks(shmgroup,(*pshmcomm)->shmsize,shmranks,globgroup,(*pshmcomm)->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRMPI(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRMPI(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRMPI(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm,PETSC_ERR_SUP,"Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or MPI_PROC_NULL if it does not exist

    Level: developer

    Developer Notes:
    Assumes pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local ranks in the same function call
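
    Example Usage:
    A minimal sketch, assuming ierr, grank and pshmcomm are set up as above; it includes the round trip back to a global rank with PetscShmCommLocalToGlobal()
.vb
      PetscMPIInt lrank;
      ierr = PetscShmCommGlobalToLocal(pshmcomm,grank,&lrank);CHKERRQ(ierr);
      if (lrank != MPI_PROC_NULL) { /* grank shares memory with this rank; map it back */
        ierr = PetscShmCommLocalToGlobal(pshmcomm,lrank,&grank);CHKERRQ(ierr);
      }
.ve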

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  /* binary search narrows the range; a short linear scan finishes it off */
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator where the shared memory communicator is built

    Level: developer

@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
  if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm     - the MPI communicator

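    Example Usage:
    The returned communicator can be passed directly to MPI-3 shared memory routines; a minimal sketch, assuming ierr, size, disp_unit, baseptr and win are declared appropriately
.vb
      MPI_Comm shmcomm;
      ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);CHKERRQ(ierr);
      ierr = MPI_Win_allocate_shared(size,disp_unit,MPI_INFO_NULL,shmcomm,&baseptr,&win);CHKERRMPI(ierr);
.ve
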
    Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect, except that MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* true on rank 0 of an omp_comm (the omp master) */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by the processes
   in the controller's omp_comm.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
 */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt              fd;
  PetscChar             pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt           disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to the slaves */
    ierr    = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    ierr    = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd      = mkstemp(pathname); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n",pathname);
    ierr    = ftruncate(fd,size);CHKERRQ(ierr);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRMPI(ierr);
    /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
    ierr    = unlink(pathname);CHKERRQ(ierr);
  } else {
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRMPI(ierr);
    fd      = open(pathname,O_RDWR); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n",pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  }
#else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRMPI(ierr);
  /* slaves query the allocation of rank 0 (the master) to get the shared address */
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRMPI(ierr);
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRMPI(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier makes sure the omp barrier is initialized before the slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier makes sure the slaves have finished using the omp barrier before the master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
#else
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRMPI(ierr);
#endif
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

    Input Parameters:
+   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

    Output Parameter:
.   pctrl      - a PETSc OpenMP controller

    Level: developer

    TODO: Possibly use the variable PetscNumOMPThreads to determine the number of threads to use

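    Example Usage:
    A minimal sketch of driving a third party OpenMP library; error checks are omitted and MyLibCompute() is a hypothetical library routine
.vb
      PetscOmpCtrl ctrl;
      PetscBool    is_master;
      MPI_Comm     master_comm;
      PetscOmpCtrlCreate(PETSC_COMM_WORLD,-1,&ctrl);
      PetscOmpCtrlGetOmpComms(ctrl,NULL,&master_comm,&is_master);
      if (is_master) {
        PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
        MyLibCompute(master_comm); /* hypothetical third party call that spawns OpenMP threads */
        PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
      }
      PetscOmpCtrlBarrier(ctrl);
      PetscOmpCtrlDestroy(&ctrl);
.ve
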
.seealso PetscOmpCtrlDestroy()
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode        ierr;
  PetscOmpCtrl          ctrl;
  unsigned long         *cpu_ulongs=NULL;
  PetscInt              i,nr_cpu_ulongs;
  PetscShmComm          pshmcomm;
  MPI_Comm              shm_comm;
  PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;
  PetscInt              num_packages,num_cores;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* filter out unneeded info to get a faster hwloc_topology_load */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. OMP masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRMPI(ierr);

  /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
    nthreads     = num_cores/num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %D cannot be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
     shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1, and they are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that the rank ordering won't change in the new comm.
   */
  color = shm_rank / nthreads;
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRMPI(ierr);

  /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRMPI(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRMPI(ierr);

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    The omp master records its cpu binding (i.e., cpu set) and computes a new binding
    that is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* hwloc's main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

  /* OR the packed cpu bindings of all ranks onto the master rank of omp_comm */
  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs,cpu_ulongs,nr_cpu_ulongs,MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRMPI(ierr);

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
#endif
    } else {
      for (i=0; i<nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
#endif
      }
    }
  }

  ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

    Input Parameter:
.   pctrl  - a PETSc OpenMP controller

    Level: developer

.seealso PetscOmpCtrlCreate()
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  ierr = PetscOmpCtrlDestroyBarrier(ctrl);CHKERRQ(ierr);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRMPI(ierr);
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRMPI(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get the MPI communicators from a PETSc OMP controller

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Output Parameters:
+   omp_comm         - a communicator that includes a master rank and slave ranks where the master spawns threads
.   omp_master_comm  - on master ranks, a communicator that includes the master ranks of each omp_comm;
                       on slave ranks, MPI_COMM_NULL is returned
-   is_omp_master    - true if the calling process is an OMP master rank

    Notes: any output parameter can be NULL; such a parameter is simply ignored
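
    Example Usage:
    A minimal sketch, assuming ierr is declared; slave ranks receive MPI_COMM_NULL for omp_master_comm
.vb
      MPI_Comm  master_comm;
      PetscBool is_master;
      ierr = PetscOmpCtrlGetOmpComms(ctrl,NULL,&master_comm,&is_master);CHKERRQ(ierr);
      if (is_master) { /* master_comm is a valid communicator only here */ }
.ve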

    Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlBarrier - Perform a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    This is a pthread barrier on MPI processes. Using MPI_Barrier instead would be conceptually correct, but the MPI standard does not
    require processes blocked in MPI_Barrier to free their CPUs to let other processes progress. In practice, to minimize latency,
    MPI processes stuck in MPI_Barrier keep polling and do not free their CPUs. In contrast, a process waiting in pthread_barrier_wait
    typically sleeps and releases its CPU.

    A code using PetscOmpCtrlBarrier() looks like this:
.vb
    if (is_omp_master) {
      PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
      Call the library using OpenMP
      PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
    }
    PetscOmpCtrlBarrier(ctrl);
.ve

    Level: developer

.seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  int err;

  PetscFunctionBegin;
  err = pthread_barrier_wait(ctrl->barrier);
  if (err && err != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n",err);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to determine whether the calling rank is a master rank.
    This function changes the CPU binding of master ranks and the number of threads used by the OpenMP runtime

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* bind the master to the union of the cpu sets of its omp_comm, so spawned threads can run on the slaves' idle CPUs */
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Notes:
   Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to determine whether the calling rank is a master rank.
   This function restores the CPU binding of master ranks and sets the number of OpenMP threads to 1.

   Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* restore the master's own cpu binding and return the OpenMP runtime to a single thread */
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */