xref: /petsc/src/sys/utils/mpishm.c (revision 34136279daf4c5803e26cd9ecf5a2cf0d75aa7fe)
1 #include <petscsys.h>        /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
/* Mapping between a PETSc (global) communicator and the MPI shared-memory sub-communicator split from it,
   cached as an attribute on the global communicator by PetscShmCommGet() */
struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};
9 
10 /*
11    Private routine to delete internal tag/name shared memory communicator when a communicator is freed.
12 
   This is called by MPI, not by users. It is invoked by MPI_Comm_free() when the communicator that has this data as an attribute is freed.
14 
15    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
16 
17 */
18 PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
19 {
20   PetscErrorCode  ierr;
21   PetscShmComm p = (PetscShmComm)val;
22 
23   PetscFunctionBegin;
24   ierr = PetscInfo1(0,"Deleting shared memory subcommunicator in a MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
25   ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
26   ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
27   ierr = PetscFree(val);CHKERRMPI(ierr);
28   PetscFunctionReturn(MPI_SUCCESS);
29 }
30 
31 /*@C
32     PetscShmCommGet - Given a PETSc communicator returns a communicator of all ranks that share a common memory
33 
34 
35     Collective on comm.
36 
37     Input Parameter:
38 .   globcomm - MPI_Comm
39 
40     Output Parameter:
41 .   pshmcomm - the PETSc shared memory communicator object
42 
43     Level: developer
44 
45     Notes:
    This should be called only with a PetscCommDuplicate() communicator
47 
48            When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
49 
50     Concepts: MPI subcomm^numbering
51 
52 @*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  /* only PETSc communicators (created via PetscCommDuplicate()) carry the counter attribute */
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");

  /* if this communicator was processed before, return the cached object */
  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr        = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  /* collective: split globcomm into sub-communicators whose ranks can share memory; key 0 keeps globcomm rank order */
  ierr = MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
  ierr = MPI_Comm_group(globcomm, &globgroup);CHKERRQ(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  /* translate shared-memory ranks 0..shmsize-1 to their ranks in globcomm; since the split above
     used key 0, globranks[] comes out sorted ascending (PetscShmCommGlobalToLocal() relies on this) */
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);CHKERRQ(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  /* cache the result on globcomm; Petsc_DelComm_Shm() frees it when the communicator is destroyed */
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
93 
94 /*@C
95     PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator
96 
97     Input Parameters:
98 +   pshmcomm - the shared memory communicator object
99 -   grank    - the global rank
100 
101     Output Parameter:
102 .   lrank - the local rank, or MPI_PROC_NULL if it does not exist
103 
104     Level: developer
105 
106     Developer Notes:
107     Assumes the pshmcomm->globranks[] is sorted
108 
109     It may be better to rewrite this to map multiple global ranks to local in the same function call
110 
111     Concepts: MPI subcomm^numbering
112 
113 @*/
114 PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
115 {
116   PetscMPIInt    low,high,t,i;
117   PetscBool      flg = PETSC_FALSE;
118   PetscErrorCode ierr;
119 
120   PetscFunctionBegin;
121   *lrank = MPI_PROC_NULL;
122   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
123   if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
124   ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
125   if (flg) PetscFunctionReturn(0);
126   low  = 0;
127   high = pshmcomm->shmsize;
128   while (high-low > 5) {
129     t = (low+high)/2;
130     if (pshmcomm->globranks[t] > grank) high = t;
131     else low = t;
132   }
133   for (i=low; i<high; i++) {
134     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
135     if (pshmcomm->globranks[i] == grank) {
136       *lrank = i;
137       PetscFunctionReturn(0);
138     }
139   }
140   PetscFunctionReturn(0);
141 }
142 
143 /*@C
144     PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank
145 
146     Input Parameters:
147 +   pshmcomm - the shared memory communicator object
148 -   lrank    - the local rank in the shared memory communicator
149 
150     Output Parameter:
151 .   grank - the global rank in the global communicator where the shared memory communicator is built
152 
153     Level: developer
154 
155     Concepts: MPI subcomm^numbering
156 @*/
157 PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
158 {
159   PetscFunctionBegin;
160 #ifdef PETSC_USE_DEBUG
161   {
162     PetscErrorCode ierr;
163     if (lrank < 0 || lrank >= pshmcomm->shmsize) { SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %D in the shared memory communicator",lrank);CHKERRQ(ierr); }
164   }
165 #endif
166   *grank = pshmcomm->globranks[lrank];
167   PetscFunctionReturn(0);
168 }
169 
170 /*@C
171     PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
172 
173     Input Parameter:
174 .   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
175 
176     Output Parameter:
177 .   comm     - the MPI communicator
178 
179     Level: developer
180 
181 @*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  /* return the MPI_Comm_split_type() communicator cached by PetscShmCommGet(); not a duplicate, so callers must not free it */
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}
188 
189 #if defined(PETSC_HAVE_OPENMP) && defined(PETSC_HAVE_PTHREAD) && (defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY) || defined(PETSC_HAVE_MMAP)) && defined(PETSC_HAVE_HWLOC)
190 #include <pthread.h>
191 #include <hwloc.h>
192 #include <omp.h>
193 
/* Use mmap() to allocate shared memory (for the pthread_barrierattr_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect, and MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found out, we use mmap() instead.
198 */
199 #define USE_MMAP_ALLOCATE_SHARED_MEMORY
200 
201 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
202 #include <sys/mman.h>
203 #include <sys/types.h>
204 #include <sys/stat.h>
205 #include <fcntl.h>
206 #endif
207 
/* A PETSc OpenMP controller: one master rank per omp_comm forks OMP threads onto the cores of its
   slave ranks, which meanwhile sleep on a process-shared pthread barrier (see PetscOmpCtrlCreate()) */
struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* rank 0's in omp_comm */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;        /* hardware topology, used to query and set cpu bindings */
  hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};
219 
220 
/* Allocate a pthread_barrier_t object in memory shared by all ranks of ctrl->omp_comm and set ctrl->barrier.
   Collective on ctrl->omp_comm. Uses either an mmap()ed tmp file or an MPI-3 shared memory window. */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t  attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt              fd;
  PetscChar             pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt           disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the unfinished pathname to slaves */
    ierr    = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    ierr    = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd      = mkstemp(pathname); if(fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n", pathname);
    /* grow the file to hold the barrier, then map it into this process; the fd can be closed once mapped */
    ierr    = ftruncate(fd,size);CHKERRQ(ierr);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    /* this MPI_Barrier is to wait until slaves have opened the file, so the master can safely unlink it */
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
    ierr    = unlink(pathname);CHKERRQ(ierr);
  } else {
    /* slaves: receive the path from the master, map the same file, then reach the barrier to let the master unlink it */
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    fd      = open(pathname,O_RDWR); if(fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n", pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  }
#else
  /* MPI-3 path: only the master contributes memory; all ranks then query the address of the master's (rank 0's) segment */
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
279 
/* Destroy ctrl->barrier created by PetscOmpCtrlCreateBarrier(). Collective on ctrl->omp_comm. */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  /* every rank mapped the file in PetscOmpCtrlCreateBarrier(), so every rank unmaps it */
  ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
#else
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
297 
/* Create a PETSc OpenMP controller, which manages PETSc's interaction with the OpenMP runtime */
299 PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
300 {
301   PetscErrorCode        ierr;
302   PetscOmpCtrl          ctrl;
303   unsigned long         *cpu_ulongs=NULL;
304   PetscInt              i,nr_cpu_ulongs;
305   PetscShmComm          pshmcomm;
306   MPI_Comm              shm_comm;
307   PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;
308 
309   PetscFunctionBegin;
310   ierr = PetscNew(&ctrl);CHKERRQ(ierr);
311 
312   /*=================================================================================
313     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
314     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
315     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
316     which is usually passed to third party libraries.
317    ==================================================================================*/
318 
319   /* fetch the stored shared memory communicator */
320   ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
321   ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);
322 
323   ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
324   ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);
325 
326   if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %d can not be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
327   if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %d is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }
328 
329   /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
330      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
331      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
332      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
333      Use 0 as key so that rank ordering wont change in new comm.
334    */
335   color = shm_rank / nthreads;
336   ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);
337 
338   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
339   ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
340   if (!omp_rank) {
341     ctrl->is_omp_master = PETSC_TRUE;  /* master */
342     color = 0;
343   } else {
344     ctrl->is_omp_master = PETSC_FALSE; /* slave */
345     color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
346   }
347   ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */
348 
349   /*=================================================================================
350     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
351     slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
352     and run them on the idle CPUs.
353    ==================================================================================*/
354   ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);
355 
356   /*=================================================================================
357     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
358     is the union of the bindings of all ranks in the omp_comm
359     =================================================================================*/
360   ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
361 #if HWLOC_API_VERSION >= 0x00020000
362   /* to filter out unneeded info and have faster hwloc_topology_load */
363   ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
364   ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
365 #endif
366   ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);
367 
368   ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
369   ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
370 
371   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
372   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
373   ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
374   if (nr_cpu_ulongs == 1) {
375     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
376   } else {
377     for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
378   }
379 
380   ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);
381 
382   if (ctrl->is_omp_master) {
383     ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
384     if (nr_cpu_ulongs == 1) {
385 #if HWLOC_API_VERSION >= 0x00020000
386       ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
387 #else
388       hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
389 #endif
390     } else {
391       for (i=0; i<nr_cpu_ulongs; i++)  {
392 #if HWLOC_API_VERSION >= 0x00020000
393         ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
394 #else
395         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
396 #endif
397       }
398     }
399   }
400 
401   /* all wait for the master to finish the initialization before using the barrier */
402   ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
403   ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
404   *pctrl = ctrl;
405   PetscFunctionReturn(0);
406 }
407 
/* Destroy the OpenMP controller and release everything PetscOmpCtrlCreate() acquired */
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode  ierr;
  PetscOmpCtrl    ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  /* destroy the barrier before freeing omp_comm: PetscOmpCtrlDestroyBarrier() does an MPI_Barrier on omp_comm */
  PetscOmpCtrlDestroyBarrier(ctrl);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
  /* only masters own omp_cpuset and a non-null omp_master_comm (slaves got MPI_COMM_NULL from MPI_Comm_split) */
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
425 
426 /*@C
427     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PetscOmpCtrl
428 
429     Input Parameter:
430 .   ctrl - a PetscOmpCtrl
431 
432     Output Parameter:
433 +   omp_comm         - a communicator that includes a master rank and slave ranks.
434 .   omp_master_comm  - on master ranks, return a communicator that include master ranks of each omp_comm;
                       on slave ranks, MPI_COMM_NULL will be returned.
436 -   is_omp_master    - true if the calling process is an OMP master rank.
437 
438     Level: developer
439 @*/
440 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
441 {
442   PetscFunctionBegin;
443   if (omp_comm)        *omp_comm        = ctrl->omp_comm;
444   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
445   if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
446   PetscFunctionReturn(0);
447 }
448 
449 /* a barrier in the scope of an omp_comm. Not using MPI_Barrier since it keeps polling and does not free CPUs OMP wants to use */
450 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
451 {
452   PetscErrorCode ierr;
453 
454   PetscFunctionBegin;
455   ierr = pthread_barrier_wait(ctrl->barrier);
456   if (ierr && ierr != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %D\n", ierr);
457   PetscFunctionReturn(0);
458 }
459 
/* Call this on master ranks before entering a library that uses OpenMP */
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* rebind the master to the union of cpu bindings of all ranks in omp_comm, so the OMP
     threads it forks can run on the cores of the (now sleeping) slave ranks */
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override OMP_NUM_THREADS in environment */
  PetscFunctionReturn(0);
}
470 
/* Call this on master ranks after leaving a library that uses OpenMP */
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* restore the master's original cpu binding recorded in PetscOmpCtrlCreate() */
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
480 
481 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
482 #endif /* defined(PETSC_HAVE_PTHREAD) && .. */
483