xref: /petsc/src/sys/utils/mpishm.c (revision cb9b7bb0b9f68fc121f797778e409e2e056048fa)
1 #include <petscsys.h>        /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
/* Maps a (PETSc inner) global communicator to the sub-communicator of ranks that
   share memory with this process. An instance is stashed on the global communicator
   under Petsc_ShmComm_keyval (see PetscShmCommGet()) and freed by
   Petsc_ShmComm_Attr_Delete_Fn() when that communicator is freed. */
struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator; indexed by shm rank, sorted ascending */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};
9 
10 /*
11    Private routine to delete internal shared memory communicator when a communicator is freed.
12 
13    This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this  data as an attribute is freed.
14 
15    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
16 
17 */
18 PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
19 {
20   PetscErrorCode  ierr;
21   PetscShmComm p = (PetscShmComm)val;
22 
23   PetscFunctionBegin;
24   ierr = PetscInfo(NULL,"Deleting shared memory subcommunicator in a MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
25   ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
26   ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
27   ierr = PetscFree(val);CHKERRMPI(ierr);
28   PetscFunctionReturn(MPI_SUCCESS);
29 }
30 
31 #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
32 /* Data structures to support freeing comms created in PetscShmCommGet().
33   Since we predict communicators passed to PetscShmCommGet() are very likely
34   either a petsc inner communicator or an MPI communicator with a linked petsc
35   inner communicator, we use a simple static array to store dupped communicators
36   on rare cases otherwise.
37  */
38 #define MAX_SHMCOMM_DUPPED_COMMS 16
39 static PetscInt       num_dupped_comms=0;
40 static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
41 static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
42 {
43   PetscErrorCode   ierr;
44   PetscInt         i;
45   PetscFunctionBegin;
46   for (i=0; i<num_dupped_comms; i++) {ierr = PetscCommDestroy(&shmcomm_dupped_comms[i]);CHKERRQ(ierr);}
47   num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
48   PetscFunctionReturn(0);
49 }
50 #endif
51 
52 /*@C
53     PetscShmCommGet - Given a communicator returns a sub-communicator of all ranks that share a common memory
54 
55     Collective.
56 
57     Input Parameter:
58 .   globcomm - MPI_Comm, which can be a user MPI_Comm or a PETSc inner MPI_Comm
59 
60     Output Parameter:
61 .   pshmcomm - the PETSc shared memory communicator object
62 
63     Level: developer
64 
65     Notes:
66        When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
67 
68 @*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRMPI(ierr);
  if (!flg) { /* globcomm is not a petsc comm */
    union {MPI_Comm comm; void *ptr;} ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    ierr = MPI_Comm_get_attr(globcomm,Petsc_InnerComm_keyval,&ucomm,&flg);CHKERRMPI(ierr);
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      PetscAssertFalse(num_dupped_comms >= MAX_SHMCOMM_DUPPED_COMMS,globcomm,PETSC_ERR_PLIB,"PetscShmCommGet() is trying to dup more than %d MPI_Comms",MAX_SHMCOMM_DUPPED_COMMS);
      ierr = PetscCommDuplicate(globcomm,&globcomm,NULL);CHKERRQ(ierr);
      /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
      if (num_dupped_comms == 0) {ierr = PetscRegisterFinalize(PetscShmCommDestroyDuppedComms);CHKERRQ(ierr);}
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If no, create one */
  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRMPI(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr        = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  /* Split off the ranks of globcomm that can share memory with this process (MPI-3) */
  ierr = MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRMPI(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRMPI(ierr);
  ierr = MPI_Comm_group(globcomm, &globgroup);CHKERRMPI(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);CHKERRMPI(ierr);
  /* Build globranks[]: for each shared memory rank i, its corresponding rank in globcomm */
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRMPI(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRMPI(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  /* Cache the new object on globcomm so later calls return it directly; it is freed
     by the keyval delete callback when globcomm is freed, never by the caller */
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRMPI(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
128 
129 /*@C
130     PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator
131 
132     Input Parameters:
133 +   pshmcomm - the shared memory communicator object
134 -   grank    - the global rank
135 
136     Output Parameter:
137 .   lrank - the local rank, or MPI_PROC_NULL if it does not exist
138 
139     Level: developer
140 
141     Developer Notes:
142     Assumes the pshmcomm->globranks[] is sorted
143 
144     It may be better to rewrite this to map multiple global ranks to local in the same function call
145 
146 @*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,1);
  PetscValidPointer(lrank,3);
  *lrank = MPI_PROC_NULL;
  /* globranks[] is sorted (see Developer Notes), so ranks outside
     [globranks[0], globranks[shmsize-1]] cannot be in the table */
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  /* with -noshared every lookup reports MPI_PROC_NULL, i.e. pretend no rank shares memory with any other */
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  /* binary search narrows the window [low,high) to at most 5 entries, then a linear scan finishes */
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0); /* passed where grank would be: not present */
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}
177 
178 /*@C
179     PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank
180 
181     Input Parameters:
182 +   pshmcomm - the shared memory communicator object
183 -   lrank    - the local rank in the shared memory communicator
184 
185     Output Parameter:
186 .   grank - the global rank in the global communicator where the shared memory communicator is built
187 
188     Level: developer
189 
190 @*/
191 PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
192 {
193   PetscFunctionBegin;
194   PetscValidPointer(pshmcomm,1);
195   PetscValidPointer(grank,3);
196   PetscAssertFalse(PetscUnlikely((lrank < 0) || (lrank >= pshmcomm->shmsize)),PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
197   *grank = pshmcomm->globranks[lrank];
198   PetscFunctionReturn(0);
199 }
200 
201 /*@C
202     PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
203 
204     Input Parameter:
205 .   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()
206 
207     Output Parameter:
208 .   comm     - the MPI communicator
209 
210     Level: developer
211 
212 @*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,1);
  PetscValidPointer(comm,2);
  /* borrowed reference: the communicator is owned by pshmcomm and freed with it
     (see Petsc_ShmComm_Attr_Delete_Fn()); the caller must not MPI_Comm_free() it */
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}
221 
222 #if defined(PETSC_HAVE_OPENMP_SUPPORT)
223 #include <pthread.h>
224 #include <hwloc.h>
225 #include <omp.h>
226 
227 /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
228    otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
229    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
230    by 50%. Until the reason is found out, we use mmap() instead.
231 */
232 #define USE_MMAP_ALLOCATE_SHARED_MEMORY
233 
234 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
235 #include <sys/mman.h>
236 #include <sys/types.h>
237 #include <sys/stat.h>
238 #include <fcntl.h>
239 #endif
240 
struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* rank 0's in omp_comm */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier; only used when the MPI_Win_allocate_shared path is taken */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;        /* hwloc topology of the node, used to query and set cpu bindings */
  hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};
252 
253 /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
254    contained by the controller.
255 
256    PETSc OpenMP controller users do not call this function directly. This function exists
257    only because we want to separate shared memory allocation methods from other code.
258  */
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t  attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt              fd;
  PetscChar             pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt           disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  /* mmap path: master creates a temp file, every rank maps it, then the file is unlinked */
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    ierr    = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    ierr    = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd      = mkstemp(pathname); PetscAssertFalse(fd == -1,PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp", pathname);
    ierr    = ftruncate(fd,size);CHKERRQ(ierr);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); PetscAssertFalse(baseptr == MAP_FAILED,PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRMPI(ierr);
    /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
    ierr    = unlink(pathname);CHKERRQ(ierr);
  } else {
    /* slaves receive the file name from the master, map the same file, and signal the master via the barrier */
    ierr    = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRMPI(ierr);
    fd      = open(pathname,O_RDWR); PetscAssertFalse(fd == -1,PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s", pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); PetscAssertFalse(baseptr == MAP_FAILED,PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed");
    ierr    = close(fd);CHKERRQ(ierr);
    ierr    = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  }
#else
  /* MPI-3 path: only the master contributes memory to the window; every rank queries rank 0's base address */
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRMPI(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRMPI(ierr);
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRMPI(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  PetscFunctionReturn(0);
}
316 
317 /* Destroy the pthread barrier in the PETSc OpenMP controller */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  /* every rank mapped the barrier in PetscOmpCtrlCreateBarrier(), so every rank unmaps it */
  ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
#else
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRMPI(ierr);
#endif
  PetscFunctionReturn(0);
}
334 
335 /*@C
336     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries using OpenMP
337 
338     Input Parameters:
339 +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
340 -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
341 
342     Output Parameter:
343 .   pctrl      - a PETSc OpenMP controller
344 
345     Level: developer
346 
347     TODO: Possibly use the variable PetscNumOMPThreads to determine the number for threads to use
348 
349 .seealso PetscOmpCtrlDestroy()
350 @*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode        ierr;
  PetscOmpCtrl          ctrl;
  unsigned long         *cpu_ulongs=NULL;
  PetscInt              i,nr_cpu_ulongs;
  PetscShmComm          pshmcomm;
  MPI_Comm              shm_comm;
  PetscMPIInt           shm_rank,shm_comm_size,omp_rank,color;
  PetscInt              num_packages,num_cores;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have faster hwloc_topology_load */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    others are called slaves. OMP Masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRMPI(ierr);

  /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
  if (nthreads == -1) {
    /* hwloc_get_nbobjs_by_type() can return <= 0 when the object type is unknown; fall back to 1 */
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 :  hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
    nthreads     = num_cores/num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscAssertFalse(nthreads < 1 || nthreads > shm_comm_size,petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
     shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as key so that rank ordering won't change in new comm.
   */
  color = shm_rank / nthreads;
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRMPI(ierr);

  /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRMPI(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRMPI(ierr);

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc(); PetscAssertFalse(!ctrl->cpuset,PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed");
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  /* number of unsigned longs needed to hold the node's full cpuset; hwloc_bitmap_last() returns the index of the highest set bit */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

  /* bitwise-OR the packed cpusets of all ranks in omp_comm onto the master (rank 0) */
  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRMPI(ierr);

  /* master unpacks the reduced ulongs back into omp_cpuset; hwloc 2.x versions of these calls return an error code, 1.x return void */
  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); PetscAssertFalse(!ctrl->omp_cpuset,PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
#endif
    } else {
      for (i=0; i<nr_cpu_ulongs; i++)  {
#if HWLOC_API_VERSION >= 0x00020000
        ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
#endif
      }
    }
  }

  ierr = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}
470 
471 /*@C
472     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
473 
474     Input Parameter:
475 .   pctrl  - a PETSc OpenMP controller
476 
477     Level: developer
478 
479 .seealso PetscOmpCtrlCreate()
480 @*/
481 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
482 {
483   PetscErrorCode  ierr;
484   PetscOmpCtrl    ctrl = *pctrl;
485 
486   PetscFunctionBegin;
487   hwloc_bitmap_free(ctrl->cpuset);
488   hwloc_topology_destroy(ctrl->topology);
489   PetscOmpCtrlDestroyBarrier(ctrl);
490   ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRMPI(ierr);
491   if (ctrl->is_omp_master) {
492     hwloc_bitmap_free(ctrl->omp_cpuset);
493     ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRMPI(ierr);
494   }
495   ierr = PetscFree(ctrl);CHKERRQ(ierr);
496   PetscFunctionReturn(0);
497 }
498 
499 /*@C
500     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
501 
502     Input Parameter:
503 .   ctrl - a PETSc OMP controller
504 
505     Output Parameters:
506 +   omp_comm         - a communicator that includes a master rank and slave ranks where master spawns threads
507 .   omp_master_comm  - on master ranks, return a communicator that include master ranks of each omp_comm;
508                        on slave ranks, MPI_COMM_NULL will be returned.
509 -   is_omp_master    - true if the calling process is an OMP master rank.
510 
511     Notes: any output parameter can be NULL. The parameter is just ignored.
512 
513     Level: developer
514 @*/
515 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
516 {
517   PetscFunctionBegin;
518   if (omp_comm)        *omp_comm        = ctrl->omp_comm;
519   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
520   if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
521   PetscFunctionReturn(0);
522 }
523 
524 /*@C
525     PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)
526 
527     Input Parameter:
528 .   ctrl - a PETSc OMP controller
529 
530     Notes:
531     this is a pthread barrier on MPI processes. Using MPI_Barrier instead is conceptually correct. But MPI standard does not
532     require processes blocked by MPI_Barrier free their CPUs to let other processes progress. In practice, to minilize latency,
533     MPI processes stuck in MPI_Barrier keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
534 
535     A code using PetscOmpCtrlBarrier() would be like this,
536 
537     if (is_omp_master) {
538       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
539       Call the library using OpenMP
540       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
541     }
542     PetscOmpCtrlBarrier(ctrl);
543 
544     Level: developer
545 
546 .seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
547 @*/
548 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
549 {
550   int err;
551 
552   PetscFunctionBegin;
553   err = pthread_barrier_wait(ctrl->barrier);
554   PetscAssertFalse(err && err != PTHREAD_BARRIER_SERIAL_THREAD,PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %" PetscInt_FMT, err);
555   PetscFunctionReturn(0);
556 }
557 
558 /*@C
559     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
560 
561     Input Parameter:
562 .   ctrl - a PETSc OMP controller
563 
564     Notes:
565     Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
566     This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime
567 
568     Level: developer
569 
570 .seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
571 @*/
572 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
573 {
574   PetscErrorCode ierr;
575 
576   PetscFunctionBegin;
577   ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
578   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
579   PetscFunctionReturn(0);
580 }
581 
582 /*@C
583    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
584 
585    Input Parameter:
586 .  ctrl - a PETSc OMP controller
587 
588    Notes:
589    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
590    This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.
591 
592    Level: developer
593 
594 .seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
595 @*/
596 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
597 {
598   PetscErrorCode ierr;
599 
600   PetscFunctionBegin;
601   ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
602   omp_set_num_threads(1);
603   PetscFunctionReturn(0);
604 }
605 
606 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
607 #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */
608