/* petsc/src/sys/utils/openmp/mpmpishm.c */
1 #include <petscsys.h> /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 #include <pthread.h>
4 #include <hwloc.h>
5 #include <omp.h>
6 
/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found out, we use mmap() instead.
*/
12 #define USE_MMAP_ALLOCATE_SHARED_MEMORY
13 
14 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
15   #include <sys/mman.h>
16   #include <sys/types.h>
17   #include <sys/stat.h>
18   #include <fcntl.h>
19 #endif
20 
/* The PETSc OpenMP controller: state needed to park slave MPI ranks on a shared
   pthread barrier while the master rank runs OpenMP threads on their cores. */
struct _n_PetscOmpCtrl {
  MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool          is_omp_master;   /* rank 0's in omp_comm */
  MPI_Win            omp_win;         /* a shared memory window containing a barrier (only used in the MPI-3 allocation path) */
  pthread_barrier_t *barrier;         /* pointer to the barrier, living in memory shared by all ranks of omp_comm */
  hwloc_topology_t   topology;        /* hwloc view of the hardware, used for CPU binding */
  hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
  hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
};
32 
/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
 */
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  MPI_Aint              size;    /* bytes of shared memory to allocate */
  void                 *baseptr; /* address of the shared memory in the calling process */
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  int  fd;
  char pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  /* every rank needs size for its own mmap() call below */
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
    PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
    PetscCallExternal(ftruncate, fd, size); /* grow the file to the barrier's size before mapping it */
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCallExternal(close, fd); /* the mapping remains valid after the descriptor is closed */
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
    PetscCallExternal(unlink, pathname); /* unlinking now lets the OS reclaim the tmp file even if we exit abnormally */
  } else {
    /* receive the tmp file's name from master, then map the same file */
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    fd = open(pathname, O_RDWR);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCallExternal(close, fd);
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); /* matches the master's barrier above: tells master this slave has opened the file */
  }
#else
  /* MPI-3 path: only master contributes memory; slaves query the address of master's segment */
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
  PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
#endif
  ctrl->barrier = (pthread_barrier_t *)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
    PetscCallExternal(pthread_barrierattr_init, &attr);
    PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
    PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
    PetscCallExternal(pthread_barrierattr_destroy, &attr); /* the attr can be destroyed once the barrier is initialized */
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
99 
/* Destroy the pthread barrier in the PETSc OpenMP controller, then release the shared
   memory that backed it. Collective over ctrl->omp_comm. */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  /* every rank created its own mapping in PetscOmpCtrlCreateBarrier(), so every rank unmaps it */
  PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
#else
  /* frees the shared memory window allocated by MPI_Win_allocate_shared() */
  PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
115 
/*@C
  PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

  Input Parameters:
+ petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
- nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

  Output Parameter:
. pctrl - a PETSc OpenMP controller

  Level: developer

  Developer Note:
  Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use

.seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
{
  PetscOmpCtrl   ctrl;
  unsigned long *cpu_ulongs = NULL; /* packed form of a hwloc cpu bitmap, suitable for MPI reduction */
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color, nr_cpu_ulongs;
  PetscInt       num_packages, num_cores;

  PetscFunctionBegin;
  PetscCall(PetscNew(&ctrl));

  /*=================================================================================
    Init hwloc
   ==================================================================================*/
  PetscCallExternal(hwloc_topology_init, &ctrl->topology);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have faster hwloc_topology_load */
  PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
  PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
#endif
  PetscCallExternal(hwloc_topology_load, ctrl->topology);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    others are called slaves. OMP Masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
   ==================================================================================*/

  /* fetch the stored shared memory communicator */
  PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

  PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
  PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

  /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
  if (nthreads == -1) {
    /* hwloc_get_nbobjs_by_type() may return 0 or -1 when the object type is unknown; fall back to 1 in that case */
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
    nthreads     = num_cores / num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
  if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));

  /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
     shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as key so that rank ordering wont change in new comm.
   */
  color = shm_rank / nthreads;
  PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

  /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
  PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE; /* master */
    color               = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE;   /* slave */
    color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
    and run them on the idle CPUs.
   ==================================================================================*/
  PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

  /*=================================================================================
    omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc();
  PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
  PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);

  /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  /* number of unsigned longs needed to hold bits 0..last of the topology cpuset, i.e. ceil((last+1)/bits-per-ulong) */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
  PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (PetscMPIInt i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
  }

  /* bitwise-OR all ranks' packed cpu sets onto the master, yielding the union of bindings */
  PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

  /* master unpacks the reduced ulongs back into a hwloc bitmap (omp_cpuset); slaves never need it */
  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc();
    PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]); /* returns void before hwloc 2.0, so no error check */
#endif
    } else {
      for (PetscMPIInt i = 0; i < nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]); /* returns void before hwloc 2.0, so no error check */
#endif
      }
    }
  }
  PetscCall(PetscFree(cpu_ulongs));
  *pctrl = ctrl;
  PetscFunctionReturn(PETSC_SUCCESS);
}
251 
252 /*@C
253   PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
254 
255   Input Parameter:
256 . pctrl - a PETSc OpenMP controller
257 
258   Level: developer
259 
260 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
261 @*/
PetscOmpCtrlDestroy(PetscOmpCtrl * pctrl)262 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
263 {
264   PetscOmpCtrl ctrl = *pctrl;
265 
266   PetscFunctionBegin;
267   hwloc_bitmap_free(ctrl->cpuset);
268   hwloc_topology_destroy(ctrl->topology);
269   PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
270   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
271   if (ctrl->is_omp_master) {
272     hwloc_bitmap_free(ctrl->omp_cpuset);
273     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
274   }
275   PetscCall(PetscFree(ctrl));
276   PetscFunctionReturn(PETSC_SUCCESS);
277 }
278 
/*@C
  PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

  Input Parameter:
. ctrl - a PETSc OMP controller

  Output Parameters:
+ omp_comm        - a communicator that includes a master rank and slave ranks where master spawns threads
. omp_master_comm - on master ranks, return a communicator that includes master ranks of each omp_comm;
                    on slave ranks, `MPI_COMM_NULL` will be returned in reality.
- is_omp_master   - true if the calling process is an OMP master rank.

  Note:
  Any output parameter can be `NULL`. The parameter is just ignored.

  Level: developer

.seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  /* each output is optional; only fill what the caller asked for */
  if (omp_comm) *omp_comm = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
  PetscFunctionReturn(PETSC_SUCCESS);
}
306 
307 /*@C
308   PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)
309 
310   Input Parameter:
311 . ctrl - a PETSc OMP controller
312 
313   Notes:
314   This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not
315   require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minilize latency,
316   MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
317 
318   A code using `PetscOmpCtrlBarrier()` would be like this,
319 .vb
320   if (is_omp_master) {
321     PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
322     Call the library using OpenMP
323     PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
324   }
325   PetscOmpCtrlBarrier(ctrl);
326 .ve
327 
328   Level: developer
329 
330 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
331 @*/
PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)332 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
333 {
334   int err;
335 
336   PetscFunctionBegin;
337   err = pthread_barrier_wait(ctrl->barrier);
338   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
339   PetscFunctionReturn(PETSC_SUCCESS);
340 }
341 
/*@C
  PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

  Input Parameter:
. ctrl - a PETSc OMP controller

  Note:
  Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
  This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* rebind the master to the union of the cpu sets of all ranks in its omp_comm, so its OMP threads can run on the slaves' (now idle) cores */
  PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(PETSC_SUCCESS);
}
363 
/*@C
  PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

  Input Parameter:
. ctrl - a PETSc OMP controller

  Note:
  Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
  This function restores the CPU binding of master ranks and sets the nthreads-var of OpenMP runtime to 1.

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* restore the master's original binding (logged in PetscOmpCtrlCreate()) and stop spawning extra OMP threads */
  PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
  omp_set_num_threads(1);
  PetscFunctionReturn(PETSC_SUCCESS);
}
385 
386 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
387