xref: /petsc/src/sys/utils/openmp/mpmpishm.c (revision 6dd63270497ad23dcf16ae500a87ff2b2a0b7474)
1 #include <petscsys.h> /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 #include <pthread.h>
4 #include <hwloc.h>
5 #include <omp.h>
6 
7 /* Use mmap() to allocate shared mmeory (for the pthread_barrier_t object) if it is available,
8    otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
9    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
10    by 50%. Until the reason is found out, we use mmap() instead.
11 */
12 #define USE_MMAP_ALLOCATE_SHARED_MEMORY
13 
14 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
15   #include <sys/mman.h>
16   #include <sys/types.h>
17   #include <sys/stat.h>
18   #include <fcntl.h>
19 #endif
20 
21 struct _n_PetscOmpCtrl {
22   MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
23   MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
24   PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
25   PetscBool          is_omp_master;   /* rank 0's in omp_comm */
26   MPI_Win            omp_win;         /* a shared memory window containing a barrier */
27   pthread_barrier_t *barrier;         /* pointer to the barrier */
28   hwloc_topology_t   topology;
29   hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
30   hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
31 };
32 
33 /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
34    contained by the controller.
35 
36    PETSc OpenMP controller users do not call this function directly. This function exists
37    only because we want to separate shared memory allocation methods from other code.
38  */
39 static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
40 {
41   MPI_Aint              size;
42   void                 *baseptr;
43   pthread_barrierattr_t attr;
44 
45 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
46   int  fd;
47   char pathname[PETSC_MAX_PATH_LEN];
48 #else
49   PetscMPIInt disp_unit;
50 #endif
51 
52   PetscFunctionBegin;
53 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
54   size = sizeof(pthread_barrier_t);
55   if (ctrl->is_omp_master) {
56     /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
57     PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
58     PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
59     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
60     fd = mkstemp(pathname);
61     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
62     PetscCallExternal(ftruncate, fd, size);
63     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
64     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
65     PetscCallExternal(close, fd);
66     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
67     /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
68     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
69     PetscCallExternal(unlink, pathname);
70   } else {
71     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
72     fd = open(pathname, O_RDWR);
73     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
74     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
75     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
76     PetscCallExternal(close, fd);
77     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
78   }
79 #else
80   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
81   PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
82   PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
83 #endif
84   ctrl->barrier = (pthread_barrier_t *)baseptr;
85 
86   /* omp master initializes the barrier */
87   if (ctrl->is_omp_master) {
88     PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
89     PetscCallExternal(pthread_barrierattr_init, &attr);
90     PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
91     PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
92     PetscCallExternal(pthread_barrierattr_destroy, &attr);
93   }
94 
95   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
96   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
97   PetscFunctionReturn(PETSC_SUCCESS);
98 }
99 
100 /* Destroy the pthread barrier in the PETSc OpenMP controller */
101 static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
102 {
103   PetscFunctionBegin;
104   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
105   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
106   if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);
107 
108 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
109   PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
110 #else
111   PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
112 #endif
113   PetscFunctionReturn(PETSC_SUCCESS);
114 }
115 
116 /*@C
117   PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP
118 
119   Input Parameters:
120 + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
121 - nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
122 
123   Output Parameter:
124 . pctrl - a PETSc OpenMP controller
125 
126   Level: developer
127 
128   Developer Note:
129   Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use
130 
131 .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
132 @*/
133 PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
134 {
135   PetscOmpCtrl   ctrl;
136   unsigned long *cpu_ulongs = NULL;
137   PetscInt       i, nr_cpu_ulongs;
138   PetscShmComm   pshmcomm;
139   MPI_Comm       shm_comm;
140   PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
141   PetscInt       num_packages, num_cores;
142 
143   PetscFunctionBegin;
144   PetscCall(PetscNew(&ctrl));
145 
146   /*=================================================================================
147     Init hwloc
148    ==================================================================================*/
149   PetscCallExternal(hwloc_topology_init, &ctrl->topology);
150 #if HWLOC_API_VERSION >= 0x00020000
151   /* to filter out unneeded info and have faster hwloc_topology_load */
152   PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
153   PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
154 #endif
155   PetscCallExternal(hwloc_topology_load, ctrl->topology);
156 
157   /*=================================================================================
158     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
159     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
160     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
161     which is usually passed to third party libraries.
162    ==================================================================================*/
163 
164   /* fetch the stored shared memory communicator */
165   PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
166   PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));
167 
168   PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
169   PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));
170 
171   /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
172   if (nthreads == -1) {
173     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
174     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
175     nthreads     = num_cores / num_packages;
176     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
177   }
178 
179   PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
180   if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));
181 
182   /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
183      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
184      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
185      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
186      Use 0 as key so that rank ordering wont change in new comm.
187    */
188   color = shm_rank / nthreads;
189   PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));
190 
191   /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
192   PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
193   if (!omp_rank) {
194     ctrl->is_omp_master = PETSC_TRUE; /* master */
195     color               = 0;
196   } else {
197     ctrl->is_omp_master = PETSC_FALSE;   /* slave */
198     color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
199   }
200   PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));
201 
202   /*=================================================================================
203     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
204     slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
205     and run them on the idle CPUs.
206    ==================================================================================*/
207   PetscCall(PetscOmpCtrlCreateBarrier(ctrl));
208 
209   /*=================================================================================
210     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
211     is the union of the bindings of all ranks in the omp_comm
212     =================================================================================*/
213 
214   ctrl->cpuset = hwloc_bitmap_alloc();
215   PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
216   PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
217 
218   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
219   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
220   PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
221   if (nr_cpu_ulongs == 1) {
222     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
223   } else {
224     for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
225   }
226 
227   PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));
228 
229   if (ctrl->is_omp_master) {
230     ctrl->omp_cpuset = hwloc_bitmap_alloc();
231     PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
232     if (nr_cpu_ulongs == 1) {
233 #if HWLOC_API_VERSION >= 0x00020000
234       PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
235 #else
236       hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
237 #endif
238     } else {
239       for (i = 0; i < nr_cpu_ulongs; i++) {
240 #if HWLOC_API_VERSION >= 0x00020000
241         PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
242 #else
243         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
244 #endif
245       }
246     }
247   }
248   PetscCall(PetscFree(cpu_ulongs));
249   *pctrl = ctrl;
250   PetscFunctionReturn(PETSC_SUCCESS);
251 }
252 
253 /*@C
254   PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
255 
256   Input Parameter:
257 . pctrl - a PETSc OpenMP controller
258 
259   Level: developer
260 
261 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
262 @*/
263 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
264 {
265   PetscOmpCtrl ctrl = *pctrl;
266 
267   PetscFunctionBegin;
268   hwloc_bitmap_free(ctrl->cpuset);
269   hwloc_topology_destroy(ctrl->topology);
270   PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
271   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
272   if (ctrl->is_omp_master) {
273     hwloc_bitmap_free(ctrl->omp_cpuset);
274     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
275   }
276   PetscCall(PetscFree(ctrl));
277   PetscFunctionReturn(PETSC_SUCCESS);
278 }
279 
280 /*@C
281   PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
282 
283   Input Parameter:
284 . ctrl - a PETSc OMP controller
285 
286   Output Parameters:
287 + omp_comm        - a communicator that includes a master rank and slave ranks where master spawns threads
288 . omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm;
289                     on slave ranks, `MPI_COMM_NULL` will be return in reality.
290 - is_omp_master   - true if the calling process is an OMP master rank.
291 
292   Note:
293   Any output parameter can be `NULL`. The parameter is just ignored.
294 
295   Level: developer
296 
297 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
298 @*/
299 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
300 {
301   PetscFunctionBegin;
302   if (omp_comm) *omp_comm = ctrl->omp_comm;
303   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
304   if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
305   PetscFunctionReturn(PETSC_SUCCESS);
306 }
307 
308 /*@C
309   PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)
310 
311   Input Parameter:
312 . ctrl - a PETSc OMP controller
313 
314   Notes:
315   This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not
316   require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minilize latency,
317   MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
318 
319   A code using `PetscOmpCtrlBarrier()` would be like this,
320 .vb
321   if (is_omp_master) {
322     PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
323     Call the library using OpenMP
324     PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
325   }
326   PetscOmpCtrlBarrier(ctrl);
327 .ve
328 
329   Level: developer
330 
331 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
332 @*/
333 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
334 {
335   int err;
336 
337   PetscFunctionBegin;
338   err = pthread_barrier_wait(ctrl->barrier);
339   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
340   PetscFunctionReturn(PETSC_SUCCESS);
341 }
342 
343 /*@C
344   PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
345 
346   Input Parameter:
347 . ctrl - a PETSc OMP controller
348 
349   Note:
350   Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
351   This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime
352 
353   Level: developer
354 
355 .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
356 @*/
357 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
358 {
359   PetscFunctionBegin;
360   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
361   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
362   PetscFunctionReturn(PETSC_SUCCESS);
363 }
364 
365 /*@C
366   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
367 
368   Input Parameter:
369 . ctrl - a PETSc OMP controller
370 
371   Note:
372   Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
373   This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.
374 
375   Level: developer
376 
377 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
378 @*/
379 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
380 {
381   PetscFunctionBegin;
382   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
383   omp_set_num_threads(1);
384   PetscFunctionReturn(PETSC_SUCCESS);
385 }
386 
387 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
388