1 #include <petscsys.h> /*I "petscsys.h" I*/
2 #include <petsc/private/petscimpl.h>
3 #include <pthread.h>
4 #include <hwloc.h>
5 #include <omp.h>
6
7 /* Use mmap() to allocate shared mmeory (for the pthread_barrier_t object) if it is available,
8 otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
9 simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
10 by 50%. Until the reason is found out, we use mmap() instead.
11 */
12 #define USE_MMAP_ALLOCATE_SHARED_MEMORY
13
14 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
15 #include <sys/mman.h>
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <fcntl.h>
19 #endif
20
21 struct _n_PetscOmpCtrl {
22 MPI_Comm omp_comm; /* a shared memory communicator to spawn omp threads */
23 MPI_Comm omp_master_comm; /* a communicator to give to third party libraries */
24 PetscMPIInt omp_comm_size; /* size of omp_comm, a kind of OMP_NUM_THREADS */
25 PetscBool is_omp_master; /* rank 0's in omp_comm */
26 MPI_Win omp_win; /* a shared memory window containing a barrier */
27 pthread_barrier_t *barrier; /* pointer to the barrier */
28 hwloc_topology_t topology;
29 hwloc_cpuset_t cpuset; /* cpu bindings of omp master */
30 hwloc_cpuset_t omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
31 };
32
33 /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
34 contained by the controller.
35
36 PETSc OpenMP controller users do not call this function directly. This function exists
37 only because we want to separate shared memory allocation methods from other code.
38 */
PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)39 static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
40 {
41 MPI_Aint size;
42 void *baseptr;
43 pthread_barrierattr_t attr;
44
45 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
46 int fd;
47 char pathname[PETSC_MAX_PATH_LEN];
48 #else
49 PetscMPIInt disp_unit;
50 #endif
51
52 PetscFunctionBegin;
53 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
54 size = sizeof(pthread_barrier_t);
55 if (ctrl->is_omp_master) {
56 /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
57 PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
58 PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
59 /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
60 fd = mkstemp(pathname);
61 PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
62 PetscCallExternal(ftruncate, fd, size);
63 baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
64 PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
65 PetscCallExternal(close, fd);
66 PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
67 /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
68 PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
69 PetscCallExternal(unlink, pathname);
70 } else {
71 PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
72 fd = open(pathname, O_RDWR);
73 PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
74 baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
75 PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
76 PetscCallExternal(close, fd);
77 PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
78 }
79 #else
80 size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
81 PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
82 PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
83 #endif
84 ctrl->barrier = (pthread_barrier_t *)baseptr;
85
86 /* omp master initializes the barrier */
87 if (ctrl->is_omp_master) {
88 PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
89 PetscCallExternal(pthread_barrierattr_init, &attr);
90 PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
91 PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
92 PetscCallExternal(pthread_barrierattr_destroy, &attr);
93 }
94
95 /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
96 PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
97 PetscFunctionReturn(PETSC_SUCCESS);
98 }
99
100 /* Destroy the pthread barrier in the PETSc OpenMP controller */
PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)101 static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
102 {
103 PetscFunctionBegin;
104 /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
105 PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
106 if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);
107
108 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
109 PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
110 #else
111 PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
112 #endif
113 PetscFunctionReturn(PETSC_SUCCESS);
114 }
115
116 /*@C
117 PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP
118
119 Input Parameters:
120 + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
121 - nthreads - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
122
123 Output Parameter:
124 . pctrl - a PETSc OpenMP controller
125
126 Level: developer
127
128 Developer Note:
129 Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use
130
131 .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
132 @*/
PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl * pctrl)133 PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
134 {
135 PetscOmpCtrl ctrl;
136 unsigned long *cpu_ulongs = NULL;
137 PetscShmComm pshmcomm;
138 MPI_Comm shm_comm;
139 PetscMPIInt shm_rank, shm_comm_size, omp_rank, color, nr_cpu_ulongs;
140 PetscInt num_packages, num_cores;
141
142 PetscFunctionBegin;
143 PetscCall(PetscNew(&ctrl));
144
145 /*=================================================================================
146 Init hwloc
147 ==================================================================================*/
148 PetscCallExternal(hwloc_topology_init, &ctrl->topology);
149 #if HWLOC_API_VERSION >= 0x00020000
150 /* to filter out unneeded info and have faster hwloc_topology_load */
151 PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
152 PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
153 #endif
154 PetscCallExternal(hwloc_topology_load, ctrl->topology);
155
156 /*=================================================================================
157 Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
158 physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
159 others are called slaves. OMP Masters make up a new comm called omp_master_comm,
160 which is usually passed to third party libraries.
161 ==================================================================================*/
162
163 /* fetch the stored shared memory communicator */
164 PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
165 PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));
166
167 PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
168 PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));
169
170 /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
171 if (nthreads == -1) {
172 num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
173 num_cores = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
174 nthreads = num_cores / num_packages;
175 if (nthreads > shm_comm_size) nthreads = shm_comm_size;
176 }
177
178 PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
179 if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));
180
181 /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
182 shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
183 color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
184 be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
185 Use 0 as key so that rank ordering wont change in new comm.
186 */
187 color = shm_rank / nthreads;
188 PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));
189
190 /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
191 PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
192 if (!omp_rank) {
193 ctrl->is_omp_master = PETSC_TRUE; /* master */
194 color = 0;
195 } else {
196 ctrl->is_omp_master = PETSC_FALSE; /* slave */
197 color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
198 }
199 PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));
200
201 /*=================================================================================
202 Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
203 slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
204 and run them on the idle CPUs.
205 ==================================================================================*/
206 PetscCall(PetscOmpCtrlCreateBarrier(ctrl));
207
208 /*=================================================================================
209 omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
210 is the union of the bindings of all ranks in the omp_comm
211 =================================================================================*/
212
213 ctrl->cpuset = hwloc_bitmap_alloc();
214 PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
215 PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
216
217 /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
218 nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
219 PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
220 if (nr_cpu_ulongs == 1) {
221 cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
222 } else {
223 for (PetscMPIInt i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
224 }
225
226 PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));
227
228 if (ctrl->is_omp_master) {
229 ctrl->omp_cpuset = hwloc_bitmap_alloc();
230 PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
231 if (nr_cpu_ulongs == 1) {
232 #if HWLOC_API_VERSION >= 0x00020000
233 PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
234 #else
235 hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
236 #endif
237 } else {
238 for (PetscMPIInt i = 0; i < nr_cpu_ulongs; i++) {
239 #if HWLOC_API_VERSION >= 0x00020000
240 PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
241 #else
242 hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
243 #endif
244 }
245 }
246 }
247 PetscCall(PetscFree(cpu_ulongs));
248 *pctrl = ctrl;
249 PetscFunctionReturn(PETSC_SUCCESS);
250 }
251
252 /*@C
253 PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
254
255 Input Parameter:
256 . pctrl - a PETSc OpenMP controller
257
258 Level: developer
259
260 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
261 @*/
PetscOmpCtrlDestroy(PetscOmpCtrl * pctrl)262 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
263 {
264 PetscOmpCtrl ctrl = *pctrl;
265
266 PetscFunctionBegin;
267 hwloc_bitmap_free(ctrl->cpuset);
268 hwloc_topology_destroy(ctrl->topology);
269 PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
270 PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
271 if (ctrl->is_omp_master) {
272 hwloc_bitmap_free(ctrl->omp_cpuset);
273 PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
274 }
275 PetscCall(PetscFree(ctrl));
276 PetscFunctionReturn(PETSC_SUCCESS);
277 }
278
279 /*@C
280 PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
281
282 Input Parameter:
283 . ctrl - a PETSc OMP controller
284
285 Output Parameters:
286 + omp_comm - a communicator that includes a master rank and slave ranks where master spawns threads
287 . omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm;
288 on slave ranks, `MPI_COMM_NULL` will be return in reality.
289 - is_omp_master - true if the calling process is an OMP master rank.
290
291 Note:
292 Any output parameter can be `NULL`. The parameter is just ignored.
293
294 Level: developer
295
296 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
297 @*/
PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm * omp_comm,MPI_Comm * omp_master_comm,PetscBool * is_omp_master)298 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
299 {
300 PetscFunctionBegin;
301 if (omp_comm) *omp_comm = ctrl->omp_comm;
302 if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
303 if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
304 PetscFunctionReturn(PETSC_SUCCESS);
305 }
306
307 /*@C
308 PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)
309
310 Input Parameter:
311 . ctrl - a PETSc OMP controller
312
313 Notes:
314 This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not
315 require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minilize latency,
316 MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
317
318 A code using `PetscOmpCtrlBarrier()` would be like this,
319 .vb
320 if (is_omp_master) {
321 PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
322 Call the library using OpenMP
323 PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
324 }
325 PetscOmpCtrlBarrier(ctrl);
326 .ve
327
328 Level: developer
329
330 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
331 @*/
PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)332 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
333 {
334 int err;
335
336 PetscFunctionBegin;
337 err = pthread_barrier_wait(ctrl->barrier);
338 PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
339 PetscFunctionReturn(PETSC_SUCCESS);
340 }
341
342 /*@C
343 PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
344
345 Input Parameter:
346 . ctrl - a PETSc OMP controller
347
348 Note:
349 Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
350 This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime
351
352 Level: developer
353
354 .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
355 @*/
PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)356 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
357 {
358 PetscFunctionBegin;
359 PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
360 omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
361 PetscFunctionReturn(PETSC_SUCCESS);
362 }
363
364 /*@C
365 PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
366
367 Input Parameter:
368 . ctrl - a PETSc OMP controller
369
370 Note:
371 Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
372 This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.
373
374 Level: developer
375
376 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
377 @*/
PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)378 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
379 {
380 PetscFunctionBegin;
381 PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
382 omp_set_num_threads(1);
383 PetscFunctionReturn(PETSC_SUCCESS);
384 }
385
386 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
387