1 #include <petscsys.h> /*I "petscsys.h" I*/ 2 #include <petsc/private/petscimpl.h> 3 #include <pthread.h> 4 #include <hwloc.h> 5 #include <omp.h> 6 7 /* Use mmap() to allocate shared mmeory (for the pthread_barrier_t object) if it is available, 8 otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much 9 simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance 10 by 50%. Until the reason is found out, we use mmap() instead. 11 */ 12 #define USE_MMAP_ALLOCATE_SHARED_MEMORY 13 14 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 15 #include <sys/mman.h> 16 #include <sys/types.h> 17 #include <sys/stat.h> 18 #include <fcntl.h> 19 #endif 20 21 struct _n_PetscOmpCtrl { 22 MPI_Comm omp_comm; /* a shared memory communicator to spawn omp threads */ 23 MPI_Comm omp_master_comm; /* a communicator to give to third party libraries */ 24 PetscMPIInt omp_comm_size; /* size of omp_comm, a kind of OMP_NUM_THREADS */ 25 PetscBool is_omp_master; /* rank 0's in omp_comm */ 26 MPI_Win omp_win; /* a shared memory window containing a barrier */ 27 pthread_barrier_t *barrier; /* pointer to the barrier */ 28 hwloc_topology_t topology; 29 hwloc_cpuset_t cpuset; /* cpu bindings of omp master */ 30 hwloc_cpuset_t omp_cpuset; /* union of cpu bindings of ranks in omp_comm */ 31 }; 32 33 /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm 34 contained by the controller. 35 36 PETSc OpenMP controller users do not call this function directly. This function exists 37 only because we want to separate shared memory allocation methods from other code. 38 */ 39 static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl) 40 { 41 MPI_Aint size; 42 void *baseptr; 43 pthread_barrierattr_t attr; 44 45 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 46 int fd; 47 char pathname[PETSC_MAX_PATH_LEN]; 48 #else 49 PetscMPIInt disp_unit; 50 #endif 51 52 PetscFunctionBegin; 53 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 54 size = sizeof(pthread_barrier_t); 55 if (ctrl->is_omp_master) { 56 /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */ 57 PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN)); 58 PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN)); 59 /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */ 60 fd = mkstemp(pathname); 61 PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname); 62 PetscCallExternal(ftruncate, fd, size); 63 baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 64 PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed"); 65 PetscCallExternal(close, fd); 66 PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm)); 67 /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */ 68 PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); 69 PetscCallExternal(unlink, pathname); 70 } else { 71 PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm)); 72 fd = open(pathname, O_RDWR); 73 PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname); 74 baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 75 PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed"); 76 PetscCallExternal(close, fd); 77 PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); 78 } 79 #else 80 size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0; 81 PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win)); 82 PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr)); 83 #endif 84 ctrl->barrier = (pthread_barrier_t *)baseptr; 85 86 /* omp master initializes the barrier */ 87 if (ctrl->is_omp_master) { 88 PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size)); 89 PetscCallExternal(pthread_barrierattr_init, &attr); 90 PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */ 91 PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size); 92 PetscCallExternal(pthread_barrierattr_destroy, &attr); 93 } 94 95 /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */ 96 PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); 97 PetscFunctionReturn(PETSC_SUCCESS); 98 } 99 100 /* Destroy the pthread barrier in the PETSc OpenMP controller */ 101 static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl) 102 { 103 PetscFunctionBegin; 104 /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */ 105 PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); 106 if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier); 107 108 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 109 PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t)); 110 #else 111 PetscCallMPI(MPI_Win_free(&ctrl->omp_win)); 112 #endif 113 PetscFunctionReturn(PETSC_SUCCESS); 114 } 115 116 /*@C 117 PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP 118 119 Input Parameters: 120 + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in 121 - nthreads - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value 122 123 Output Parameter: 124 . pctrl - a PETSc OpenMP controller 125 126 Level: developer 127 128 Developer Note: 129 Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use 130 131 .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 132 @*/ 133 PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl) 134 { 135 PetscOmpCtrl ctrl; 136 unsigned long *cpu_ulongs = NULL; 137 PetscInt i, nr_cpu_ulongs; 138 PetscShmComm pshmcomm; 139 MPI_Comm shm_comm; 140 PetscMPIInt shm_rank, shm_comm_size, omp_rank, color; 141 PetscInt num_packages, num_cores; 142 143 PetscFunctionBegin; 144 PetscCall(PetscNew(&ctrl)); 145 146 /*================================================================================= 147 Init hwloc 148 ==================================================================================*/ 149 PetscCallExternal(hwloc_topology_init, &ctrl->topology); 150 #if HWLOC_API_VERSION >= 0x00020000 151 /* to filter out unneeded info and have faster hwloc_topology_load */ 152 PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE); 153 PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL); 154 #endif 155 PetscCallExternal(hwloc_topology_load, ctrl->topology); 156 157 /*================================================================================= 158 Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to 159 physically shared memory. Rank 0 of each omp_comm is called an OMP master, and 160 others are called slaves. OMP Masters make up a new comm called omp_master_comm, 161 which is usually passed to third party libraries. 162 ==================================================================================*/ 163 164 /* fetch the stored shared memory communicator */ 165 PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm)); 166 PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm)); 167 168 PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank)); 169 PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size)); 170 171 /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */ 172 if (nthreads == -1) { 173 num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE); 174 num_cores = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE); 175 nthreads = num_cores / num_packages; 176 if (nthreads > shm_comm_size) nthreads = shm_comm_size; 177 } 178 179 PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size); 180 if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size)); 181 182 /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if 183 shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get 184 color 1. They are put in two omp_comms. Note that petsc_ranks may or may not 185 be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1. 186 Use 0 as key so that rank ordering wont change in new comm. 187 */ 188 color = shm_rank / nthreads; 189 PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm)); 190 191 /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */ 192 PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank)); 193 if (!omp_rank) { 194 ctrl->is_omp_master = PETSC_TRUE; /* master */ 195 color = 0; 196 } else { 197 ctrl->is_omp_master = PETSC_FALSE; /* slave */ 198 color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */ 199 } 200 PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm)); 201 202 /*================================================================================= 203 Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put 204 slave ranks in sleep and idle their CPU, so that the master can fork OMP threads 205 and run them on the idle CPUs. 206 ==================================================================================*/ 207 PetscCall(PetscOmpCtrlCreateBarrier(ctrl)); 208 209 /*================================================================================= 210 omp master logs its cpu binding (i.e., cpu set) and computes a new binding that 211 is the union of the bindings of all ranks in the omp_comm 212 =================================================================================*/ 213 214 ctrl->cpuset = hwloc_bitmap_alloc(); 215 PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed"); 216 PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS); 217 218 /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */ 219 nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8; 220 PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs)); 221 if (nr_cpu_ulongs == 1) { 222 cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset); 223 } else { 224 for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i); 225 } 226 227 PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm)); 228 229 if (ctrl->is_omp_master) { 230 ctrl->omp_cpuset = hwloc_bitmap_alloc(); 231 PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed"); 232 if (nr_cpu_ulongs == 1) { 233 #if HWLOC_API_VERSION >= 0x00020000 234 PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]); 235 #else 236 hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]); 237 #endif 238 } else { 239 for (i = 0; i < nr_cpu_ulongs; i++) { 240 #if HWLOC_API_VERSION >= 0x00020000 241 PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]); 242 #else 243 hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]); 244 #endif 245 } 246 } 247 } 248 PetscCall(PetscFree(cpu_ulongs)); 249 *pctrl = ctrl; 250 PetscFunctionReturn(PETSC_SUCCESS); 251 } 252 253 /*@C 254 PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller 255 256 Input Parameter: 257 . pctrl - a PETSc OpenMP controller 258 259 Level: developer 260 261 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 262 @*/ 263 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl) 264 { 265 PetscOmpCtrl ctrl = *pctrl; 266 267 PetscFunctionBegin; 268 hwloc_bitmap_free(ctrl->cpuset); 269 hwloc_topology_destroy(ctrl->topology); 270 PetscCall(PetscOmpCtrlDestroyBarrier(ctrl)); 271 PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm)); 272 if (ctrl->is_omp_master) { 273 hwloc_bitmap_free(ctrl->omp_cpuset); 274 PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm)); 275 } 276 PetscCall(PetscFree(ctrl)); 277 PetscFunctionReturn(PETSC_SUCCESS); 278 } 279 280 /*@C 281 PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller 282 283 Input Parameter: 284 . ctrl - a PETSc OMP controller 285 286 Output Parameters: 287 + omp_comm - a communicator that includes a master rank and slave ranks where master spawns threads 288 . omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm; 289 on slave ranks, `MPI_COMM_NULL` will be return in reality. 290 - is_omp_master - true if the calling process is an OMP master rank. 291 292 Note: 293 Any output parameter can be `NULL`. The parameter is just ignored. 294 295 Level: developer 296 297 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 298 @*/ 299 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master) 300 { 301 PetscFunctionBegin; 302 if (omp_comm) *omp_comm = ctrl->omp_comm; 303 if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm; 304 if (is_omp_master) *is_omp_master = ctrl->is_omp_master; 305 PetscFunctionReturn(PETSC_SUCCESS); 306 } 307 308 /*@C 309 PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU) 310 311 Input Parameter: 312 . ctrl - a PETSc OMP controller 313 314 Notes: 315 This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not 316 require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minilize latency, 317 MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement. 318 319 A code using `PetscOmpCtrlBarrier()` would be like this, 320 .vb 321 if (is_omp_master) { 322 PetscOmpCtrlOmpRegionOnMasterBegin(ctrl); 323 Call the library using OpenMP 324 PetscOmpCtrlOmpRegionOnMasterEnd(ctrl); 325 } 326 PetscOmpCtrlBarrier(ctrl); 327 .ve 328 329 Level: developer 330 331 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, 332 @*/ 333 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl) 334 { 335 int err; 336 337 PetscFunctionBegin; 338 err = pthread_barrier_wait(ctrl->barrier); 339 PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err); 340 PetscFunctionReturn(PETSC_SUCCESS); 341 } 342 343 /*@C 344 PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks 345 346 Input Parameter: 347 . ctrl - a PETSc OMP controller 348 349 Note: 350 Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank. 351 This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime 352 353 Level: developer 354 355 .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()` 356 @*/ 357 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl) 358 { 359 PetscFunctionBegin; 360 PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS); 361 omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */ 362 PetscFunctionReturn(PETSC_SUCCESS); 363 } 364 365 /*@C 366 PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks 367 368 Input Parameter: 369 . ctrl - a PETSc OMP controller 370 371 Note: 372 Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank. 373 This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1. 374 375 Level: developer 376 377 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()` 378 @*/ 379 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl) 380 { 381 PetscFunctionBegin; 382 PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS); 383 omp_set_num_threads(1); 384 PetscFunctionReturn(PETSC_SUCCESS); 385 } 386 387 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY 388