1 #include <petscsys.h> /*I "petscsys.h" I*/ 2 #include <petsc/private/petscimpl.h> 3 #include <pthread.h> 4 #include <hwloc.h> 5 #include <omp.h> 6 7 /* Use mmap() to allocate shared mmeory (for the pthread_barrier_t object) if it is available, 8 otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much 9 simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance 10 by 50%. Until the reason is found out, we use mmap() instead. 11 */ 12 #define USE_MMAP_ALLOCATE_SHARED_MEMORY 13 14 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 15 #include <sys/mman.h> 16 #include <sys/types.h> 17 #include <sys/stat.h> 18 #include <fcntl.h> 19 #endif 20 21 struct _n_PetscOmpCtrl { 22 MPI_Comm omp_comm; /* a shared memory communicator to spawn omp threads */ 23 MPI_Comm omp_master_comm; /* a communicator to give to third party libraries */ 24 PetscMPIInt omp_comm_size; /* size of omp_comm, a kind of OMP_NUM_THREADS */ 25 PetscBool is_omp_master; /* rank 0's in omp_comm */ 26 MPI_Win omp_win; /* a shared memory window containing a barrier */ 27 pthread_barrier_t *barrier; /* pointer to the barrier */ 28 hwloc_topology_t topology; 29 hwloc_cpuset_t cpuset; /* cpu bindings of omp master */ 30 hwloc_cpuset_t omp_cpuset; /* union of cpu bindings of ranks in omp_comm */ 31 }; 32 33 /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm 34 contained by the controller. 35 36 PETSc OpenMP controller users do not call this function directly. This function exists 37 only because we want to separate shared memory allocation methods from other code. 38 */ 39 static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl) 40 { 41 MPI_Aint size; 42 void *baseptr; 43 pthread_barrierattr_t attr; 44 45 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 46 int fd; 47 char pathname[PETSC_MAX_PATH_LEN]; 48 #else 49 PetscMPIInt disp_unit; 50 #endif 51 52 PetscFunctionBegin; 53 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 54 size = sizeof(pthread_barrier_t); 55 if (ctrl->is_omp_master) { 56 /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */ 57 PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN)); 58 PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN)); 59 /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */ 60 fd = mkstemp(pathname); 61 PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname); 62 PetscCallExternal(ftruncate, fd, size); 63 baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 64 PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed"); 65 PetscCallExternal(close, fd); 66 PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm)); 67 /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */ 68 PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); 69 PetscCallExternal(unlink, pathname); 70 } else { 71 PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm)); 72 fd = open(pathname, O_RDWR); 73 PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname); 74 baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 75 PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed"); 76 PetscCallExternal(close, fd); 77 PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); 78 } 79 #else 80 size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0; 81 PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win)); 82 PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr)); 83 #endif 84 ctrl->barrier = (pthread_barrier_t *)baseptr; 85 86 /* omp master initializes the barrier */ 87 if (ctrl->is_omp_master) { 88 PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size)); 89 PetscCallExternal(pthread_barrierattr_init, &attr); 90 PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */ 91 PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size); 92 PetscCallExternal(pthread_barrierattr_destroy, &attr); 93 } 94 95 /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */ 96 PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); 97 PetscFunctionReturn(PETSC_SUCCESS); 98 } 99 100 /* Destroy the pthread barrier in the PETSc OpenMP controller */ 101 static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl) 102 { 103 PetscFunctionBegin; 104 /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */ 105 PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); 106 if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier); 107 108 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 109 PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t)); 110 #else 111 PetscCallMPI(MPI_Win_free(&ctrl->omp_win)); 112 #endif 113 PetscFunctionReturn(PETSC_SUCCESS); 114 } 115 116 /*@C 117 PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP 118 119 Input Parameters: 120 + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in 121 - nthreads - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value 122 123 Output Parameter: 124 . pctrl - a PETSc OpenMP controller 125 126 Level: developer 127 128 Developer Note: 129 Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use 130 131 .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 132 @*/ 133 PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl) 134 { 135 PetscOmpCtrl ctrl; 136 unsigned long *cpu_ulongs = NULL; 137 PetscShmComm pshmcomm; 138 MPI_Comm shm_comm; 139 PetscMPIInt shm_rank, shm_comm_size, omp_rank, color, nr_cpu_ulongs; 140 PetscInt num_packages, num_cores; 141 142 PetscFunctionBegin; 143 PetscCall(PetscNew(&ctrl)); 144 145 /*================================================================================= 146 Init hwloc 147 ==================================================================================*/ 148 PetscCallExternal(hwloc_topology_init, &ctrl->topology); 149 #if HWLOC_API_VERSION >= 0x00020000 150 /* to filter out unneeded info and have faster hwloc_topology_load */ 151 PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE); 152 PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL); 153 #endif 154 PetscCallExternal(hwloc_topology_load, ctrl->topology); 155 156 /*================================================================================= 157 Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to 158 physically shared memory. Rank 0 of each omp_comm is called an OMP master, and 159 others are called slaves. OMP Masters make up a new comm called omp_master_comm, 160 which is usually passed to third party libraries. 161 ==================================================================================*/ 162 163 /* fetch the stored shared memory communicator */ 164 PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm)); 165 PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm)); 166 167 PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank)); 168 PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size)); 169 170 /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */ 171 if (nthreads == -1) { 172 num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE); 173 num_cores = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE); 174 nthreads = num_cores / num_packages; 175 if (nthreads > shm_comm_size) nthreads = shm_comm_size; 176 } 177 178 PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size); 179 if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size)); 180 181 /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if 182 shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get 183 color 1. They are put in two omp_comms. Note that petsc_ranks may or may not 184 be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1. 185 Use 0 as key so that rank ordering wont change in new comm. 186 */ 187 color = shm_rank / nthreads; 188 PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm)); 189 190 /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */ 191 PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank)); 192 if (!omp_rank) { 193 ctrl->is_omp_master = PETSC_TRUE; /* master */ 194 color = 0; 195 } else { 196 ctrl->is_omp_master = PETSC_FALSE; /* slave */ 197 color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */ 198 } 199 PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm)); 200 201 /*================================================================================= 202 Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put 203 slave ranks in sleep and idle their CPU, so that the master can fork OMP threads 204 and run them on the idle CPUs. 205 ==================================================================================*/ 206 PetscCall(PetscOmpCtrlCreateBarrier(ctrl)); 207 208 /*================================================================================= 209 omp master logs its cpu binding (i.e., cpu set) and computes a new binding that 210 is the union of the bindings of all ranks in the omp_comm 211 =================================================================================*/ 212 213 ctrl->cpuset = hwloc_bitmap_alloc(); 214 PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed"); 215 PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS); 216 217 /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */ 218 nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8; 219 PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs)); 220 if (nr_cpu_ulongs == 1) { 221 cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset); 222 } else { 223 for (PetscMPIInt i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i); 224 } 225 226 PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm)); 227 228 if (ctrl->is_omp_master) { 229 ctrl->omp_cpuset = hwloc_bitmap_alloc(); 230 PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed"); 231 if (nr_cpu_ulongs == 1) { 232 #if HWLOC_API_VERSION >= 0x00020000 233 PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]); 234 #else 235 hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]); 236 #endif 237 } else { 238 for (PetscMPIInt i = 0; i < nr_cpu_ulongs; i++) { 239 #if HWLOC_API_VERSION >= 0x00020000 240 PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]); 241 #else 242 hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]); 243 #endif 244 } 245 } 246 } 247 PetscCall(PetscFree(cpu_ulongs)); 248 *pctrl = ctrl; 249 PetscFunctionReturn(PETSC_SUCCESS); 250 } 251 252 /*@C 253 PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller 254 255 Input Parameter: 256 . pctrl - a PETSc OpenMP controller 257 258 Level: developer 259 260 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 261 @*/ 262 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl) 263 { 264 PetscOmpCtrl ctrl = *pctrl; 265 266 PetscFunctionBegin; 267 hwloc_bitmap_free(ctrl->cpuset); 268 hwloc_topology_destroy(ctrl->topology); 269 PetscCall(PetscOmpCtrlDestroyBarrier(ctrl)); 270 PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm)); 271 if (ctrl->is_omp_master) { 272 hwloc_bitmap_free(ctrl->omp_cpuset); 273 PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm)); 274 } 275 PetscCall(PetscFree(ctrl)); 276 PetscFunctionReturn(PETSC_SUCCESS); 277 } 278 279 /*@C 280 PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller 281 282 Input Parameter: 283 . ctrl - a PETSc OMP controller 284 285 Output Parameters: 286 + omp_comm - a communicator that includes a master rank and slave ranks where master spawns threads 287 . omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm; 288 on slave ranks, `MPI_COMM_NULL` will be return in reality. 289 - is_omp_master - true if the calling process is an OMP master rank. 290 291 Note: 292 Any output parameter can be `NULL`. The parameter is just ignored. 293 294 Level: developer 295 296 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 297 @*/ 298 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master) 299 { 300 PetscFunctionBegin; 301 if (omp_comm) *omp_comm = ctrl->omp_comm; 302 if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm; 303 if (is_omp_master) *is_omp_master = ctrl->is_omp_master; 304 PetscFunctionReturn(PETSC_SUCCESS); 305 } 306 307 /*@C 308 PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU) 309 310 Input Parameter: 311 . ctrl - a PETSc OMP controller 312 313 Notes: 314 This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not 315 require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minilize latency, 316 MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement. 317 318 A code using `PetscOmpCtrlBarrier()` would be like this, 319 .vb 320 if (is_omp_master) { 321 PetscOmpCtrlOmpRegionOnMasterBegin(ctrl); 322 Call the library using OpenMP 323 PetscOmpCtrlOmpRegionOnMasterEnd(ctrl); 324 } 325 PetscOmpCtrlBarrier(ctrl); 326 .ve 327 328 Level: developer 329 330 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, 331 @*/ 332 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl) 333 { 334 int err; 335 336 PetscFunctionBegin; 337 err = pthread_barrier_wait(ctrl->barrier); 338 PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err); 339 PetscFunctionReturn(PETSC_SUCCESS); 340 } 341 342 /*@C 343 PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks 344 345 Input Parameter: 346 . ctrl - a PETSc OMP controller 347 348 Note: 349 Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank. 350 This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime 351 352 Level: developer 353 354 .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()` 355 @*/ 356 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl) 357 { 358 PetscFunctionBegin; 359 PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS); 360 omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */ 361 PetscFunctionReturn(PETSC_SUCCESS); 362 } 363 364 /*@C 365 PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks 366 367 Input Parameter: 368 . ctrl - a PETSc OMP controller 369 370 Note: 371 Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank. 372 This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1. 373 374 Level: developer 375 376 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()` 377 @*/ 378 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl) 379 { 380 PetscFunctionBegin; 381 PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS); 382 omp_set_num_threads(1); 383 PetscFunctionReturn(PETSC_SUCCESS); 384 } 385 386 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY 387