#include <petscsys.h> /*I "petscsys.h" I*/
#include <petsc/private/petscimpl.h>

/* Private record cached as an attribute on a PETSc inner MPI_Comm by PetscShmCommGet() */
struct _n_PetscShmComm {
  PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

   As an MPI attribute delete callback it must return an MPI error class (MPI_SUCCESS on success),
   which is why the body funnels every failure through PetscCallMPI rather than PetscCall.
*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
{
  PetscShmComm p = (PetscShmComm)val;

  PetscFunctionBegin;
  /* NOTE(review): PetscInfo()/PetscFree() return PetscErrorCode, not an MPI error code; wrapping
     them in PetscCallMPI conflates the two error spaces. Works in practice (0 == success in both),
     but confirm against the PetscCallReturnMPI convention used in newer PETSc. */
  PetscCallMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm));
  PetscCallMPI(MPI_Comm_free(&p->shmcomm));
  PetscCallMPI(PetscFree(p->globranks));
  PetscCallMPI(PetscFree(val));
  PetscFunctionReturn(MPI_SUCCESS);
}

#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
/* Data structures to support freeing comms created in PetscShmCommGet().
   Since we predict communicators passed to PetscShmCommGet() are very likely
   either a petsc inner communicator or an MPI communicator with a linked petsc
   inner communicator, we use a simple static array to store dupped communicators
   on rare cases otherwise.
*/
#define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt num_dupped_comms = 0; /* number of entries in use in shmcomm_dupped_comms[] */
static MPI_Comm shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];

/* Free (at PetscFinalize) the communicators this file dupped in PetscShmCommGet() */
static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
{
  PetscInt i;
  PetscFunctionBegin;
  for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*@C
  PetscShmCommGet - Returns a sub-communicator of all ranks that share a common memory

  Collective.

  Input Parameter:
. globcomm - `MPI_Comm`, which can be a user MPI_Comm or a PETSc inner MPI_Comm

  Output Parameter:
. pshmcomm - the PETSc shared memory communicator object

  Level: developer

  Note:
  When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  MPI_Group         globgroup, shmgroup;
  PetscMPIInt      *shmranks, i, flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
  if (!flg) { /* globcomm is not a petsc comm */
    union
    {
      MPI_Comm comm;
      void    *ptr;
    } ucomm; /* union because MPI_Comm may be an int or a pointer depending on the MPI implementation */
    /* check if globcomm already has a linked petsc inner comm */
    PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
      PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
      /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
      if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If no, create one */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
  if (flg) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(PetscNew(pshmcomm));
  (*pshmcomm)->globcomm = globcomm;

  /* Collective: every rank that can share memory with this one lands in the same shmcomm */
  PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));

  PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
  PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
  PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
  /* Translate local ranks 0..shmsize-1 into their global ranks; result is sorted ascending
     (relied upon by the binary search in PetscShmCommGlobalToLocal) */
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
  for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
  PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
  PetscCall(PetscFree(shmranks));
  PetscCallMPI(MPI_Group_free(&globgroup));
  PetscCallMPI(MPI_Group_free(&shmgroup));

  for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
  /* Cache on the inner comm; Petsc_ShmComm_Attr_Delete_Fn frees it when the comm is freed */
  PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
  PetscFunctionReturn(PETSC_SUCCESS);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

/*@C
  PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator

  Input Parameters:
+ pshmcomm - the shared memory communicator object
- grank    - the global rank

  Output Parameter:
. lrank - the local rank, or `MPI_PROC_NULL` if it does not exist

  Level: developer

  Developer Notes:
  Assumes the pshmcomm->globranks[] is sorted

  It may be better to rewrite this to map multiple global ranks to local in the same function call

.seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
{
  PetscMPIInt low, high, t, i;
  PetscBool   flg = PETSC_FALSE;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidIntPointer(lrank, 3);
  *lrank = MPI_PROC_NULL;
  /* quick reject: grank outside the sorted globranks[] range cannot be local */
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
  /* -noshared: debug option to pretend no rank shares memory with any other */
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
  if (flg) PetscFunctionReturn(PETSC_SUCCESS);
  /* binary search narrows the window to <= 5 entries, then a linear scan finishes */
  low  = 0;
  high = pshmcomm->shmsize;
  while (high - low > 5) {
    t = (low + high) / 2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i = low; i < high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS); /* passed where it would be: not present */
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank

  Input Parameters:
+ pshmcomm - the shared memory communicator object
- lrank    - the local rank in the shared memory communicator

  Output
 Parameter:
. grank - the global rank in the global communicator where the shared memory communicator is built

  Level: developer

.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
{
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidIntPointer(grank, 3);
  /* simple table lookup; globranks[] was filled by MPI_Group_translate_ranks in PetscShmCommGet */
  PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

  Input Parameter:
. pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

  Output Parameter:
. comm - the MPI communicator

  Level: developer

.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
{
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidPointer(comm, 2);
  /* borrowed reference: caller must not free the returned communicator */
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
  #include <pthread.h>
  #include <hwloc.h>
  #include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found out, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  #include <sys/mman.h>
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool          is_omp_master;   /* rank 0's in omp_comm */
  MPI_Win            omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;         /* pointer to the barrier */
  hwloc_topology_t   topology;
  hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
  hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
*/
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  MPI_Aint              size;
  void                 *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt  fd; /* NOTE(review): mkstemp()/close() traffic in int; PetscInt may be 64-bit — confirm intended */
  PetscChar pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
    PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
    /* NOTE(review): ftruncate/close/unlink return -1 with errno on failure; wrapping them in
       PetscCall treats -1 as a raw PetscErrorCode. Error IS detected, but the reported code is
       meaningless — consider PetscCheck(... == 0, ...) instead. */
    PetscCall(ftruncate(fd, size));
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCall(close(fd));
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
    PetscCall(unlink(pathname));
  } else {
    /* slaves: receive the path, map the same file, then let master unlink it */
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    fd = open(pathname, O_RDWR);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCall(close(fd));
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  }
#else
  /* only the master contributes memory; everyone queries the master's base address */
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
  PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
#endif
  ctrl->barrier = (pthread_barrier_t *)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
    PetscCall(pthread_barrierattr_init(&attr));
    PetscCall(pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)); /* make the barrier also work for processes */
    PetscCall(pthread_barrier_init(ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size));
    PetscCall(pthread_barrierattr_destroy(&attr));
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  if (ctrl->is_omp_master) PetscCall(pthread_barrier_destroy(ctrl->barrier));

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscCall(munmap(ctrl->barrier, sizeof(pthread_barrier_t)));
#else
  PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

  Input Parameters:
+ petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
- nthreads   - number of threads per MPI rank to spawn in a library using OpenMP.
               If nthreads = -1, let PETSc decide a suitable value

  Output Parameter:
. pctrl - a PETSc OpenMP controller

  Level: developer

  Developer Note:
  Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use

.seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
{
  PetscOmpCtrl   ctrl;
  unsigned long *cpu_ulongs = NULL;
  PetscInt       i, nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
  PetscInt       num_packages, num_cores;

  PetscFunctionBegin;
  PetscCall(PetscNew(&ctrl));

  /*=================================================================================
    Init hwloc
    ==================================================================================*/
  PetscCall(hwloc_topology_init(&ctrl->topology));
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have faster hwloc_topology_load */
  PetscCall(hwloc_topology_set_all_types_filter(ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE));
  PetscCall(hwloc_topology_set_type_filter(ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL));
#endif
  PetscCall(hwloc_topology_load(ctrl->topology));

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    others are called slaves. OMP Masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
    ==================================================================================*/

  /* fetch the stored shared memory communicator */
  PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

  PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
  PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

  /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
  if (nthreads == -1) {
    /* hwloc_get_nbobjs_by_type can return 0 or -1 when the object type is unknown; clamp to 1 */
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
    nthreads     = num_cores / num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
  if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));

  /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
     shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as key so that rank ordering wont change in new comm.
  */
  color = shm_rank / nthreads;
  PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

  /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
  PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE; /* master */
    color               = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE;  /* slave */
    color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
    and run them on the idle CPUs.
    ==================================================================================*/
  PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

  /*=================================================================================
    omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc();
  PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
  PetscCall(hwloc_get_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));

  /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  /* number of unsigned longs needed to hold the topology-wide cpuset, rounded up */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
  PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
  }

  /* bitwise-OR the packed cpusets onto the master; MPI_IN_PLACE because master's
     send data is already in its recv buffer */
  PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc();
    PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      PetscCall(hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]));
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]); /* returns void before hwloc 2.0 */
#endif
    } else {
      for (i = 0; i < nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        PetscCall(hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]));
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]); /* returns void before hwloc 2.0 */
#endif
      }
    }
  }
  PetscCall(PetscFree(cpu_ulongs));
  *pctrl = ctrl;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

  Input Parameter:
.
pctrl - a PETSc OpenMP controller 480 481 Level: developer 482 483 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 484 @*/ 485 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl) 486 { 487 PetscOmpCtrl ctrl = *pctrl; 488 489 PetscFunctionBegin; 490 hwloc_bitmap_free(ctrl->cpuset); 491 hwloc_topology_destroy(ctrl->topology); 492 PetscCall(PetscOmpCtrlDestroyBarrier(ctrl)); 493 PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm)); 494 if (ctrl->is_omp_master) { 495 hwloc_bitmap_free(ctrl->omp_cpuset); 496 PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm)); 497 } 498 PetscCall(PetscFree(ctrl)); 499 PetscFunctionReturn(PETSC_SUCCESS); 500 } 501 502 /*@C 503 PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller 504 505 Input Parameter: 506 . ctrl - a PETSc OMP controller 507 508 Output Parameters: 509 + omp_comm - a communicator that includes a master rank and slave ranks where master spawns threads 510 . omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm; 511 on slave ranks, `MPI_COMM_NULL` will be return in reality. 512 - is_omp_master - true if the calling process is an OMP master rank. 513 514 Note: 515 Any output parameter can be NULL. The parameter is just ignored. 
516 517 Level: developer 518 519 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 520 @*/ 521 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master) 522 { 523 PetscFunctionBegin; 524 if (omp_comm) *omp_comm = ctrl->omp_comm; 525 if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm; 526 if (is_omp_master) *is_omp_master = ctrl->is_omp_master; 527 PetscFunctionReturn(PETSC_SUCCESS); 528 } 529 530 /*@C 531 PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU) 532 533 Input Parameter: 534 . ctrl - a PETSc OMP controller 535 536 Notes: 537 this is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not 538 require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minilize latency, 539 MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement. 
  A code using `PetscOmpCtrlBarrier()` would be like this,
.vb
  if (is_omp_master) {
    PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
    Call the library using OpenMP
    PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
  }
  PetscOmpCtrlBarrier(ctrl);
.ve

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  int err;

  PetscFunctionBegin;
  /* pthread_barrier_wait returns 0 for all waiters except one, which gets
     PTHREAD_BARRIER_SERIAL_THREAD; both mean success */
  err = pthread_barrier_wait(ctrl->barrier);
  PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

  Input Parameter:
. ctrl - a PETSc OMP controller

  Note:
  Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
  This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* widen the master's binding to the union cpuset so spawned OMP threads can use the slaves' idle cores */
  PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS));
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

  Input Parameter:
. ctrl - a PETSc OMP controller

  Note:
  Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
  This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* restore the master's original (narrow) binding recorded in PetscOmpCtrlCreate() */
  PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));
  omp_set_num_threads(1);
  PetscFunctionReturn(PETSC_SUCCESS);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */