1 #include <petscsys.h> /*I "petscsys.h" I*/ 2 #include <petsc/private/petscimpl.h> 3 4 struct _n_PetscShmComm { 5 PetscMPIInt *globranks; /* global ranks of each rank in the shared memory communicator */ 6 PetscMPIInt shmsize; /* size of the shared memory communicator */ 7 MPI_Comm globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */ 8 }; 9 10 /* 11 Private routine to delete internal shared memory communicator when a communicator is freed. 12 13 This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed. 14 15 Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval() 16 17 */ 18 PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_DeleteFn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state) 19 { 20 PetscShmComm p = (PetscShmComm)val; 21 22 PetscFunctionBegin; 23 PetscCallReturnMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm)); 24 PetscCallMPIReturnMPI(MPI_Comm_free(&p->shmcomm)); 25 PetscCallReturnMPI(PetscFree(p->globranks)); 26 PetscCallReturnMPI(PetscFree(val)); 27 PetscFunctionReturn(MPI_SUCCESS); 28 } 29 30 #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY 31 /* Data structures to support freeing comms created in PetscShmCommGet(). 32 Since we predict communicators passed to PetscShmCommGet() are very likely 33 either a petsc inner communicator or an MPI communicator with a linked petsc 34 inner communicator, we use a simple static array to store dupped communicators 35 on rare cases otherwise. 36 */ 37 #define MAX_SHMCOMM_DUPPED_COMMS 16 38 static PetscInt num_dupped_comms = 0; 39 static MPI_Comm shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS]; 40 static PetscErrorCode PetscShmCommDestroyDuppedComms(void) 41 { 42 PetscFunctionBegin; 43 for (PetscInt i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i])); 44 num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */ 45 PetscFunctionReturn(PETSC_SUCCESS); 46 } 47 #endif 48 49 /*@C 50 PetscShmCommGet - Returns a sub-communicator of all ranks that share a common memory 51 52 Collective. 53 54 Input Parameter: 55 . globcomm - `MPI_Comm`, which can be a user `MPI_Comm` or a PETSc inner `MPI_Comm` 56 57 Output Parameter: 58 . pshmcomm - the PETSc shared memory communicator object 59 60 Level: developer 61 62 Note: 63 When used with MPICH, MPICH must be configured with `--download-mpich-device=ch3:nemesis` 64 65 .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()` 66 @*/ 67 PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm) 68 { 69 #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY 70 MPI_Group globgroup, shmgroup; 71 PetscMPIInt *shmranks, i, flg; 72 PetscCommCounter *counter; 73 74 PetscFunctionBegin; 75 PetscAssertPointer(pshmcomm, 2); 76 /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */ 77 PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg)); 78 if (!flg) { /* globcomm is not a petsc comm */ 79 union 80 { 81 MPI_Comm comm; 82 void *ptr; 83 } ucomm; 84 /* check if globcomm already has a linked petsc inner comm */ 85 PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg)); 86 if (!flg) { 87 /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */ 88 PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS); 89 PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL)); 90 /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */ 91 if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms)); 92 shmcomm_dupped_comms[num_dupped_comms] = globcomm; 93 num_dupped_comms++; 94 } else { 95 /* otherwise, we pull out the inner comm and use it as globcomm */ 96 globcomm = ucomm.comm; 97 } 98 } 99 100 /* Check if globcomm already has an attached pshmcomm. If no, create one */ 101 PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg)); 102 if (flg) PetscFunctionReturn(PETSC_SUCCESS); 103 104 PetscCall(PetscNew(pshmcomm)); 105 (*pshmcomm)->globcomm = globcomm; 106 107 PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm)); 108 109 PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize)); 110 PetscCallMPI(MPI_Comm_group(globcomm, &globgroup)); 111 PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup)); 112 PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks)); 113 PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks)); 114 for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i; 115 PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks)); 116 PetscCall(PetscFree(shmranks)); 117 PetscCallMPI(MPI_Group_free(&globgroup)); 118 PetscCallMPI(MPI_Group_free(&shmgroup)); 119 120 for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i])); 121 PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm)); 122 PetscFunctionReturn(PETSC_SUCCESS); 123 #else 124 SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich."); 125 #endif 126 } 127 128 /*@C 129 PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator 130 131 Input Parameters: 132 + pshmcomm - the shared memory communicator object 133 - grank - the global rank 134 135 Output Parameter: 136 . lrank - the local rank, or `MPI_PROC_NULL` if it does not exist 137 138 Level: developer 139 140 Developer Notes: 141 Assumes the pshmcomm->globranks[] is sorted 142 143 It may be better to rewrite this to map multiple global ranks to local in the same function call 144 145 .seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()` 146 @*/ 147 PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank) 148 { 149 PetscMPIInt low, high, t, i; 150 PetscBool flg = PETSC_FALSE; 151 152 PetscFunctionBegin; 153 PetscAssertPointer(pshmcomm, 1); 154 PetscAssertPointer(lrank, 3); 155 *lrank = MPI_PROC_NULL; 156 if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS); 157 if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS); 158 PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL)); 159 if (flg) PetscFunctionReturn(PETSC_SUCCESS); 160 low = 0; 161 high = pshmcomm->shmsize; 162 while (high - low > 5) { 163 t = (low + high) / 2; 164 if (pshmcomm->globranks[t] > grank) high = t; 165 else low = t; 166 } 167 for (i = low; i < high; i++) { 168 if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS); 169 if (pshmcomm->globranks[i] == grank) { 170 *lrank = i; 171 PetscFunctionReturn(PETSC_SUCCESS); 172 } 173 } 174 PetscFunctionReturn(PETSC_SUCCESS); 175 } 176 177 /*@C 178 PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank 179 180 Input Parameters: 181 + pshmcomm - the shared memory communicator object 182 - lrank - the local rank in the shared memory communicator 183 184 Output Parameter: 185 . grank - the global rank in the global communicator where the shared memory communicator is built 186 187 Level: developer 188 189 .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()` 190 @*/ 191 PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank) 192 { 193 PetscFunctionBegin; 194 PetscAssertPointer(pshmcomm, 1); 195 PetscAssertPointer(grank, 3); 196 PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank); 197 *grank = pshmcomm->globranks[lrank]; 198 PetscFunctionReturn(PETSC_SUCCESS); 199 } 200 201 /*@C 202 PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory 203 204 Input Parameter: 205 . pshmcomm - PetscShmComm object obtained with PetscShmCommGet() 206 207 Output Parameter: 208 . comm - the MPI communicator 209 210 Level: developer 211 212 .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()` 213 @*/ 214 PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm) 215 { 216 PetscFunctionBegin; 217 PetscAssertPointer(pshmcomm, 1); 218 PetscAssertPointer(comm, 2); 219 *comm = pshmcomm->shmcomm; 220 PetscFunctionReturn(PETSC_SUCCESS); 221 } 222 223 #if defined(PETSC_HAVE_OPENMP_SUPPORT) 224 #include <pthread.h> 225 #include <hwloc.h> 226 #include <omp.h> 227 228 /* Use mmap() to allocate shared mmeory (for the pthread_barrier_t object) if it is available, 229 otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much 230 simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance 231 by 50%. Until the reason is found out, we use mmap() instead. 232 */ 233 #define USE_MMAP_ALLOCATE_SHARED_MEMORY 234 235 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 236 #include <sys/mman.h> 237 #include <sys/types.h> 238 #include <sys/stat.h> 239 #include <fcntl.h> 240 #endif 241 242 struct _n_PetscOmpCtrl { 243 MPI_Comm omp_comm; /* a shared memory communicator to spawn omp threads */ 244 MPI_Comm omp_master_comm; /* a communicator to give to third party libraries */ 245 PetscMPIInt omp_comm_size; /* size of omp_comm, a kind of OMP_NUM_THREADS */ 246 PetscBool is_omp_master; /* rank 0's in omp_comm */ 247 MPI_Win omp_win; /* a shared memory window containing a barrier */ 248 pthread_barrier_t *barrier; /* pointer to the barrier */ 249 hwloc_topology_t topology; 250 hwloc_cpuset_t cpuset; /* cpu bindings of omp master */ 251 hwloc_cpuset_t omp_cpuset; /* union of cpu bindings of ranks in omp_comm */ 252 }; 253 254 /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm 255 contained by the controller. 256 257 PETSc OpenMP controller users do not call this function directly. This function exists 258 only because we want to separate shared memory allocation methods from other code. 259 */ 260 static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl) 261 { 262 MPI_Aint size; 263 void *baseptr; 264 pthread_barrierattr_t attr; 265 266 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 267 int fd; 268 PetscChar pathname[PETSC_MAX_PATH_LEN]; 269 #else 270 PetscMPIInt disp_unit; 271 #endif 272 273 PetscFunctionBegin; 274 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 275 size = sizeof(pthread_barrier_t); 276 if (ctrl->is_omp_master) { 277 /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */ 278 PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN)); 279 PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN)); 280 /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */ 281 fd = mkstemp(pathname); 282 PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname); 283 PetscCallExternal(ftruncate, fd, size); 284 baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 285 PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed"); 286 PetscCallExternal(close, fd); 287 PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm)); 288 /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */ 289 PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); 290 PetscCallExternal(unlink, pathname); 291 } else { 292 PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm)); 293 fd = open(pathname, O_RDWR); 294 PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname); 295 baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 296 PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed"); 297 PetscCallExternal(close, fd); 298 PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); 299 } 300 #else 301 size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0; 302 PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win)); 303 PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr)); 304 #endif 305 ctrl->barrier = (pthread_barrier_t *)baseptr; 306 307 /* omp master initializes the barrier */ 308 if (ctrl->is_omp_master) { 309 PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size)); 310 PetscCallExternal(pthread_barrierattr_init, &attr); 311 PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */ 312 PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size); 313 PetscCallExternal(pthread_barrierattr_destroy, &attr); 314 } 315 316 /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */ 317 PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); 318 PetscFunctionReturn(PETSC_SUCCESS); 319 } 320 321 /* Destroy the pthread barrier in the PETSc OpenMP controller */ 322 static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl) 323 { 324 PetscFunctionBegin; 325 /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */ 326 PetscCallMPI(MPI_Barrier(ctrl->omp_comm)); 327 if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier); 328 329 #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP) 330 PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t)); 331 #else 332 PetscCallMPI(MPI_Win_free(&ctrl->omp_win)); 333 #endif 334 PetscFunctionReturn(PETSC_SUCCESS); 335 } 336 337 /*@C 338 PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP 339 340 Input Parameters: 341 + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in 342 - nthreads - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value 343 344 Output Parameter: 345 . pctrl - a PETSc OpenMP controller 346 347 Level: developer 348 349 Developer Note: 350 Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use 351 352 .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 353 @*/ 354 PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl) 355 { 356 PetscOmpCtrl ctrl; 357 unsigned long *cpu_ulongs = NULL; 358 PetscShmComm pshmcomm; 359 MPI_Comm shm_comm; 360 PetscMPIInt shm_rank, shm_comm_size, omp_rank, color, nr_cpu_ulongs; 361 PetscInt num_packages, num_cores; 362 363 PetscFunctionBegin; 364 PetscCall(PetscNew(&ctrl)); 365 366 /*================================================================================= 367 Init hwloc 368 ==================================================================================*/ 369 PetscCallExternal(hwloc_topology_init, &ctrl->topology); 370 #if HWLOC_API_VERSION >= 0x00020000 371 /* to filter out unneeded info and have faster hwloc_topology_load */ 372 PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE); 373 PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL); 374 #endif 375 PetscCallExternal(hwloc_topology_load, ctrl->topology); 376 377 /*================================================================================= 378 Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to 379 physically shared memory. Rank 0 of each omp_comm is called an OMP master, and 380 others are called slaves. OMP Masters make up a new comm called omp_master_comm, 381 which is usually passed to third party libraries. 382 ==================================================================================*/ 383 384 /* fetch the stored shared memory communicator */ 385 PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm)); 386 PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm)); 387 388 PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank)); 389 PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size)); 390 391 /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */ 392 if (nthreads == -1) { 393 num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE); 394 num_cores = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE); 395 nthreads = num_cores / num_packages; 396 if (nthreads > shm_comm_size) nthreads = shm_comm_size; 397 } 398 399 PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size); 400 if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size)); 401 402 /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if 403 shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get 404 color 1. They are put in two omp_comms. Note that petsc_ranks may or may not 405 be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1. 406 Use 0 as key so that rank ordering wont change in new comm. 407 */ 408 color = shm_rank / nthreads; 409 PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm)); 410 411 /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */ 412 PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank)); 413 if (!omp_rank) { 414 ctrl->is_omp_master = PETSC_TRUE; /* master */ 415 color = 0; 416 } else { 417 ctrl->is_omp_master = PETSC_FALSE; /* slave */ 418 color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */ 419 } 420 PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm)); 421 422 /*================================================================================= 423 Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put 424 slave ranks in sleep and idle their CPU, so that the master can fork OMP threads 425 and run them on the idle CPUs. 426 ==================================================================================*/ 427 PetscCall(PetscOmpCtrlCreateBarrier(ctrl)); 428 429 /*================================================================================= 430 omp master logs its cpu binding (i.e., cpu set) and computes a new binding that 431 is the union of the bindings of all ranks in the omp_comm 432 =================================================================================*/ 433 434 ctrl->cpuset = hwloc_bitmap_alloc(); 435 PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed"); 436 PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS); 437 438 /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */ 439 nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8; 440 PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs)); 441 if (nr_cpu_ulongs == 1) { 442 cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset); 443 } else { 444 for (PetscInt i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i); 445 } 446 447 PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm)); 448 449 if (ctrl->is_omp_master) { 450 ctrl->omp_cpuset = hwloc_bitmap_alloc(); 451 PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed"); 452 if (nr_cpu_ulongs == 1) { 453 #if HWLOC_API_VERSION >= 0x00020000 454 PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]); 455 #else 456 hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]); 457 #endif 458 } else { 459 for (PetscInt i = 0; i < nr_cpu_ulongs; i++) { 460 #if HWLOC_API_VERSION >= 0x00020000 461 PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]); 462 #else 463 hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]); 464 #endif 465 } 466 } 467 } 468 PetscCall(PetscFree(cpu_ulongs)); 469 *pctrl = ctrl; 470 PetscFunctionReturn(PETSC_SUCCESS); 471 } 472 473 /*@C 474 PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller 475 476 Input Parameter: 477 . pctrl - a PETSc OpenMP controller 478 479 Level: developer 480 481 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 482 @*/ 483 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl) 484 { 485 PetscOmpCtrl ctrl = *pctrl; 486 487 PetscFunctionBegin; 488 hwloc_bitmap_free(ctrl->cpuset); 489 hwloc_topology_destroy(ctrl->topology); 490 PetscCall(PetscOmpCtrlDestroyBarrier(ctrl)); 491 PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm)); 492 if (ctrl->is_omp_master) { 493 hwloc_bitmap_free(ctrl->omp_cpuset); 494 PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm)); 495 } 496 PetscCall(PetscFree(ctrl)); 497 PetscFunctionReturn(PETSC_SUCCESS); 498 } 499 500 /*@C 501 PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller 502 503 Input Parameter: 504 . ctrl - a PETSc OMP controller 505 506 Output Parameters: 507 + omp_comm - a communicator that includes a master rank and slave ranks where master spawns threads 508 . omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm; 509 on slave ranks, `MPI_COMM_NULL` will be return in reality. 510 - is_omp_master - true if the calling process is an OMP master rank. 511 512 Note: 513 Any output parameter can be `NULL`. The parameter is just ignored. 514 515 Level: developer 516 517 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 518 @*/ 519 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master) 520 { 521 PetscFunctionBegin; 522 if (omp_comm) *omp_comm = ctrl->omp_comm; 523 if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm; 524 if (is_omp_master) *is_omp_master = ctrl->is_omp_master; 525 PetscFunctionReturn(PETSC_SUCCESS); 526 } 527 528 /*@C 529 PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU) 530 531 Input Parameter: 532 . ctrl - a PETSc OMP controller 533 534 Notes: 535 this is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not 536 require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minilize latency, 537 MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement. 538 539 A code using `PetscOmpCtrlBarrier()` would be like this, 540 .vb 541 if (is_omp_master) { 542 PetscOmpCtrlOmpRegionOnMasterBegin(ctrl); 543 Call the library using OpenMP 544 PetscOmpCtrlOmpRegionOnMasterEnd(ctrl); 545 } 546 PetscOmpCtrlBarrier(ctrl); 547 .ve 548 549 Level: developer 550 551 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, 552 @*/ 553 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl) 554 { 555 int err; 556 557 PetscFunctionBegin; 558 err = pthread_barrier_wait(ctrl->barrier); 559 PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err); 560 PetscFunctionReturn(PETSC_SUCCESS); 561 } 562 563 /*@C 564 PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks 565 566 Input Parameter: 567 . ctrl - a PETSc OMP controller 568 569 Note: 570 Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank. 571 This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime 572 573 Level: developer 574 575 .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()` 576 @*/ 577 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl) 578 { 579 PetscFunctionBegin; 580 PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS); 581 omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */ 582 PetscFunctionReturn(PETSC_SUCCESS); 583 } 584 585 /*@C 586 PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks 587 588 Input Parameter: 589 . ctrl - a PETSc OMP controller 590 591 Note: 592 Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank. 593 This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1. 594 595 Level: developer 596 597 .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()` 598 @*/ 599 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl) 600 { 601 PetscFunctionBegin; 602 PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS); 603 omp_set_num_threads(1); 604 PetscFunctionReturn(PETSC_SUCCESS); 605 } 606 607 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY 608 #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */ 609