1 #include <petscsys.h> /*I "petscsys.h" I*/ 2 #include <petsc/private/petscimpl.h> 3 4 struct _n_PetscShmComm { 5 PetscMPIInt *globranks; /* global ranks of each rank in the shared memory communicator */ 6 PetscMPIInt shmsize; /* size of the shared memory communicator */ 7 MPI_Comm globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */ 8 }; 9 10 /* 11 Private routine to delete internal shared memory communicator when a communicator is freed. 12 13 This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed. 14 15 Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval() 16 17 */ 18 PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_DeleteFn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state) 19 { 20 PetscShmComm p = (PetscShmComm)val; 21 22 PetscFunctionBegin; 23 PetscCallMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm)); 24 PetscCallMPI(MPI_Comm_free(&p->shmcomm)); 25 PetscCallMPI(PetscFree(p->globranks)); 26 PetscCallMPI(PetscFree(val)); 27 PetscFunctionReturn(MPI_SUCCESS); 28 } 29 30 #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY 31 /* Data structures to support freeing comms created in PetscShmCommGet(). 32 Since we predict communicators passed to PetscShmCommGet() are very likely 33 either a petsc inner communicator or an MPI communicator with a linked petsc 34 inner communicator, we use a simple static array to store dupped communicators 35 on rare cases otherwise. 
*/
#define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt num_dupped_comms = 0;
static MPI_Comm shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];

/* Destroy, at PetscFinalize(), any communicators that PetscShmCommGet() had to dup */
static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
{
  PetscInt i;

  PetscFunctionBegin;
  for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*@C
  PetscShmCommGet - Returns a sub-communicator of all ranks that share a common memory

  Collective.

  Input Parameter:
. globcomm - `MPI_Comm`, which can be a user MPI_Comm or a PETSc inner MPI_Comm

  Output Parameter:
. pshmcomm - the PETSc shared memory communicator object

  Level: developer

  Note:
  When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  MPI_Group         globgroup, shmgroup;
  PetscMPIInt      *shmranks, i, flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscAssertPointer(pshmcomm, 2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
  if (!flg) { /* globcomm is not a petsc comm */
    union
    {
      MPI_Comm comm;
      void    *ptr;
    } ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
      PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
      /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
      if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If no, create one */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
  if (flg) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(PetscNew(pshmcomm));
  (*pshmcomm)->globcomm = globcomm;

  /* MPI-3: split globcomm into sub-communicators whose ranks can share memory */
  PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));

  PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
  PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
  PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
  /* Translate local ranks 0..shmsize-1 into their ranks in globcomm; PetscShmCommGlobalToLocal()
     assumes the resulting globranks[] is sorted ascending */
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
  for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
  PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
  PetscCall(PetscFree(shmranks));
  PetscCallMPI(MPI_Group_free(&globgroup));
  PetscCallMPI(MPI_Group_free(&shmgroup));

  for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
  /* Stash the new object on the (inner) comm so later calls can find it */
  PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
  PetscFunctionReturn(PETSC_SUCCESS);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

/*@C
  PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator

  Input Parameters:
+ pshmcomm - the shared memory communicator object
- grank    - the global rank

  Output Parameter:
. lrank - the local rank, or `MPI_PROC_NULL` if it does not exist

  Level: developer

  Developer Notes:
  Assumes the pshmcomm->globranks[] is sorted

  It may be better to rewrite this to map multiple global ranks to local in the same function call

.seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
{
  PetscMPIInt low, high, t, i;
  PetscBool   flg = PETSC_FALSE;

  PetscFunctionBegin;
  PetscAssertPointer(pshmcomm, 1);
  PetscAssertPointer(lrank, 3);
  *lrank = MPI_PROC_NULL;
  /* grank outside [globranks[0], globranks[shmsize-1]] cannot appear in the sorted table */
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
  /* -noshared: presumably a debugging option to pretend no two ranks share memory -- TODO confirm */
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
  if (flg) PetscFunctionReturn(PETSC_SUCCESS);
  /* binary search until the window is small, then finish with a linear scan */
  low  = 0;
  high = pshmcomm->shmsize;
  while (high - low > 5) {
    t = (low + high) / 2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i = low; i < high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS); /* passed where grank would be: not present */
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank

  Input Parameters:
+ pshmcomm - the shared memory communicator object
- lrank    - the local rank in the shared memory communicator

  Output
Parameter: 187 . grank - the global rank in the global communicator where the shared memory communicator is built 188 189 Level: developer 190 191 .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()` 192 @*/ 193 PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank) 194 { 195 PetscFunctionBegin; 196 PetscAssertPointer(pshmcomm, 1); 197 PetscAssertPointer(grank, 3); 198 PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank); 199 *grank = pshmcomm->globranks[lrank]; 200 PetscFunctionReturn(PETSC_SUCCESS); 201 } 202 203 /*@C 204 PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory 205 206 Input Parameter: 207 . pshmcomm - PetscShmComm object obtained with PetscShmCommGet() 208 209 Output Parameter: 210 . comm - the MPI communicator 211 212 Level: developer 213 214 .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()` 215 @*/ 216 PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm) 217 { 218 PetscFunctionBegin; 219 PetscAssertPointer(pshmcomm, 1); 220 PetscAssertPointer(comm, 2); 221 *comm = pshmcomm->shmcomm; 222 PetscFunctionReturn(PETSC_SUCCESS); 223 } 224 225 #if defined(PETSC_HAVE_OPENMP_SUPPORT) 226 #include <pthread.h> 227 #include <hwloc.h> 228 #include <omp.h> 229 230 /* Use mmap() to allocate shared mmeory (for the pthread_barrier_t object) if it is available, 231 otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much 232 simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance 233 by 50%. Until the reason is found out, we use mmap() instead. 
*/
  #define USE_MMAP_ALLOCATE_SHARED_MEMORY

  #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
    #include <sys/mman.h>
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <fcntl.h>
  #endif

/* State for one "OMP group": a master rank that spawns OpenMP threads plus the slave ranks whose CPUs it borrows */
struct _n_PetscOmpCtrl {
  MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool          is_omp_master;   /* rank 0's in omp_comm */
  MPI_Win            omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;         /* pointer to the barrier */
  hwloc_topology_t   topology;        /* hwloc view of the node, used for CPU binding */
  hwloc_cpuset_t     cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t     omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
*/
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  MPI_Aint              size;
  void                 *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  int       fd;
  PetscChar pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  /* mmap path: master creates a temp file, everyone maps it, then the file is unlinked */
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would
       otherwise bcast the partially populated pathname to slaves */
    PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
    PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
    PetscCallExternal(ftruncate, fd, size); /* grow the new file to hold the barrier */
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCallExternal(close, fd);
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
    PetscCallExternal(unlink, pathname);
  } else {
    /* slaves receive the path, map the same file, then let the master unlink it */
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    fd = open(pathname, O_RDWR);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCallExternal(close, fd);
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  }
#else
  /* MPI-3 path: only the master contributes memory; everyone queries the master's base address */
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
  PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
#endif
  ctrl->barrier = (pthread_barrier_t *)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
    PetscCallExternal(pthread_barrierattr_init, &attr);
    PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
    PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
    PetscCallExternal(pthread_barrierattr_destroy, &attr);
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
#else
  PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

  Input Parameters:
+ petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
- nthreads   - number of threads per MPI rank to
spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

  Output Parameter:
. pctrl - a PETSc OpenMP controller

  Level: developer

  Developer Note:
  Possibly use the variable `PetscNumOMPThreads` to determine the number for threads to use

.seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
{
  PetscOmpCtrl   ctrl;
  unsigned long *cpu_ulongs = NULL; /* packed form of the cpuset bitmap, for sending over MPI */
  PetscInt       i, nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
  PetscInt       num_packages, num_cores;

  PetscFunctionBegin;
  PetscCall(PetscNew(&ctrl));

  /*=================================================================================
    Init hwloc
    ==================================================================================*/
  PetscCallExternal(hwloc_topology_init, &ctrl->topology);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have faster hwloc_topology_load */
  PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
  PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
#endif
  PetscCallExternal(hwloc_topology_load, ctrl->topology);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    others are called slaves. OMP Masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
    ==================================================================================*/

  /* fetch the stored shared memory communicator */
  PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

  PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
  PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

  /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
  if (nthreads == -1) {
    /* hwloc_get_nbobjs_by_type() may return <= 0 when the type is unknown; clamp to 1 */
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
    nthreads     = num_cores / num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
  if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));

  /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
     shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as key so that rank ordering wont change in new comm.
  */
  color = shm_rank / nthreads;
  PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

  /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
  PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE; /* master */
    color               = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
    and run them on the idle CPUs.
    ==================================================================================*/
  PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

  /*=================================================================================
    omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc();
  PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
  PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);

  /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  /* number of unsigned longs needed to hold the whole topology cpuset bitmap */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
  PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
  }

  /* bitwise-OR all ranks' packed cpusets onto the master; MPI_IN_PLACE since master's send and recv buffers coincide */
  PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc();
    PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
#endif
    } else {
      for (i = 0; i < nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
#endif
      }
    }
  }
  PetscCall(PetscFree(cpu_ulongs));
  *pctrl = ctrl;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

  Input Parameter:
.
pctrl - a PETSc OpenMP controller 481 482 Level: developer 483 484 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 485 @*/ 486 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl) 487 { 488 PetscOmpCtrl ctrl = *pctrl; 489 490 PetscFunctionBegin; 491 hwloc_bitmap_free(ctrl->cpuset); 492 hwloc_topology_destroy(ctrl->topology); 493 PetscCall(PetscOmpCtrlDestroyBarrier(ctrl)); 494 PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm)); 495 if (ctrl->is_omp_master) { 496 hwloc_bitmap_free(ctrl->omp_cpuset); 497 PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm)); 498 } 499 PetscCall(PetscFree(ctrl)); 500 PetscFunctionReturn(PETSC_SUCCESS); 501 } 502 503 /*@C 504 PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller 505 506 Input Parameter: 507 . ctrl - a PETSc OMP controller 508 509 Output Parameters: 510 + omp_comm - a communicator that includes a master rank and slave ranks where master spawns threads 511 . omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm; 512 on slave ranks, `MPI_COMM_NULL` will be return in reality. 513 - is_omp_master - true if the calling process is an OMP master rank. 514 515 Note: 516 Any output parameter can be NULL. The parameter is just ignored. 
517 518 Level: developer 519 520 .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, 521 @*/ 522 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master) 523 { 524 PetscFunctionBegin; 525 if (omp_comm) *omp_comm = ctrl->omp_comm; 526 if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm; 527 if (is_omp_master) *is_omp_master = ctrl->is_omp_master; 528 PetscFunctionReturn(PETSC_SUCCESS); 529 } 530 531 /*@C 532 PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU) 533 534 Input Parameter: 535 . ctrl - a PETSc OMP controller 536 537 Notes: 538 this is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But MPI standard does not 539 require processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minilize latency, 540 MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement. 
  A code using `PetscOmpCtrlBarrier()` would be like this,
.vb
  if (is_omp_master) {
    PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
    Call the library using OpenMP
    PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
  }
  PetscOmpCtrlBarrier(ctrl);
.ve

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  int err;

  PetscFunctionBegin;
  /* pthread_barrier_wait() returns PTHREAD_BARRIER_SERIAL_THREAD to exactly one waiter on success;
     that value is not an error */
  err = pthread_barrier_wait(ctrl->barrier);
  PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

  Input Parameter:
. ctrl - a PETSc OMP controller

  Note:
  Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
  This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* widen the master's binding to the union of its omp_comm ranks' cpusets, so spawned threads may run on the slaves' (idle) cores */
  PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

  Input Parameter:
. ctrl - a PETSc OMP controller

  Note:
  Only master ranks can call this function.
Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
  This function restores the CPU binding of master ranks and sets the nthreads-var of OpenMP runtime to 1.

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* re-bind to the master's original (narrow) cpuset recorded at creation time */
  PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
  omp_set_num_threads(1); /* back to serial until the next OnMasterBegin */
  PetscFunctionReturn(PETSC_SUCCESS);
}

  #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */