#include <petscsys.h> /*I "petscsys.h" I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state) {
  PetscShmComm p = (PetscShmComm)val;

  PetscFunctionBegin;
  PetscCallMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm));
  PetscCallMPI(MPI_Comm_free(&p->shmcomm));
  PetscCallMPI(PetscFree(p->globranks));
  PetscCallMPI(PetscFree(val));
  PetscFunctionReturn(MPI_SUCCESS);
}

#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
/* Data structures to support freeing comms created in PetscShmCommGet().
   Since we predict communicators passed to PetscShmCommGet() are very likely
   either a petsc inner communicator or an MPI communicator with a linked petsc
   inner communicator, we use a simple static array to store dupped communicators
   in the rare cases otherwise.
*/
#define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt num_dupped_comms = 0;
static MPI_Comm shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];

static PetscErrorCode PetscShmCommDestroyDuppedComms(void) {
  PetscInt i;

  PetscFunctionBegin;
  for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(0);
}
#endif
/*@C
  PetscShmCommGet - Returns a sub-communicator of all ranks that share a common memory

  Collective.

  Input Parameter:
. globcomm - `MPI_Comm`, which can be a user MPI_Comm or a PETSc inner MPI_Comm

  Output Parameter:
. pshmcomm - the PETSc shared memory communicator object

  Level: developer

  Note:
  When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm) {
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  MPI_Group         globgroup, shmgroup;
  PetscMPIInt      *shmranks, i, flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
  if (!flg) { /* globcomm is not a petsc comm */
    union
    {
      MPI_Comm comm;
      void    *ptr;
    } ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
      PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
      /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
      if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If no, create one */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
  if (flg) PetscFunctionReturn(0);

  PetscCall(PetscNew(pshmcomm));
  (*pshmcomm)->globcomm = globcomm;

  PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));

  PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
  PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
  PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
  for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
  PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
  PetscCall(PetscFree(shmranks));
  PetscCallMPI(MPI_Group_free(&globgroup));
  PetscCallMPI(MPI_Group_free(&shmgroup));

  for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
  PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
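/* A minimal usage sketch (the variable names below are illustrative and the calling code is
   assumed to have already called PetscInitialize()). The returned PetscShmComm is cached as an
   attribute of the inner communicator, so callers do not free it themselves:

     PetscShmComm shm;
     PetscMPIInt  grank, lrank;
     PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shm));
     PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &grank));
     PetscCall(PetscShmCommGlobalToLocal(shm, grank, &lrank)); // lrank: this process's rank within its node
*/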
/*@C
  PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator

  Input Parameters:
+ pshmcomm - the shared memory communicator object
- grank - the global rank

  Output Parameter:
. lrank - the local rank, or `MPI_PROC_NULL` if it does not exist

  Level: developer

  Developer Notes:
  Assumes the pshmcomm->globranks[] is sorted

  It may be better to rewrite this to map multiple global ranks to local in the same function call

.seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank) {
  PetscMPIInt low, high, t, i;
  PetscBool   flg = PETSC_FALSE;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidIntPointer(lrank, 3);
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(0);
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
  if (flg) PetscFunctionReturn(0);
  low  = 0;
  high = pshmcomm->shmsize;
  while (high - low > 5) {
    t = (low + high) / 2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i = low; i < high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

/*@C
  PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank

  Input Parameters:
+ pshmcomm - the shared memory communicator object
- lrank - the local rank in the shared memory communicator

  Output Parameter:
. grank - the global rank in the global communicator where the shared memory communicator is built

  Level: developer

.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank) {
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidIntPointer(grank, 3);
  PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}

/*@C
  PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

  Input Parameter:
. pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

  Output Parameter:
. comm - the MPI communicator

  Level: developer

.seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm) {
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidPointer(comm, 2);
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}
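/* A hedged sketch of how the communicator returned by PetscShmCommGetMpiShmComm() might be used,
   for example to place an array in node-local shared memory with an MPI-3 window (the count of
   1024 doubles and the variable names are illustrative):

     MPI_Comm shmcomm;
     MPI_Win  win;
     double  *base;
     PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shmcomm));
     PetscCallMPI(MPI_Win_allocate_shared(1024 * sizeof(double), sizeof(double), MPI_INFO_NULL, shmcomm, &base, &win));
     // ... use base; MPI_Win_shared_query() gives the base addresses of other ranks' segments ...
     PetscCallMPI(MPI_Win_free(&win));
*/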
#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found out, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool          is_omp_master;   /* rank 0's in omp_comm */
  MPI_Win            omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;         /* pointer to the barrier */
  hwloc_topology_t   topology;
  hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
  hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
*/
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl) {
  MPI_Aint              size;
  void                 *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt  fd;
  PetscChar pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
    PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
    PetscCall(ftruncate(fd, size));
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCall(close(fd));
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    /* this MPI_Barrier is to wait for slaves to open the file before master unlinks it */
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
    PetscCall(unlink(pathname));
  } else {
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    fd = open(pathname, O_RDWR);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCall(close(fd));
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  }
#else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
  PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
#endif
  ctrl->barrier = (pthread_barrier_t *)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
    PetscCall(pthread_barrierattr_init(&attr));
    PetscCall(pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)); /* make the barrier also work for processes */
    PetscCall(pthread_barrier_init(ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size));
    PetscCall(pthread_barrierattr_destroy(&attr));
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl) {
  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  if (ctrl->is_omp_master) PetscCall(pthread_barrier_destroy(ctrl->barrier));

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscCall(munmap(ctrl->barrier, sizeof(pthread_barrier_t)));
#else
  PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
#endif
  PetscFunctionReturn(0);
}
/*@C
  PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

  Input Parameters:
+ petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
- nthreads - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

  Output Parameter:
. pctrl - a PETSc OpenMP controller

  Level: developer

  Developer Note:
  Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use

.seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl) {
  PetscOmpCtrl   ctrl;
  unsigned long *cpu_ulongs = NULL;
  PetscInt       i, nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
  PetscInt       num_packages, num_cores;

  PetscFunctionBegin;
  PetscCall(PetscNew(&ctrl));

  /*=================================================================================
    Init hwloc
    ==================================================================================*/
  PetscCall(hwloc_topology_init(&ctrl->topology));
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have faster hwloc_topology_load */
  PetscCall(hwloc_topology_set_all_types_filter(ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE));
  PetscCall(hwloc_topology_set_type_filter(ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL));
#endif
  PetscCall(hwloc_topology_load(ctrl->topology));

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    others are called slaves. OMP Masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
    ==================================================================================*/

  /* fetch the stored shared memory communicator */
  PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

  PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
  PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

  /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
    nthreads     = num_cores / num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
  if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));
  /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
     shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as key so that rank ordering won't change in the new comm.
   */
  color = shm_rank / nthreads;
  PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

  /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
  PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE; /* master */
    color               = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE;   /* slave */
    color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
    and run them on the idle CPUs.
    ==================================================================================*/
  PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

  /*=================================================================================
    omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc();
  PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
  PetscCall(hwloc_get_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));

  /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
  PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
  }

  PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc();
    PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      PetscCall(hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]));
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
#endif
    } else {
      for (i = 0; i < nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        PetscCall(hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]));
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
#endif
      }
    }
  }
  PetscCall(PetscFree(cpu_ulongs));
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}
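/* A hedged sketch of the intended lifecycle (variable names are illustrative; the OpenMP-enabled
   third party call is a placeholder). It follows the pattern documented with PetscOmpCtrlBarrier()
   below:

     PetscOmpCtrl ctrl;
     PetscBool    is_master;
     PetscCall(PetscOmpCtrlCreate(PETSC_COMM_WORLD, -1, &ctrl));
     PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, NULL, &is_master));
     if (is_master) {
       PetscCall(PetscOmpCtrlOmpRegionOnMasterBegin(ctrl));
       // ... call the third party library that uses OpenMP ...
       PetscCall(PetscOmpCtrlOmpRegionOnMasterEnd(ctrl));
     }
     PetscCall(PetscOmpCtrlBarrier(ctrl)); // slaves idle here instead of spinning, freeing CPUs for the master's threads
     PetscCall(PetscOmpCtrlDestroy(&ctrl));
*/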
/*@C
  PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

  Input Parameter:
. pctrl - a PETSc OpenMP controller

  Level: developer

.seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl) {
  PetscOmpCtrl ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  PetscOmpCtrlDestroyBarrier(ctrl);
  PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
  }
  PetscCall(PetscFree(ctrl));
  PetscFunctionReturn(0);
}

/*@C
  PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

  Input Parameter:
. ctrl - a PETSc OMP controller

  Output Parameters:
+ omp_comm - a communicator that includes a master rank and slave ranks where master spawns threads
. omp_master_comm - on master ranks, return a communicator that includes the master ranks of each omp_comm;
                    on slave ranks, `MPI_COMM_NULL` will be returned
- is_omp_master - true if the calling process is an OMP master rank

  Note:
  Any output parameter can be NULL. The parameter is just ignored.

  Level: developer

.seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master) {
  PetscFunctionBegin;
  if (omp_comm) *omp_comm = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/*@C
  PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)

  Input Parameter:
. ctrl - a PETSc OMP controller

  Notes:
  This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead is conceptually correct. But the MPI standard does not
  require processes blocked by `MPI_Barrier()` to free their CPUs to let other processes progress. In practice, to minimize latency,
  MPI ranks stuck in `MPI_Barrier()` keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.

  Code using `PetscOmpCtrlBarrier()` would look like this,
.vb
  if (is_omp_master) {
    PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
    Call the library using OpenMP
    PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
  }
  PetscOmpCtrlBarrier(ctrl);
.ve

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl) {
  int err;

  PetscFunctionBegin;
  err = pthread_barrier_wait(ctrl->barrier);
  PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
  PetscFunctionReturn(0);
}
/*@C
  PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

  Input Parameter:
. ctrl - a PETSc OMP controller

  Note:
  Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
  This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime.

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl) {
  PetscFunctionBegin;
  PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS));
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(0);
}

/*@C
  PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

  Input Parameter:
. ctrl - a PETSc OMP controller

  Note:
  Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
  This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl) {
  PetscFunctionBegin;
  PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */