1 #include <petscsys.h> /*I "petscsys.h" I*/ 2 #include <petsc/private/petscimpl.h> 3 4 struct _n_PetscShmComm { 5 PetscMPIInt *globranks; /* global ranks of each rank in the shared memory communicator */ 6 PetscMPIInt shmsize; /* size of the shared memory communicator */ 7 MPI_Comm globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */ 8 }; 9 10 /* 11 Private routine to delete internal shared memory communicator when a communicator is freed. 12 13 This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed. 14 15 Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval() 16 17 */ 18 PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state) { 19 PetscShmComm p = (PetscShmComm)val; 20 21 PetscFunctionBegin; 22 PetscCallMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in a MPI_Comm %ld\n", (long)comm)); 23 PetscCallMPI(MPI_Comm_free(&p->shmcomm)); 24 PetscCallMPI(PetscFree(p->globranks)); 25 PetscCallMPI(PetscFree(val)); 26 PetscFunctionReturn(MPI_SUCCESS); 27 } 28 29 #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY 30 /* Data structures to support freeing comms created in PetscShmCommGet(). 31 Since we predict communicators passed to PetscShmCommGet() are very likely 32 either a petsc inner communicator or an MPI communicator with a linked petsc 33 inner communicator, we use a simple static array to store dupped communicators 34 on rare cases otherwise. 
*/
#define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt num_dupped_comms = 0; /* number of entries in use in shmcomm_dupped_comms[] */
static MPI_Comm shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];

/* Free, at PetscFinalize(), every communicator that PetscShmCommGet() had to duplicate */
static PetscErrorCode PetscShmCommDestroyDuppedComms(void) {
  PetscInt i;
  PetscFunctionBegin;
  for (i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(0);
}
#endif

/*@C
  PetscShmCommGet - Given a communicator returns a sub-communicator of all ranks that share a common memory

  Collective.

  Input Parameter:
. globcomm - MPI_Comm, which can be a user MPI_Comm or a PETSc inner MPI_Comm

  Output Parameter:
. pshmcomm - the PETSc shared memory communicator object

  Level: developer

  Notes:
  When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm) {
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  MPI_Group         globgroup, shmgroup;
  PetscMPIInt      *shmranks, i, flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
  if (!flg) { /* globcomm is not a petsc comm */
    /* union needed because MPI_Comm_get_attr() writes through a void* slot */
    union
    {
      MPI_Comm comm;
      void    *ptr;
    } ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
      PetscCall(PetscCommDuplicate(globcomm, &globcomm,
                                   NULL));
      /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
      if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If no, create one */
  PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
  if (flg) PetscFunctionReturn(0);

  PetscCall(PetscNew(pshmcomm));
  (*pshmcomm)->globcomm = globcomm;

  /* ranks that can share memory end up in the same shmcomm */
  PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));

  PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
  PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
  PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
  /* build globranks[]: for each local rank 0..shmsize-1 record its rank in globcomm */
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
  PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
  for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
  PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
  PetscCall(PetscFree(shmranks));
  PetscCallMPI(MPI_Group_free(&globgroup));
  PetscCallMPI(MPI_Group_free(&shmgroup));

  for (i = 0; i < (*pshmcomm)->shmsize; i++) { PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i])); }
  /* stash the new object on the (inner) comm so later PetscShmCommGet() calls find it */
  PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

/*@C
  PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared
memory communicator 127 128 Input Parameters: 129 + pshmcomm - the shared memory communicator object 130 - grank - the global rank 131 132 Output Parameter: 133 . lrank - the local rank, or MPI_PROC_NULL if it does not exist 134 135 Level: developer 136 137 Developer Notes: 138 Assumes the pshmcomm->globranks[] is sorted 139 140 It may be better to rewrite this to map multiple global ranks to local in the same function call 141 142 @*/ 143 PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank) { 144 PetscMPIInt low, high, t, i; 145 PetscBool flg = PETSC_FALSE; 146 147 PetscFunctionBegin; 148 PetscValidPointer(pshmcomm, 1); 149 PetscValidIntPointer(lrank, 3); 150 *lrank = MPI_PROC_NULL; 151 if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0); 152 if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(0); 153 PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL)); 154 if (flg) PetscFunctionReturn(0); 155 low = 0; 156 high = pshmcomm->shmsize; 157 while (high - low > 5) { 158 t = (low + high) / 2; 159 if (pshmcomm->globranks[t] > grank) high = t; 160 else low = t; 161 } 162 for (i = low; i < high; i++) { 163 if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0); 164 if (pshmcomm->globranks[i] == grank) { 165 *lrank = i; 166 PetscFunctionReturn(0); 167 } 168 } 169 PetscFunctionReturn(0); 170 } 171 172 /*@C 173 PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank 174 175 Input Parameters: 176 + pshmcomm - the shared memory communicator object 177 - lrank - the local rank in the shared memory communicator 178 179 Output Parameter: 180 . 
grank - the global rank in the global communicator where the shared memory communicator is built

  Level: developer

@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank) {
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidIntPointer(grank, 3);
  /* globranks[] has exactly shmsize entries; reject out-of-range local ranks */
  PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}

/*@C
  PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

  Input Parameter:
. pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

  Output Parameter:
. comm - the MPI communicator

  Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm) {
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm, 1);
  PetscValidPointer(comm, 2);
  /* the sub-communicator created by MPI_Comm_split_type() in PetscShmCommGet() */
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found out, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool          is_omp_master;   /* rank 0's in omp_comm */
  MPI_Win            omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;         /* pointer to the barrier */
  hwloc_topology_t   topology;        /* hardware topology used to pick CPU bindings */
  hwloc_cpuset_t     cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t     omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
*/
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl) {
  MPI_Aint              size;
  void                 *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt  fd;
  PetscChar pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call.
       Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
    PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
    /* grow the empty temp file to hold the barrier before mapping it */
    PetscCall(ftruncate(fd, size));
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCall(close(fd));
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
    PetscCall(unlink(pathname));
  } else {
    /* slaves receive the file name from the master and map the same region */
    PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
    fd = open(pathname, O_RDWR);
    PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
    baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
    PetscCall(close(fd));
    PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  }
#else
  /* MPI-3 path: only the master contributes memory to the shared window */
  size = ctrl->is_omp_master ?
           sizeof(pthread_barrier_t) : 0;
  PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
  /* all ranks query rank 0's segment so everyone points at the same barrier object */
  PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
#endif
  ctrl->barrier = (pthread_barrier_t *)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
    PetscCall(pthread_barrierattr_init(&attr));
    PetscCall(pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)); /* make the barrier also work for processes */
    PetscCall(pthread_barrier_init(ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size));
    PetscCall(pthread_barrierattr_destroy(&attr));
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl) {
  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
  PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
  if (ctrl->is_omp_master) PetscCall(pthread_barrier_destroy(ctrl->barrier));

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscCall(munmap(ctrl->barrier, sizeof(pthread_barrier_t)));
#else
  PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
#endif
  PetscFunctionReturn(0);
}

/*@C
  PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries using OpenMP

  Input Parameters:
+ petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
- nthreads - number of threads per MPI rank to spawn in a library using OpenMP.
             If nthreads = -1, let PETSc decide a suitable value

  Output Parameter:
. pctrl - a PETSc OpenMP controller

  Level: developer

  TODO: Possibly use the variable PetscNumOMPThreads to determine the number for threads to use

.seealso `PetscOmpCtrlDestroy()`
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl) {
  PetscOmpCtrl   ctrl;
  unsigned long *cpu_ulongs = NULL; /* packed form of the master's cpuset, for MPI_Reduce */
  PetscInt       i, nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color;
  PetscInt       num_packages, num_cores;

  PetscFunctionBegin;
  PetscCall(PetscNew(&ctrl));

  /*=================================================================================
    Init hwloc
    ==================================================================================*/
  PetscCall(hwloc_topology_init(&ctrl->topology));
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have faster hwloc_topology_load */
  PetscCall(hwloc_topology_set_all_types_filter(ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE));
  PetscCall(hwloc_topology_set_type_filter(ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL));
#endif
  PetscCall(hwloc_topology_load(ctrl->topology));

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    others are called slaves. OMP Masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
    ==================================================================================*/

  /* fetch the stored shared memory communicator */
  PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
  PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

  PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
  PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

  /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
  if (nthreads == -1) {
    /* hwloc may return <= 0 when it cannot detect the object type; fall back to 1 */
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
    nthreads     = num_cores / num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
  if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));

  /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
     shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as key so that rank ordering wont change in new comm.
   */
  color = shm_rank / nthreads;
  PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

  /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
  PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE; /* master */
    color               = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE;   /* slave */
    color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
    and run them on the idle CPUs.
    ==================================================================================*/
  PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

  /*=================================================================================
    omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc();
  PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
  PetscCall(hwloc_get_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));

  /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
  PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
  if (nr_cpu_ulongs ==
1) { 430 cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset); 431 } else { 432 for (i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i); 433 } 434 435 PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm)); 436 437 if (ctrl->is_omp_master) { 438 ctrl->omp_cpuset = hwloc_bitmap_alloc(); 439 PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed"); 440 if (nr_cpu_ulongs == 1) { 441 #if HWLOC_API_VERSION >= 0x00020000 442 PetscCall(hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0])); 443 #else 444 hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]); 445 #endif 446 } else { 447 for (i = 0; i < nr_cpu_ulongs; i++) { 448 #if HWLOC_API_VERSION >= 0x00020000 449 PetscCall(hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i])); 450 #else 451 hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]); 452 #endif 453 } 454 } 455 } 456 457 PetscCall(PetscFree(cpu_ulongs)); 458 *pctrl = ctrl; 459 PetscFunctionReturn(0); 460 } 461 462 /*@C 463 PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller 464 465 Input Parameter: 466 . pctrl - a PETSc OpenMP controller 467 468 Level: developer 469 470 .seealso `PetscOmpCtrlCreate()` 471 @*/ 472 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl) { 473 PetscOmpCtrl ctrl = *pctrl; 474 475 PetscFunctionBegin; 476 hwloc_bitmap_free(ctrl->cpuset); 477 hwloc_topology_destroy(ctrl->topology); 478 PetscOmpCtrlDestroyBarrier(ctrl); 479 PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm)); 480 if (ctrl->is_omp_master) { 481 hwloc_bitmap_free(ctrl->omp_cpuset); 482 PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm)); 483 } 484 PetscCall(PetscFree(ctrl)); 485 PetscFunctionReturn(0); 486 } 487 488 /*@C 489 PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller 490 491 Input Parameter: 492 . 
ctrl - a PETSc OMP controller

  Output Parameters:
+ omp_comm - a communicator that includes a master rank and slave ranks where master spawns threads
. omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm;
                    on slave ranks, MPI_COMM_NULL will be return in reality.
- is_omp_master - true if the calling process is an OMP master rank.

  Notes: any output parameter can be NULL. The parameter is just ignored.

  Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master) {
  PetscFunctionBegin;
  /* every output is optional; only fill in what the caller asked for */
  if (omp_comm) *omp_comm = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/*@C
  PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)

  Input Parameter:
. ctrl - a PETSc OMP controller

  Notes:
  this is a pthread barrier on MPI processes. Using MPI_Barrier instead is conceptually correct. But MPI standard does not
  require processes blocked by MPI_Barrier free their CPUs to let other processes progress. In practice, to minimize latency,
  MPI processes stuck in MPI_Barrier keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.
522 523 A code using PetscOmpCtrlBarrier() would be like this, 524 525 if (is_omp_master) { 526 PetscOmpCtrlOmpRegionOnMasterBegin(ctrl); 527 Call the library using OpenMP 528 PetscOmpCtrlOmpRegionOnMasterEnd(ctrl); 529 } 530 PetscOmpCtrlBarrier(ctrl); 531 532 Level: developer 533 534 .seealso `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()` 535 @*/ 536 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl) { 537 int err; 538 539 PetscFunctionBegin; 540 err = pthread_barrier_wait(ctrl->barrier); 541 PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %" PetscInt_FMT, err); 542 PetscFunctionReturn(0); 543 } 544 545 /*@C 546 PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks 547 548 Input Parameter: 549 . ctrl - a PETSc OMP controller 550 551 Notes: 552 Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank. 553 This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime 554 555 Level: developer 556 557 .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()` 558 @*/ 559 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl) { 560 PetscFunctionBegin; 561 PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS)); 562 omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */ 563 PetscFunctionReturn(0); 564 } 565 566 /*@C 567 PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks 568 569 Input Parameter: 570 . ctrl - a PETSc OMP controller 571 572 Notes: 573 Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank. 574 This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1. 

  Level: developer

.seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl) {
  PetscFunctionBegin;
  /* restore the master's original (narrow) binding recorded in PetscOmpCtrlCreate() */
  PetscCall(hwloc_set_cpubind(ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS));
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */