#include <petscsys.h> /*I "petscsys.h" I*/
#include <petsc/private/petscimpl.h>

/* A shared memory communicator: caches, for one global communicator, the
   sub-communicator of ranks that share physical memory plus the mapping from
   shared-memory (local) ranks back to global ranks. */
struct _n_PetscShmComm {
  PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator (sorted, see PetscShmCommGlobalToLocal()) */
  PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  MPI_Comm     globcomm,shmcomm;  /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

   Returns MPI_SUCCESS on success; CHKERRMPI converts any PETSc error into the MPI error return this callback must produce.
*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscShmComm p = (PetscShmComm)val;

  PetscFunctionBegin;
  CHKERRMPI(PetscInfo(NULL,"Deleting shared memory subcommunicator in a MPI_Comm %ld\n",(long)comm));
  CHKERRMPI(MPI_Comm_free(&p->shmcomm));
  CHKERRMPI(PetscFree(p->globranks));
  CHKERRMPI(PetscFree(val));
  PetscFunctionReturn(MPI_SUCCESS);
}

#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
/* Data structures to support freeing comms created in PetscShmCommGet().
  Since we predict communicators passed to PetscShmCommGet() are very likely
  either a petsc inner communicator or an MPI communicator with a linked petsc
  inner communicator, we use a simple static array to store dupped communicators
  on rare cases otherwise.
*/
#define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt       num_dupped_comms=0;                             /* number of entries currently stored below */
static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS]; /* comms dupped by PetscShmCommGet(), freed at PetscFinalize() */

/* Free all communicators dupped by PetscShmCommGet(); registered with PetscRegisterFinalize() */
static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
{
  PetscInt i;
  PetscFunctionBegin;
  for (i=0; i<num_dupped_comms; i++) CHKERRQ(PetscCommDestroy(&shmcomm_dupped_comms[i]));
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(0);
}
#endif

/*@C
  PetscShmCommGet - Given a communicator returns a sub-communicator of all ranks that share a common memory

  Collective.

  Input Parameter:
. globcomm - MPI_Comm, which can be a user MPI_Comm or a PETSc inner MPI_Comm

  Output Parameter:
. pshmcomm - the PETSc shared memory communicator object

  Level: developer

  Notes:
  When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  CHKERRMPI(MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg));
  if (!flg) { /* globcomm is not a petsc comm */
    union {MPI_Comm comm; void *ptr;} ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    CHKERRMPI(MPI_Comm_get_attr(globcomm,Petsc_InnerComm_keyval,&ucomm,&flg));
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      PetscCheckFalse(num_dupped_comms >= MAX_SHMCOMM_DUPPED_COMMS,globcomm,PETSC_ERR_PLIB,"PetscShmCommGet() is trying to dup more than %d MPI_Comms",MAX_SHMCOMM_DUPPED_COMMS);
      CHKERRQ(PetscCommDuplicate(globcomm,&globcomm,NULL));
      /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
      if (num_dupped_comms == 0) CHKERRQ(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If no, create one */
  CHKERRMPI(MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg));
  if (flg) PetscFunctionReturn(0);

  CHKERRQ(PetscNew(pshmcomm));
  (*pshmcomm)->globcomm = globcomm;

  /* split globcomm into sub-communicators whose ranks can share memory */
  CHKERRMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm));

  CHKERRMPI(MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize));
  CHKERRMPI(MPI_Comm_group(globcomm, &globgroup));
  CHKERRMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
  /* translate local (shared-memory) ranks 0..shmsize-1 to their global ranks */
  CHKERRQ(PetscMalloc1((*pshmcomm)->shmsize,&shmranks));
  CHKERRQ(PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks));
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  CHKERRMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
  CHKERRQ(PetscFree(shmranks));
  CHKERRMPI(MPI_Group_free(&globgroup));
  CHKERRMPI(MPI_Group_free(&shmgroup));

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    CHKERRQ(PetscInfo(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]));
  }
  /* stash the result on globcomm so subsequent calls are cheap lookups */
  CHKERRMPI(MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm));
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

/*@C
  PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator

  Input Parameters:
+ pshmcomm - the shared memory communicator object
- grank    - the global rank

  Output Parameter:
. lrank - the local rank, or MPI_PROC_NULL if it does not exist

  Level: developer

  Developer Notes:
  Assumes the pshmcomm->globranks[] is sorted

  It may be better to rewrite this to map multiple global ranks to local in the same function call

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt low,high,t,i;
  PetscBool   flg = PETSC_FALSE;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,1);
  PetscValidPointer(lrank,3);
  *lrank = MPI_PROC_NULL;
  /* quick reject if grank lies outside the (sorted) range of globranks[] */
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  /* -noshared forces every lookup to report "not local", disabling shared-memory paths */
  CHKERRQ(PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL));
  if (flg) PetscFunctionReturn(0);
  /* binary search down to a window of <= 5 entries, then finish with a linear scan */
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

/*@C
  PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank

  Input Parameters:
+ pshmcomm - the shared memory communicator object
- lrank    - the local rank in the shared memory communicator

  Output Parameter:
. grank - the global rank in the global communicator where the shared memory communicator is built

  Level: developer

@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,1);
  PetscValidPointer(grank,3);
  PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}

/*@C
  PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

  Input Parameter:
. pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

  Output Parameter:
. comm - the MPI communicator

  Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,1);
  PetscValidPointer(comm,2);
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found out, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

/* State for one OpenMP "controller": a group of MPI ranks (omp_comm) whose master
   rank runs OpenMP regions on the CPUs of the whole group while the slaves sleep. */
struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* rank 0's in omp_comm */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;        /* hardware topology loaded by hwloc */
  hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.

   Collective on ctrl->omp_comm: master creates/maps a temp file (or an MPI-3 shared window),
   slaves map the same memory; ctrl->barrier points into it on every rank afterwards.
 */
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt  fd;
  PetscChar pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    CHKERRQ(PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN));
    CHKERRQ(PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN));
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname); PetscCheckFalse(fd == -1,PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp", pathname);
    CHKERRQ(ftruncate(fd,size));
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); PetscCheckFalse(baseptr == MAP_FAILED,PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed");
    CHKERRQ(close(fd));
    CHKERRMPI(MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm));
    /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
    CHKERRMPI(MPI_Barrier(ctrl->omp_comm));
    CHKERRQ(unlink(pathname));
  } else {
    /* slaves receive the file name from the master, then map the same shared region */
    CHKERRMPI(MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm));
    fd = open(pathname,O_RDWR); PetscCheckFalse(fd == -1,PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s", pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); PetscCheckFalse(baseptr == MAP_FAILED,PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed");
    CHKERRQ(close(fd));
    CHKERRMPI(MPI_Barrier(ctrl->omp_comm));
  }
#else
  /* MPI-3 path: only the master contributes memory; slaves query the master's base address */
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  CHKERRMPI(MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win));
  CHKERRMPI(MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr));
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    CHKERRMPI(MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size));
    CHKERRQ(pthread_barrierattr_init(&attr));
    CHKERRQ(pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED)); /* make the barrier also work for processes */
    CHKERRQ(pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size));
    CHKERRQ(pthread_barrierattr_destroy(&attr));
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
  CHKERRMPI(MPI_Barrier(ctrl->omp_comm));
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller.
   Collective on ctrl->omp_comm; the master destroys the barrier, every rank unmaps/frees its view. */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
  CHKERRMPI(MPI_Barrier(ctrl->omp_comm));
  if (ctrl->is_omp_master) CHKERRQ(pthread_barrier_destroy(ctrl->barrier));

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  CHKERRQ(munmap(ctrl->barrier,sizeof(pthread_barrier_t)));
#else
  CHKERRMPI(MPI_Win_free(&ctrl->omp_win));
#endif
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries using OpenMP

    Input Parameters:
+   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

    Output Parameter:
.
pctrl - a PETSc OpenMP controller 337 338 Level: developer 339 340 TODO: Possibly use the variable PetscNumOMPThreads to determine the number for threads to use 341 342 .seealso PetscOmpCtrlDestroy() 343 @*/ 344 PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl) 345 { 346 PetscOmpCtrl ctrl; 347 unsigned long *cpu_ulongs = NULL; 348 PetscInt i,nr_cpu_ulongs; 349 PetscShmComm pshmcomm; 350 MPI_Comm shm_comm; 351 PetscMPIInt shm_rank,shm_comm_size,omp_rank,color; 352 PetscInt num_packages,num_cores; 353 354 PetscFunctionBegin; 355 CHKERRQ(PetscNew(&ctrl)); 356 357 /*================================================================================= 358 Init hwloc 359 ==================================================================================*/ 360 CHKERRQ(hwloc_topology_init(&ctrl->topology)); 361 #if HWLOC_API_VERSION >= 0x00020000 362 /* to filter out unneeded info and have faster hwloc_topology_load */ 363 CHKERRQ(hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE)); 364 CHKERRQ(hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL)); 365 #endif 366 CHKERRQ(hwloc_topology_load(ctrl->topology)); 367 368 /*================================================================================= 369 Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to 370 physically shared memory. Rank 0 of each omp_comm is called an OMP master, and 371 others are called slaves. OMP Masters make up a new comm called omp_master_comm, 372 which is usually passed to third party libraries. 
373 ==================================================================================*/ 374 375 /* fetch the stored shared memory communicator */ 376 CHKERRQ(PetscShmCommGet(petsc_comm,&pshmcomm)); 377 CHKERRQ(PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm)); 378 379 CHKERRMPI(MPI_Comm_rank(shm_comm,&shm_rank)); 380 CHKERRMPI(MPI_Comm_size(shm_comm,&shm_comm_size)); 381 382 /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */ 383 if (nthreads == -1) { 384 num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE); 385 num_cores = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE); 386 nthreads = num_cores/num_packages; 387 if (nthreads > shm_comm_size) nthreads = shm_comm_size; 388 } 389 390 PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size,petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d",nthreads,shm_comm_size); 391 if (shm_comm_size % nthreads) CHKERRQ(PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size)); 392 393 /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if 394 shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get 395 color 1. They are put in two omp_comms. Note that petsc_ranks may or may not 396 be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1. 397 Use 0 as key so that rank ordering wont change in new comm. 
398 */ 399 color = shm_rank / nthreads; 400 CHKERRMPI(MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm)); 401 402 /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */ 403 CHKERRMPI(MPI_Comm_rank(ctrl->omp_comm,&omp_rank)); 404 if (!omp_rank) { 405 ctrl->is_omp_master = PETSC_TRUE; /* master */ 406 color = 0; 407 } else { 408 ctrl->is_omp_master = PETSC_FALSE; /* slave */ 409 color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */ 410 } 411 CHKERRMPI(MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm)); 412 413 /*================================================================================= 414 Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put 415 slave ranks in sleep and idle their CPU, so that the master can fork OMP threads 416 and run them on the idle CPUs. 417 ==================================================================================*/ 418 CHKERRQ(PetscOmpCtrlCreateBarrier(ctrl)); 419 420 /*================================================================================= 421 omp master logs its cpu binding (i.e., cpu set) and computes a new binding that 422 is the union of the bindings of all ranks in the omp_comm 423 =================================================================================*/ 424 425 ctrl->cpuset = hwloc_bitmap_alloc(); PetscCheckFalse(!ctrl->cpuset,PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed"); 426 CHKERRQ(hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS)); 427 428 /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */ 429 nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset (ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8; 430 CHKERRQ(PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs)); 431 if (nr_cpu_ulongs == 1) { 432 cpu_ulongs[0] = 
hwloc_bitmap_to_ulong(ctrl->cpuset); 433 } else { 434 for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i); 435 } 436 437 CHKERRMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm)); 438 439 if (ctrl->is_omp_master) { 440 ctrl->omp_cpuset = hwloc_bitmap_alloc(); PetscCheckFalse(!ctrl->omp_cpuset,PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed"); 441 if (nr_cpu_ulongs == 1) { 442 #if HWLOC_API_VERSION >= 0x00020000 443 CHKERRQ(hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0])); 444 #else 445 hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]); 446 #endif 447 } else { 448 for (i=0; i<nr_cpu_ulongs; i++) { 449 #if HWLOC_API_VERSION >= 0x00020000 450 CHKERRQ(hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i])); 451 #else 452 hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]); 453 #endif 454 } 455 } 456 } 457 458 CHKERRQ(PetscFree(cpu_ulongs)); 459 *pctrl = ctrl; 460 PetscFunctionReturn(0); 461 } 462 463 /*@C 464 PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller 465 466 Input Parameter: 467 . pctrl - a PETSc OpenMP controller 468 469 Level: developer 470 471 .seealso PetscOmpCtrlCreate() 472 @*/ 473 PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl) 474 { 475 PetscOmpCtrl ctrl = *pctrl; 476 477 PetscFunctionBegin; 478 hwloc_bitmap_free(ctrl->cpuset); 479 hwloc_topology_destroy(ctrl->topology); 480 PetscOmpCtrlDestroyBarrier(ctrl); 481 CHKERRMPI(MPI_Comm_free(&ctrl->omp_comm)); 482 if (ctrl->is_omp_master) { 483 hwloc_bitmap_free(ctrl->omp_cpuset); 484 CHKERRMPI(MPI_Comm_free(&ctrl->omp_master_comm)); 485 } 486 CHKERRQ(PetscFree(ctrl)); 487 PetscFunctionReturn(0); 488 } 489 490 /*@C 491 PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller 492 493 Input Parameter: 494 . 
ctrl - a PETSc OMP controller 495 496 Output Parameters: 497 + omp_comm - a communicator that includes a master rank and slave ranks where master spawns threads 498 . omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm; 499 on slave ranks, MPI_COMM_NULL will be return in reality. 500 - is_omp_master - true if the calling process is an OMP master rank. 501 502 Notes: any output parameter can be NULL. The parameter is just ignored. 503 504 Level: developer 505 @*/ 506 PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master) 507 { 508 PetscFunctionBegin; 509 if (omp_comm) *omp_comm = ctrl->omp_comm; 510 if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm; 511 if (is_omp_master) *is_omp_master = ctrl->is_omp_master; 512 PetscFunctionReturn(0); 513 } 514 515 /*@C 516 PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU) 517 518 Input Parameter: 519 . ctrl - a PETSc OMP controller 520 521 Notes: 522 this is a pthread barrier on MPI processes. Using MPI_Barrier instead is conceptually correct. But MPI standard does not 523 require processes blocked by MPI_Barrier free their CPUs to let other processes progress. In practice, to minilize latency, 524 MPI processes stuck in MPI_Barrier keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement. 
525 526 A code using PetscOmpCtrlBarrier() would be like this, 527 528 if (is_omp_master) { 529 PetscOmpCtrlOmpRegionOnMasterBegin(ctrl); 530 Call the library using OpenMP 531 PetscOmpCtrlOmpRegionOnMasterEnd(ctrl); 532 } 533 PetscOmpCtrlBarrier(ctrl); 534 535 Level: developer 536 537 .seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd() 538 @*/ 539 PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl) 540 { 541 int err; 542 543 PetscFunctionBegin; 544 err = pthread_barrier_wait(ctrl->barrier); 545 PetscCheckFalse(err && err != PTHREAD_BARRIER_SERIAL_THREAD,PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %" PetscInt_FMT, err); 546 PetscFunctionReturn(0); 547 } 548 549 /*@C 550 PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks 551 552 Input Parameter: 553 . ctrl - a PETSc OMP controller 554 555 Notes: 556 Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank. 557 This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime 558 559 Level: developer 560 561 .seealso: PetscOmpCtrlOmpRegionOnMasterEnd() 562 @*/ 563 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl) 564 { 565 PetscFunctionBegin; 566 CHKERRQ(hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS)); 567 omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */ 568 PetscFunctionReturn(0); 569 } 570 571 /*@C 572 PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks 573 574 Input Parameter: 575 . ctrl - a PETSc OMP controller 576 577 Notes: 578 Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank. 579 This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1. 
580 581 Level: developer 582 583 .seealso: PetscOmpCtrlOmpRegionOnMasterBegin() 584 @*/ 585 PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl) 586 { 587 PetscFunctionBegin; 588 CHKERRQ(hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS)); 589 omp_set_num_threads(1); 590 PetscFunctionReturn(0); 591 } 592 593 #undef USE_MMAP_ALLOCATE_SHARED_MEMORY 594 #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */ 595