#include <petscsys.h> /*I "petscsys.h" I*/
#include <petsc/private/petscimpl.h>

/* A PetscShmComm records, for a PETSc (inner) communicator, the sub-communicator of
   ranks that share physical memory, plus the map from shared-memory-local ranks back
   to global ranks. It is stashed on the communicator as the Petsc_ShmComm_keyval attribute. */
struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. This is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

   Frees the shmcomm sub-communicator, the globranks[] array, and the attribute struct itself;
   always returns MPI_SUCCESS (errors propagate via CHKERRMPI).
*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode ierr;
  PetscShmComm   p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo(NULL,"Deleting shared memory subcommunicator in a MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}

#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
/* Data structures to support freeing comms created in PetscShmCommGet().
   Since we predict communicators passed to PetscShmCommGet() are very likely
   either a petsc inner communicator or an MPI communicator with a linked petsc
   inner communicator, we use a simple static array to store dupped communicators
   on rare cases otherwise.
*/
#define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt       num_dupped_comms=0;
static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];

/* Registered with PetscRegisterFinalize() by PetscShmCommGet(); destroys every
   communicator that PetscShmCommGet() dupped and resets the counter so that
   PETSc can be initialized again after PetscFinalize(). */
static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscFunctionBegin;
  for (i=0; i<num_dupped_comms; i++) {ierr = PetscCommDestroy(&shmcomm_dupped_comms[i]);CHKERRQ(ierr);}
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(0);
}
#endif

/*@C
  PetscShmCommGet - Given a communicator returns a sub-communicator of all ranks that share a common memory

  Collective.

  Input Parameter:
. globcomm - MPI_Comm, which can be a user MPI_Comm or a PETSc inner MPI_Comm

  Output Parameter:
. pshmcomm - the PETSc shared memory communicator object

  Level: developer

  Notes:
  When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

  The result is cached as an attribute on the petsc inner communicator, so repeated
  calls with the same communicator are cheap and return the same object.

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,2);
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRMPI(ierr);
  if (!flg) { /* globcomm is not a petsc comm */
    union {MPI_Comm comm; void *ptr;} ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    ierr = MPI_Comm_get_attr(globcomm,Petsc_InnerComm_keyval,&ucomm,&flg);CHKERRMPI(ierr);
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      PetscAssertFalse(num_dupped_comms >= MAX_SHMCOMM_DUPPED_COMMS,globcomm,PETSC_ERR_PLIB,"PetscShmCommGet() is trying to dup more than %d MPI_Comms",MAX_SHMCOMM_DUPPED_COMMS);
      ierr = PetscCommDuplicate(globcomm,&globcomm,NULL);CHKERRQ(ierr);
      /* Register a function to free the dupped petsc comms at PetscFinalize at the first time */
      if (num_dupped_comms == 0) {ierr = PetscRegisterFinalize(PetscShmCommDestroyDuppedComms);CHKERRQ(ierr);}
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If no, create one */
  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRMPI(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  /* split off the ranks that can share memory with this rank (MPI-3) */
  ierr = MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED,0, MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRMPI(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRMPI(ierr);
  ierr = MPI_Comm_group(globcomm, &globgroup);CHKERRMPI(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup);CHKERRMPI(ierr);
  /* translate the local ranks 0..shmsize-1 into their global ranks; globranks[] is
     kept for the lifetime of the attribute (freed in Petsc_ShmComm_Attr_Delete_Fn) */
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRMPI(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRMPI(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  /* cache the result on the inner comm; the delete callback frees it when the comm dies */
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRMPI(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

/*@C
  PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator

  Input Parameters:
+ pshmcomm - the shared memory communicator object
- grank    - the global rank

  Output Parameter:
. lrank - the local rank, or MPI_PROC_NULL if it does not exist

  Level: developer

  Developer Notes:
  Assumes the pshmcomm->globranks[] is sorted

  It may be better to rewrite this to map multiple global ranks to local in the same function call

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,1);
  PetscValidPointer(lrank,3);
  *lrank = MPI_PROC_NULL;
  /* quick reject: grank outside [globranks[0], globranks[shmsize-1]] cannot be present */
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  /* -noshared forces every lookup to report "not on shared memory" (lrank stays
     MPI_PROC_NULL) -- presumably a testing/debugging switch; confirm with callers */
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  /* bisect the sorted globranks[] down to at most 5 candidates, then scan linearly */
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

/*@C
  PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank

  Input Parameters:
+ pshmcomm - the shared memory communicator object
- lrank    - the local rank in the shared memory communicator

  Output Parameter:
. grank - the global rank in the global communicator where the shared memory communicator is built

  Level: developer

@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,1);
  PetscValidPointer(grank,3);
  PetscAssertFalse(PetscUnlikely((lrank < 0) || (lrank >= pshmcomm->shmsize)),PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}

/*@C
  PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

  Input Parameter:
. pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

  Output Parameter:
. comm - the MPI communicator

  Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  PetscValidPointer(pshmcomm,1);
  PetscValidPointer(comm,2);
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found out, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* rank 0's in omp_comm */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.

   mmap path: master creates a tmp file via mkstemp, sizes it to one barrier, maps it, then
   broadcasts the pathname so slaves can open and map the same file; the file is unlinked by
   the master only after a barrier guarantees every slave has opened it.
*/
static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt  fd;
  PetscChar pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    ierr = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    ierr = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd   = mkstemp(pathname); PetscAssertFalse(fd == -1,PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp", pathname);
    /* NOTE(review): ftruncate/close/unlink return -1 on failure; CHKERRQ treats any
       nonzero as a PETSc error code -- confirm this is the intended error mapping */
    ierr = ftruncate(fd,size);CHKERRQ(ierr);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); PetscAssertFalse(baseptr == MAP_FAILED,PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed");
    ierr = close(fd);CHKERRQ(ierr);
    ierr = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRMPI(ierr);
    /* this MPI_Barrier is to wait slaves to open the file before master unlinks it */
    ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
    ierr = unlink(pathname);CHKERRQ(ierr);
  } else {
    ierr = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRMPI(ierr);
    fd   = open(pathname,O_RDWR); PetscAssertFalse(fd == -1,PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s", pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE, MAP_SHARED,fd,0); PetscAssertFalse(baseptr == MAP_FAILED,PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed");
    ierr = close(fd);CHKERRQ(ierr);
    ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  }
#else
  /* MPI-3 path: only the master contributes the barrier's bytes; everyone queries
     the master's segment so all ranks share one pthread_barrier_t */
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRMPI(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRMPI(ierr);
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRMPI(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
#else
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRMPI(ierr);
#endif
  PetscFunctionReturn(0);
}

/*@C
  PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries using OpenMP

  Input Parameters:
+ petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
- nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

  Output Parameter:
. pctrl - a PETSc OpenMP controller

  Level: developer

  TODO: Possibly use the variable PetscNumOMPThreads to determine the number for threads to use

.seealso PetscOmpCtrlDestroy()
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl;
  unsigned long  *cpu_ulongs=NULL;
  PetscInt       i,nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank,shm_comm_size,omp_rank,color;
  PetscInt       num_packages,num_cores;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Init hwloc
    ==================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have faster hwloc_topology_load */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    others are called slaves. OMP Masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
    ==================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRMPI(ierr);

  /* PETSc decides nthreads, which is the smaller of shm_comm_size or cores per package(socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
    nthreads     = num_cores/num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  PetscAssertFalse(nthreads < 1 || nthreads > shm_comm_size,petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
     shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as key so that rank ordering won't change in new comm.
  */
  color = shm_rank / nthreads;
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRMPI(ierr);

  /* put rank 0's in omp_comms (i.e., master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRMPI(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRMPI(ierr);

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks in sleep and idle their CPU, so that the master can fork OMP threads
    and run them on the idle CPUs.
    ==================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc(); PetscAssertFalse(!ctrl->cpuset,PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed");
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset, HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  /* number of unsigned longs needed to hold the topology's full cpuset bitmap */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

  /* bitwise-OR the packed cpusets of all ranks in omp_comm onto the master (rank 0) */
  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs,nr_cpu_ulongs, MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRMPI(ierr);

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); PetscAssertFalse(!ctrl->omp_cpuset,PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
#endif
    } else {
      for (i=0; i<nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
#endif
      }
    }
  }

  ierr   = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}

/*@C
  PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

  Input Parameter:
. pctrl - a PETSc OpenMP controller

  Level: developer

.seealso PetscOmpCtrlCreate()
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  /* NOTE(review): return value of PetscOmpCtrlDestroyBarrier() is ignored here,
     unlike every other call in this function -- confirm this is intentional */
  PetscOmpCtrlDestroyBarrier(ctrl);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRMPI(ierr);
  if (ctrl->is_omp_master) {
    /* omp_cpuset and omp_master_comm were only created on master ranks in PetscOmpCtrlCreate() */
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRMPI(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
  PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

  Input Parameter:
. ctrl - a PETSc OMP controller

  Output Parameters:
+ omp_comm        - a communicator that includes a master rank and slave ranks where master spawns threads
. omp_master_comm - on master ranks, return a communicator that include master ranks of each omp_comm;
                    on slave ranks, MPI_COMM_NULL will be returned in reality.
- is_omp_master   - true if the calling process is an OMP master rank.

  Notes: any output parameter can be NULL. The parameter is just ignored.

  Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/*@C
  PetscOmpCtrlBarrier - Do barrier on MPI ranks in omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPU)

  Input Parameter:
. ctrl - a PETSc OMP controller

  Notes:
  this is a pthread barrier on MPI processes. Using MPI_Barrier instead is conceptually correct. But MPI standard does not
  require processes blocked by MPI_Barrier free their CPUs to let other processes progress. In practice, to minimize latency,
  MPI processes stuck in MPI_Barrier keep polling and do not free CPUs. In contrast, pthread_barrier has this requirement.

  A code using PetscOmpCtrlBarrier() would be like this,

  if (is_omp_master) {
    PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
    Call the library using OpenMP
    PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
  }
  PetscOmpCtrlBarrier(ctrl);

  Level: developer

.seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  int err;

  PetscFunctionBegin;
  /* PTHREAD_BARRIER_SERIAL_THREAD is a success code returned to exactly one waiter */
  err = pthread_barrier_wait(ctrl->barrier);
  /* NOTE(review): err is an int but the format uses PetscInt_FMT (may be long long) -- confirm the specifier matches the argument type */
  PetscAssertFalse(err && err != PTHREAD_BARRIER_SERIAL_THREAD,PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %" PetscInt_FMT, err);
  PetscFunctionReturn(0);
}

/*@C
  PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

  Input Parameter:
. ctrl - a PETSc OMP controller

  Notes:
  Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
  This function changes CPU binding of master ranks and nthreads-var of OpenMP runtime

  Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* widen the master's binding to the union of all ranks' cpusets so its OMP threads may use the slaves' idle cores */
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREAD env var */
  PetscFunctionReturn(0);
}

/*@C
  PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

  Input Parameter:
. ctrl - a PETSc OMP controller

  Notes:
  Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to know if this is a master rank.
  This function restores the CPU binding of master ranks and set and nthreads-var of OpenMP runtime to 1.

  Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* restore the master's original (pre-Begin) binding and go back to single-threaded */
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */