#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  MPI_Comm     globcomm,shmcomm;  /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users; MPI_Comm_free() calls it when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_Delete_Fn(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode ierr;
  PetscShmComm   p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo1(NULL,"Deleting shared memory subcommunicator in a MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}
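
/* For context, a sketch of how a delete callback like the one above is typically hooked up through MPI
   attribute caching; PETSc performs the equivalent registration elsewhere during startup, and the keyval
   name my_keyval below is illustrative only, not part of the PETSc API:

     PetscMPIInt my_keyval;
     ierr = MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,Petsc_ShmComm_Attr_Delete_Fn,&my_keyval,NULL);CHKERRMPI(ierr);
     ierr = MPI_Comm_set_attr(comm,my_keyval,p);CHKERRMPI(ierr);

   MPI then invokes Petsc_ShmComm_Attr_Delete_Fn() with p when comm is freed.
*/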

#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
/* Data structures to support freeing comms created in PetscShmCommGet().
   Since we predict that communicators passed to PetscShmCommGet() are very likely
   either a petsc inner communicator or an MPI communicator with a linked petsc
   inner communicator, we use a simple static array to store dupped communicators
   in the rare cases otherwise.
*/
#define MAX_SHMCOMM_DUPPED_COMMS 16
static PetscInt       num_dupped_comms=0;
static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
{
  PetscErrorCode ierr;
  PetscInt       i;
  PetscFunctionBegin;
  for (i=0; i<num_dupped_comms; i++) {ierr = PetscCommDestroy(&shmcomm_dupped_comms[i]);CHKERRQ(ierr);}
  num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
  PetscFunctionReturn(0);
}
#endif

/*@C
   PetscShmCommGet - Given a communicator, returns a sub-communicator of all the ranks that share common memory

   Collective.

   Input Parameter:
.  globcomm - MPI_Comm, which can be a user MPI_Comm or a PETSc inner MPI_Comm

   Output Parameter:
.  pshmcomm - the PETSc shared memory communicator object

   Level: developer

   Notes:
   When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis
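
   Example Usage:
   A minimal sketch (error checking omitted for brevity):
.vb
   PetscShmComm pshmcomm;
   MPI_Comm     shmcomm;
   PetscMPIInt  shmrank;

   PetscShmCommGet(PETSC_COMM_WORLD,&pshmcomm);
   PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);
   MPI_Comm_rank(shmcomm,&shmrank);   // rank within the group of processes that share memory
.ve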

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRMPI(ierr);
  if (!flg) { /* globcomm is not a petsc comm */
    union {MPI_Comm comm; void *ptr;} ucomm;
    /* check if globcomm already has a linked petsc inner comm */
    ierr = MPI_Comm_get_attr(globcomm,Petsc_InnerComm_keyval,&ucomm,&flg);CHKERRMPI(ierr);
    if (!flg) {
      /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
      if (num_dupped_comms >= MAX_SHMCOMM_DUPPED_COMMS) SETERRQ1(globcomm,PETSC_ERR_PLIB,"PetscShmCommGet() is trying to dup more than %d MPI_Comms",MAX_SHMCOMM_DUPPED_COMMS);
      ierr = PetscCommDuplicate(globcomm,&globcomm,NULL);CHKERRQ(ierr);
      /* the first time we dup a comm, register a function to free the dupped petsc comms at PetscFinalize() */
      if (num_dupped_comms == 0) {ierr = PetscRegisterFinalize(PetscShmCommDestroyDuppedComms);CHKERRQ(ierr);}
      shmcomm_dupped_comms[num_dupped_comms] = globcomm;
      num_dupped_comms++;
    } else {
      /* otherwise, we pull out the inner comm and use it as globcomm */
      globcomm = ucomm.comm;
    }
  }

  /* Check if globcomm already has an attached pshmcomm. If not, create one */
  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRMPI(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  ierr = MPI_Comm_split_type(globcomm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRMPI(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRMPI(ierr);
  ierr = MPI_Comm_group(globcomm,&globgroup);CHKERRMPI(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm,&shmgroup);CHKERRMPI(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup,(*pshmcomm)->shmsize,shmranks,globgroup,(*pshmcomm)->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRMPI(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRMPI(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRMPI(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm,PETSC_ERR_SUP,"Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

/*@C
   PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

   Input Parameters:
+  pshmcomm - the shared memory communicator object
-  grank    - the global rank

   Output Parameter:
.  lrank - the local rank, or MPI_PROC_NULL if it does not exist

   Level: developer

   Developer Notes:
   Assumes pshmcomm->globranks[] is sorted

   It may be better to rewrite this to map multiple global ranks to local ranks in the same function call
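
   Example Usage:
   A sketch mapping a process's own global rank to its local rank and back, assuming pshmcomm was obtained
   with PetscShmCommGet() on PETSC_COMM_WORLD (error checking omitted):
.vb
   PetscMPIInt grank,lrank;

   MPI_Comm_rank(PETSC_COMM_WORLD,&grank);
   PetscShmCommGlobalToLocal(pshmcomm,grank,&lrank);   // a rank always belongs to its own shared memory communicator
   PetscShmCommLocalToGlobal(pshmcomm,lrank,&grank);   // recovers the original global rank
.ve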

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

/*@C
   PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

   Input Parameters:
+  pshmcomm - the shared memory communicator object
-  lrank    - the local rank in the shared memory communicator

   Output Parameter:
.  grank - the global rank in the global communicator where the shared memory communicator is built

   Level: developer

@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
  if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %D in the shared memory communicator",lrank);
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}

/*@C
   PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

   Input Parameter:
.  pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

   Output Parameter:
.  comm - the MPI communicator

   Level: developer
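
   Example Usage:
   A sketch of a typical use, allocating MPI-3 shared memory on the returned communicator (the array
   size is illustrative; error checking omitted):
.vb
   MPI_Comm shmcomm;
   MPI_Win  win;
   double   *base;

   PetscShmCommGetMpiShmComm(pshmcomm,&shmcomm);
   MPI_Win_allocate_shared(100*sizeof(double),sizeof(double),MPI_INFO_NULL,shmcomm,&base,&win);
   // ... read/write the shared array through base ...
   MPI_Win_free(&win);
.ve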

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared(). They should have the same effect except that MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* true on rank 0 of omp_comm */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu bindings of the omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of the cpu bindings of all ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by the processes in the omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt  fd;
  PetscChar pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp(), since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    ierr = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    ierr = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd   = mkstemp(pathname); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n",pathname);
    ierr = ftruncate(fd,size);CHKERRQ(ierr);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr = close(fd);CHKERRQ(ierr);
    ierr = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRMPI(ierr);
    /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
    ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
    ierr = unlink(pathname);CHKERRQ(ierr);
  } else {
    ierr = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRMPI(ierr);
    fd   = open(pathname,O_RDWR); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n",pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr = close(fd);CHKERRQ(ierr);
    ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  }
#else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRMPI(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRMPI(ierr);
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRMPI(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier makes sure the omp barrier is initialized before the slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier makes sure the slaves have finished using the omp barrier before the master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRMPI(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
#else
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRMPI(ierr);
#endif
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

   Input Parameters:
+  petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-  nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

   Output Parameter:
.  pctrl - a PETSc OpenMP controller

   Level: developer

   TODO: Possibly use the variable PetscNumOMPThreads to determine the number of threads to use
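
   Example Usage:
   A sketch of the create/use/destroy pattern (the thread count 4 is illustrative; error checking omitted):
.vb
   PetscOmpCtrl ctrl;

   PetscOmpCtrlCreate(PETSC_COMM_WORLD,4,&ctrl);   // or pass nthreads = -1 to let PETSc pick a value
   // ... use the controller; see PetscOmpCtrlGetOmpComms() and PetscOmpCtrlBarrier() ...
   PetscOmpCtrlDestroy(&ctrl);
.ve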

.seealso PetscOmpCtrlDestroy()
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl;
  unsigned long  *cpu_ulongs=NULL;
  PetscInt       i,nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank,shm_comm_size,omp_rank,color;
  PetscInt       num_packages,num_cores;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Init hwloc
    =================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have a faster hwloc_topology_load */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. The OMP masters make up a new comm called
    omp_master_comm, which is usually passed to third party libraries.
    =================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRMPI(ierr);

  /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE);
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);
    nthreads     = num_cores/num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %D can not be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms, each of size nthreads. For example, if
     shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
     color 1; they are put in two omp_comms. Note that petsc_ranks may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that the rank ordering won't change in the new comm.
   */
  color = shm_rank / nthreads;
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRMPI(ierr);

  /* put the rank 0's of the omp_comms (i.e., the master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRMPI(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRMPI(ierr);

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
    =================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    The omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* hwloc's main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help simplify the following bitmap pack/unpack code */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs,cpu_ulongs,nr_cpu_ulongs,MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRMPI(ierr);

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
#endif
    } else {
      for (i=0; i<nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
#endif
      }
    }
  }

  ierr   = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

   Input Parameter:
.  pctrl - a PETSc OpenMP controller

   Level: developer

.seealso PetscOmpCtrlCreate()
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  PetscOmpCtrlDestroyBarrier(ctrl);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRMPI(ierr);
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRMPI(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlGetOmpComms - Get the MPI communicators from a PETSc OMP controller

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Output Parameters:
+  omp_comm        - a communicator that includes a master rank and slave ranks where the master spawns threads
.  omp_master_comm - on master ranks, a communicator that includes the master ranks of all omp_comms;
                     on slave ranks, MPI_COMM_NULL is returned
-  is_omp_master   - true if the calling process is an OMP master rank

   Notes:
   Any output parameter can be NULL; it is then simply ignored.
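
   Example Usage:
   A sketch of the typical master/slave pattern built on these communicators (error checking omitted):
.vb
   MPI_Comm  omp_comm,omp_master_comm;
   PetscBool is_omp_master;

   PetscOmpCtrlGetOmpComms(ctrl,&omp_comm,&omp_master_comm,&is_omp_master);
   if (is_omp_master) {
     PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
     // call a third party library that uses OpenMP, giving it omp_master_comm if it needs MPI
     PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
   }
   PetscOmpCtrlBarrier(ctrl);   // slaves sleep here and free their CPUs while the master runs OpenMP threads
.ve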

   Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Notes:
   This is a pthread barrier on MPI processes. Using MPI_Barrier instead would be conceptually correct, but the MPI standard does not
   require processes blocked by MPI_Barrier to free their CPUs to let other processes progress. In practice, to minimize latency,
   MPI processes stuck in MPI_Barrier keep polling and do not free their CPUs. In contrast, pthread_barrier has this requirement.

   Code using PetscOmpCtrlBarrier() looks like this:

.vb
   if (is_omp_master) {
     PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
     Call the library using OpenMP
     PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
   }
   PetscOmpCtrlBarrier(ctrl);
.ve

   Level: developer

.seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  int err;

  PetscFunctionBegin;
  err = pthread_barrier_wait(ctrl->barrier);
  if (err && err != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %D\n",err);
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Notes:
   Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to find out if this is a master rank.
   This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime.

   Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(0);
}

/*@C
   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

   Input Parameter:
.  ctrl - a PETSc OMP controller

   Notes:
   Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to find out if this is a master rank.
   This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime to 1.

   Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */