#include <petscsys.h> /*I "petscsys.h" I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. MPI_Comm_free() calls it when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode ierr;
  PetscShmComm   p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo1(0,"Deleting shared memory subcommunicator in an MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}

/*@C
    PetscShmCommGet - Given a PETSc communicator, returns a communicator of all ranks that share common memory

    Collective.

    Input Parameter:
.   globcomm - MPI_Comm

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Notes:
    This should be called only with a PetscCommDuplicate() communicator

    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");

  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  ierr = MPI_Comm_split_type(globcomm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
  ierr = MPI_Comm_group(globcomm,&globgroup);CHKERRQ(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm,&shmgroup);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup,(*pshmcomm)->shmsize,shmranks,globgroup,(*pshmcomm)->globranks);CHKERRQ(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm,PETSC_ERR_SUP,"Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}
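/*
   A minimal usage sketch (not part of this file): obtain the shared memory communicator attached to
   a PETSc communicator and report how many ranks share the calling process's node. comm is assumed
   to be a PETSc communicator, e.g. obtained via PetscObjectComm().

     PetscShmComm   shmcomm;
     MPI_Comm       nodecomm;
     PetscMPIInt    nodesize;
     PetscErrorCode ierr;

     ierr = PetscShmCommGet(comm,&shmcomm);CHKERRQ(ierr);
     ierr = PetscShmCommGetMpiShmComm(shmcomm,&nodecomm);CHKERRQ(ierr);
     ierr = MPI_Comm_size(nodecomm,&nodesize);CHKERRQ(ierr);
     ierr = PetscPrintf(PETSC_COMM_SELF,"%d ranks share my node\n",nodesize);CHKERRQ(ierr);
*/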
/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or MPI_PROC_NULL if it does not exist

    Level: developer

    Developer Notes:
    Assumes pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local in the same function call

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator where the shared memory communicator is built

    Level: developer

@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
#ifdef PETSC_USE_DEBUG
  if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
#endif
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm - the MPI communicator

    Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}
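/*
   A minimal round-trip sketch (not part of this file): translate the calling process's global rank
   to its node-local rank and back. pshmcomm is assumed to come from PetscShmCommGet() on comm.
   PetscShmCommGlobalToLocal() sets lrank to MPI_PROC_NULL when the global rank does not reside on
   this node, so the reverse translation is guarded.

     PetscMPIInt    grank,lrank,back;
     PetscErrorCode ierr;

     ierr = MPI_Comm_rank(comm,&grank);CHKERRQ(ierr);
     ierr = PetscShmCommGlobalToLocal(pshmcomm,grank,&lrank);CHKERRQ(ierr);
     if (lrank != MPI_PROC_NULL) {
       ierr = PetscShmCommLocalToGlobal(pshmcomm,lrank,&back);CHKERRQ(ierr);
       if (back != grank) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Rank translation mismatch");
     }
*/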
#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared(). They should have the same effect except that MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
   by 50%. Until the reason is found out, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* true on rank 0 of omp_comm (the master rank) */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu bindings of omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of cpu bindings of ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by processes in the omp_comm
   contained by the controller.

   PETSc OpenMP controller users do not call this function directly. This function exists
   only because we want to separate shared memory allocation methods from other code.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  int  fd;
  char pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* use PETSC_COMM_SELF in PetscGetTmp(), since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
    ierr = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    ierr = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd = mkstemp(pathname); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n",pathname);
    ierr = ftruncate(fd,size);CHKERRQ(ierr);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr = close(fd);CHKERRQ(ierr);
    ierr = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    /* this MPI_Barrier waits for slaves to open the file before the master unlinks it */
    ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
    ierr = unlink(pathname);CHKERRQ(ierr);
  } else {
    ierr = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    fd = open(pathname,O_RDWR); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n",pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr = close(fd);CHKERRQ(ierr);
    ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  }
#else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier makes sure the omp barrier is initialized before slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier makes sure slaves have finished using the omp barrier before the master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
#else
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
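/* A standalone sketch (not part of this file; error checks omitted) of the mechanism
   PetscOmpCtrlCreateBarrier() builds: a PTHREAD_PROCESS_SHARED pthread barrier placed in mmap()ed
   shared memory and used across two processes. Because MPI ranks are not fork() children of each
   other, the code above must share the mapping through a mkstemp()ed file instead of MAP_ANONYMOUS.
   Compile with: cc -pthread demo.c

     #include <pthread.h>
     #include <stdio.h>
     #include <sys/mman.h>
     #include <sys/wait.h>
     #include <unistd.h>

     int main(void)
     {
       pthread_barrierattr_t attr;
       pthread_barrier_t     *barrier;

       barrier = mmap(NULL,sizeof(*barrier),PROT_READ|PROT_WRITE,MAP_SHARED|MAP_ANONYMOUS,-1,0);
       pthread_barrierattr_init(&attr);
       pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);
       pthread_barrier_init(barrier,&attr,2);
       pthread_barrierattr_destroy(&attr);
       if (fork() == 0) {
         pthread_barrier_wait(barrier);
         printf("child passed the barrier\n");
         return 0;
       }
       pthread_barrier_wait(barrier);
       wait(NULL);
       pthread_barrier_destroy(barrier);
       return 0;
     }
*/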
/*@C
    PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

    Input Parameters:
+   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

    Output Parameter:
.   pctrl - a PETSc OpenMP controller

    Level: developer

.seealso: PetscOmpCtrlDestroy()
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl;
  unsigned long  *cpu_ulongs=NULL;
  PetscInt       i,nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank,shm_comm_size,omp_rank,color;
  PetscInt       num_packages,num_cores;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Init hwloc
    =================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have a faster hwloc_topology_load() */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. OMP masters make up a new comm called omp_master_comm,
    which is usually passed to third party libraries.
    =================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);

  /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE); if (num_packages <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not determine number of sockets (packages) per compute node\n");
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE); if (num_cores <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not determine number of cores per compute node\n");
    nthreads     = num_cores/num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %d cannot be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %d is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
     shm_comm_size=16 and nthreads=8, then ranks 0-7 get color 0, ranks 8-15 get
     color 1, and they are put in two omp_comms. Note that ranks of petsc_comm may or may not
     be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
     Use 0 as the key so that rank ordering won't change in the new comm.
   */
  color = shm_rank / nthreads;
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);

  /* put the rank 0s of the omp_comms (i.e., the master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED;             /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
    =================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    The omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
    is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* hwloc's main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs,cpu_ulongs,nr_cpu_ulongs,MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
#endif
    } else {
      for (i=0; i<nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
#endif
      }
    }
  }

  ierr   = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}
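/* A standalone sketch (not part of this file) of the two-level splitting performed above: first
   split MPI_COMM_WORLD by shared memory, then group every nthreads consecutive node-local ranks
   into a team and gather the team leaders into a leader communicator via the MPI_UNDEFINED color
   trick. node_comm, team_comm and leader_comm are hypothetical names; nthreads is assumed to
   divide the node size. Error checks are omitted.

     MPI_Comm node_comm,team_comm,leader_comm;
     int      node_rank,team_rank,nthreads = 4;

     MPI_Comm_split_type(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&node_comm);
     MPI_Comm_rank(node_comm,&node_rank);
     MPI_Comm_split(node_comm,node_rank/nthreads,0,&team_comm);
     MPI_Comm_rank(team_comm,&team_rank);
     MPI_Comm_split(MPI_COMM_WORLD,team_rank ? MPI_UNDEFINED : 0,0,&leader_comm);
*/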
/*@C
    PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

    Input Parameter:
.   pctrl - a PETSc OpenMP controller

    Level: developer

.seealso: PetscOmpCtrlCreate()
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  PetscOmpCtrlDestroyBarrier(ctrl);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Output Parameters:
+   omp_comm        - a communicator that includes a master rank and slave ranks where the master spawns threads
.   omp_master_comm - on master ranks, a communicator that includes the master ranks of all omp_comms;
                      on slave ranks, MPI_COMM_NULL is returned
-   is_omp_master   - true if the calling process is an OMP master rank

    Notes:
    Any output parameter can be NULL; it is then ignored.

    Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}
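/*
   A minimal lifecycle sketch (not part of this file): create a controller, hand omp_master_comm to
   a third party OpenMP library on master ranks, fence the OpenMP region, then destroy the
   controller. ThirdPartyLibSolve() is a hypothetical stand-in for the library call; petsc_comm is
   assumed to be a PETSc communicator.

     PetscOmpCtrl   ctrl;
     MPI_Comm       omp_comm,omp_master_comm;
     PetscBool      is_omp_master;
     PetscErrorCode ierr;

     ierr = PetscOmpCtrlCreate(petsc_comm,-1,&ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlGetOmpComms(ctrl,&omp_comm,&omp_master_comm,&is_omp_master);CHKERRQ(ierr);
     if (is_omp_master) {
       ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);
       ierr = ThirdPartyLibSolve(omp_master_comm);CHKERRQ(ierr);
       ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);
     }
     ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);
*/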
/*@C
    PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    This is a pthread barrier on MPI processes. Using MPI_Barrier() instead would be conceptually correct, but the MPI standard does not
    require processes blocked by MPI_Barrier() to free their CPUs to let other processes progress. In practice, to minimize latency,
    MPI processes stuck in MPI_Barrier() keep polling and do not free their CPUs. In contrast, pthread_barrier has this requirement.

    A code using PetscOmpCtrlBarrier() would look like this:
.vb
    if (is_omp_master) {
      PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
      /@ call the library using OpenMP @/
      PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
    }
    PetscOmpCtrlBarrier(ctrl);
.ve

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = pthread_barrier_wait(ctrl->barrier);
  if (ierr && ierr != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n",ierr);
  PetscFunctionReturn(0);
}
/*@C
    PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to determine whether the calling rank is a master.

    This function changes the CPU binding of master ranks and the nthreads-var of the OpenMP runtime.

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to determine whether the calling rank is a master.

    This function restores the CPU binding of master ranks and sets the nthreads-var of the OpenMP runtime back to 1.

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */