#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

struct _n_PetscShmComm {
  PetscMPIInt *globranks;       /* global ranks of each rank in the shared memory communicator */
  PetscMPIInt shmsize;          /* size of the shared memory communicator */
  MPI_Comm    globcomm,shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
};

/*
   Private routine to delete the internal shared memory communicator when a communicator is freed.

   This is called by MPI, not by users. MPI_Comm_free() calls it when the communicator that has this data as an attribute is freed.

   Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
*/
PETSC_EXTERN PetscMPIInt MPIAPI Petsc_DelComm_Shm(MPI_Comm comm,PetscMPIInt keyval,void *val,void *extra_state)
{
  PetscErrorCode ierr;
  PetscShmComm   p = (PetscShmComm)val;

  PetscFunctionBegin;
  ierr = PetscInfo1(0,"Deleting shared memory subcommunicator in a MPI_Comm %ld\n",(long)comm);CHKERRMPI(ierr);
  ierr = MPI_Comm_free(&p->shmcomm);CHKERRMPI(ierr);
  ierr = PetscFree(p->globranks);CHKERRMPI(ierr);
  ierr = PetscFree(val);CHKERRMPI(ierr);
  PetscFunctionReturn(MPI_SUCCESS);
}
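
/*
   Illustrative sketch (not code from this file) of how the delete callback above is tied to the
   Petsc_ShmComm_keyval attribute key; PETSc performs the actual MPI_Comm_create_keyval() call
   during its initialization:

     ierr = MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN,Petsc_DelComm_Shm,&Petsc_ShmComm_keyval,(void*)0);CHKERRQ(ierr);

   Thereafter, freeing any communicator that carries a value under Petsc_ShmComm_keyval (set by
   PetscShmCommGet() below) causes MPI to invoke Petsc_DelComm_Shm() on that value.
*/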

/*@C
    PetscShmCommGet - Given a PETSc communicator, returns a communicator of all ranks that share a common memory

    Collective on globcomm

    Input Parameter:
.   globcomm - MPI_Comm

    Output Parameter:
.   pshmcomm - the PETSc shared memory communicator object

    Level: developer

    Notes:
    This should be called only with a PetscCommDuplicate() communicator

    When used with MPICH, MPICH must be configured with --download-mpich-device=ch3:nemesis

    Concepts: MPI subcomm^numbering

@*/
PetscErrorCode PetscShmCommGet(MPI_Comm globcomm,PetscShmComm *pshmcomm)
{
#ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
  PetscErrorCode   ierr;
  MPI_Group        globgroup,shmgroup;
  PetscMPIInt      *shmranks,i,flg;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_get_attr(globcomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(globcomm,PETSC_ERR_ARG_CORRUPT,"Bad MPI communicator supplied; must be a PETSc communicator");

  /* return the cached shared memory communicator if one was already built for this communicator */
  ierr = MPI_Comm_get_attr(globcomm,Petsc_ShmComm_keyval,pshmcomm,&flg);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);

  ierr = PetscNew(pshmcomm);CHKERRQ(ierr);
  (*pshmcomm)->globcomm = globcomm;

  ierr = MPI_Comm_split_type(globcomm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&(*pshmcomm)->shmcomm);CHKERRQ(ierr);

  ierr = MPI_Comm_size((*pshmcomm)->shmcomm,&(*pshmcomm)->shmsize);CHKERRQ(ierr);
  ierr = MPI_Comm_group(globcomm,&globgroup);CHKERRQ(ierr);
  ierr = MPI_Comm_group((*pshmcomm)->shmcomm,&shmgroup);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&shmranks);CHKERRQ(ierr);
  ierr = PetscMalloc1((*pshmcomm)->shmsize,&(*pshmcomm)->globranks);CHKERRQ(ierr);
  /* translate the local ranks 0..shmsize-1 of the shared memory communicator into their global ranks */
  for (i=0; i<(*pshmcomm)->shmsize; i++) shmranks[i] = i;
  ierr = MPI_Group_translate_ranks(shmgroup,(*pshmcomm)->shmsize,shmranks,globgroup,(*pshmcomm)->globranks);CHKERRQ(ierr);
  ierr = PetscFree(shmranks);CHKERRQ(ierr);
  ierr = MPI_Group_free(&globgroup);CHKERRQ(ierr);
  ierr = MPI_Group_free(&shmgroup);CHKERRQ(ierr);

  for (i=0; i<(*pshmcomm)->shmsize; i++) {
    ierr = PetscInfo2(NULL,"Shared memory rank %d global rank %d\n",i,(*pshmcomm)->globranks[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Comm_set_attr(globcomm,Petsc_ShmComm_keyval,*pshmcomm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
#else
  SETERRQ(globcomm,PETSC_ERR_SUP,"Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
#endif
}

/*@C
    PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   grank    - the global rank

    Output Parameter:
.   lrank - the local rank, or MPI_PROC_NULL if it does not exist

    Level: developer

    Developer Notes:
    Assumes pshmcomm->globranks[] is sorted

    It may be better to rewrite this to map multiple global ranks to local ranks in the same function call

    Concepts: MPI subcomm^numbering

@*/
PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm,PetscMPIInt grank,PetscMPIInt *lrank)
{
  PetscMPIInt    low,high,t,i;
  PetscBool      flg = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *lrank = MPI_PROC_NULL;
  if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(0);
  if (grank > pshmcomm->globranks[pshmcomm->shmsize-1]) PetscFunctionReturn(0);
  ierr = PetscOptionsGetBool(NULL,NULL,"-noshared",&flg,NULL);CHKERRQ(ierr);
  if (flg) PetscFunctionReturn(0);
  /* binary search narrows the range to a handful of entries, then a linear scan finds the match */
  low  = 0;
  high = pshmcomm->shmsize;
  while (high-low > 5) {
    t = (low+high)/2;
    if (pshmcomm->globranks[t] > grank) high = t;
    else low = t;
  }
  for (i=low; i<high; i++) {
    if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(0);
    if (pshmcomm->globranks[i] == grank) {
      *lrank = i;
      PetscFunctionReturn(0);
    }
  }
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank

    Input Parameters:
+   pshmcomm - the shared memory communicator object
-   lrank    - the local rank in the shared memory communicator

    Output Parameter:
.   grank - the global rank in the global communicator where the shared memory communicator is built

    Level: developer

    Concepts: MPI subcomm^numbering
@*/
PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm,PetscMPIInt lrank,PetscMPIInt *grank)
{
  PetscFunctionBegin;
#ifdef PETSC_USE_DEBUG
  if (lrank < 0 || lrank >= pshmcomm->shmsize) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"No rank %d in the shared memory communicator",lrank);
#endif
  *grank = pshmcomm->globranks[lrank];
  PetscFunctionReturn(0);
}

/*@C
    PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

    Input Parameter:
.   pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

    Output Parameter:
.   comm - the MPI communicator

    Level: developer

@*/
PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm,MPI_Comm *comm)
{
  PetscFunctionBegin;
  *comm = pshmcomm->shmcomm;
  PetscFunctionReturn(0);
}
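
/*
   A minimal usage sketch of the PetscShmComm API above (illustrative only; obj stands for any
   PETSc object whose communicator was created with PetscCommDuplicate(), as PetscShmCommGet()
   requires):

     PetscShmComm shmcomm;
     MPI_Comm     shm;
     PetscMPIInt  grank,lrank;

     ierr = PetscShmCommGet(PetscObjectComm((PetscObject)obj),&shmcomm);CHKERRQ(ierr);
     ierr = PetscShmCommGetMpiShmComm(shmcomm,&shm);CHKERRQ(ierr);
     ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)obj),&grank);CHKERRQ(ierr);
     ierr = PetscShmCommGlobalToLocal(shmcomm,grank,&lrank);CHKERRQ(ierr);

   Here shm contains the ranks that share memory with this process, and lrank is this process's
   rank within shm (MPI_PROC_NULL if grank is not in shm).
*/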

#if defined(PETSC_HAVE_OPENMP_SUPPORT)
#include <pthread.h>
#include <hwloc.h>
#include <omp.h>

/* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
   otherwise use MPI_Win_allocate_shared. They should have the same effect, and MPI-3 is much
   simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's
   performance by 50%. Until the reason is understood, we use mmap() instead.
*/
#define USE_MMAP_ALLOCATE_SHARED_MEMORY

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

struct _n_PetscOmpCtrl {
  MPI_Comm          omp_comm;        /* a shared memory communicator to spawn omp threads */
  MPI_Comm          omp_master_comm; /* a communicator to give to third party libraries */
  PetscMPIInt       omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
  PetscBool         is_omp_master;   /* PETSC_TRUE on rank 0 of omp_comm */
  MPI_Win           omp_win;         /* a shared memory window containing a barrier */
  pthread_barrier_t *barrier;        /* pointer to the barrier */
  hwloc_topology_t  topology;
  hwloc_cpuset_t    cpuset;          /* cpu binding of the omp master */
  hwloc_cpuset_t    omp_cpuset;      /* union of the cpu bindings of all ranks in omp_comm */
};

/* Allocate and initialize a pthread_barrier_t object in memory shared by the processes in the
   omp_comm contained in the controller.

   Users of the PETSc OpenMP controller do not call this function directly. It exists only to
   separate the shared memory allocation method from the rest of the code.
*/
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode        ierr;
  MPI_Aint              size;
  void                  *baseptr;
  pthread_barrierattr_t attr;

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  PetscInt  fd;
  PetscChar pathname[PETSC_MAX_PATH_LEN];
#else
  PetscMPIInt disp_unit;
#endif

  PetscFunctionBegin;
#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  size = sizeof(pthread_barrier_t);
  if (ctrl->is_omp_master) {
    /* PetscGetTmp() is a collective call, so use PETSC_COMM_SELF; using omp_comm would broadcast the partially populated pathname to the slaves */
    ierr = PetscGetTmp(PETSC_COMM_SELF,pathname,PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    ierr = PetscStrlcat(pathname,"/petsc-shm-XXXXXX",PETSC_MAX_PATH_LEN);CHKERRQ(ierr);
    /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
    fd   = mkstemp(pathname); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not create tmp file %s with mkstemp\n",pathname);
    ierr = ftruncate(fd,size);CHKERRQ(ierr);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr = close(fd);CHKERRQ(ierr);
    ierr = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
    ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
    ierr = unlink(pathname);CHKERRQ(ierr);
  } else {
    ierr = MPI_Bcast(pathname,PETSC_MAX_PATH_LEN,MPI_CHAR,0,ctrl->omp_comm);CHKERRQ(ierr);
    fd   = open(pathname,O_RDWR); if (fd == -1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not open tmp file %s\n",pathname);
    baseptr = mmap(NULL,size,PROT_READ | PROT_WRITE,MAP_SHARED,fd,0); if (baseptr == MAP_FAILED) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"mmap() failed\n");
    ierr = close(fd);CHKERRQ(ierr);
    ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  }
#else
  size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
  ierr = MPI_Win_allocate_shared(size,1,MPI_INFO_NULL,ctrl->omp_comm,&baseptr,&ctrl->omp_win);CHKERRQ(ierr);
  ierr = MPI_Win_shared_query(ctrl->omp_win,0,&size,&disp_unit,&baseptr);CHKERRQ(ierr);
#endif
  ctrl->barrier = (pthread_barrier_t*)baseptr;

  /* the omp master initializes the barrier */
  if (ctrl->is_omp_master) {
    ierr = MPI_Comm_size(ctrl->omp_comm,&ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_init(&attr);CHKERRQ(ierr);
    ierr = pthread_barrierattr_setpshared(&attr,PTHREAD_PROCESS_SHARED);CHKERRQ(ierr); /* make the barrier also work for processes, not just threads */
    ierr = pthread_barrier_init(ctrl->barrier,&attr,(unsigned int)ctrl->omp_comm_size);CHKERRQ(ierr);
    ierr = pthread_barrierattr_destroy(&attr);CHKERRQ(ierr);
  }

  /* this MPI_Barrier makes sure the omp barrier is initialized before the slaves use it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the pthread barrier in the PETSc OpenMP controller */
PETSC_STATIC_INLINE PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* this MPI_Barrier makes sure the slaves have finished using the omp barrier before the master destroys it */
  ierr = MPI_Barrier(ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) { ierr = pthread_barrier_destroy(ctrl->barrier);CHKERRQ(ierr); }

#if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
  ierr = munmap(ctrl->barrier,sizeof(pthread_barrier_t));CHKERRQ(ierr);
#else
  ierr = MPI_Win_free(&ctrl->omp_win);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

    Input Parameters:
+   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
-   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

    Output Parameter:
.   pctrl - a PETSc OpenMP controller

    Level: developer

.seealso PetscOmpCtrlDestroy()
@*/
PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm,PetscInt nthreads,PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl;
  unsigned long  *cpu_ulongs=NULL;
  PetscInt       i,nr_cpu_ulongs;
  PetscShmComm   pshmcomm;
  MPI_Comm       shm_comm;
  PetscMPIInt    shm_rank,shm_comm_size,omp_rank,color;
  PetscInt       num_packages,num_cores;

  PetscFunctionBegin;
  ierr = PetscNew(&ctrl);CHKERRQ(ierr);

  /*=================================================================================
    Init hwloc
    ==================================================================================*/
  ierr = hwloc_topology_init(&ctrl->topology);CHKERRQ(ierr);
#if HWLOC_API_VERSION >= 0x00020000
  /* to filter out unneeded info and have a faster hwloc_topology_load */
  ierr = hwloc_topology_set_all_types_filter(ctrl->topology,HWLOC_TYPE_FILTER_KEEP_NONE);CHKERRQ(ierr);
  ierr = hwloc_topology_set_type_filter(ctrl->topology,HWLOC_OBJ_CORE,HWLOC_TYPE_FILTER_KEEP_ALL);CHKERRQ(ierr);
#endif
  ierr = hwloc_topology_load(ctrl->topology);CHKERRQ(ierr);

  /*=================================================================================
    Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
    physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
    the others are called slaves. The OMP masters make up a new comm called
    omp_master_comm, which is usually passed to third party libraries.
    ==================================================================================*/

  /* fetch the stored shared memory communicator */
  ierr = PetscShmCommGet(petsc_comm,&pshmcomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(pshmcomm,&shm_comm);CHKERRQ(ierr);

  ierr = MPI_Comm_rank(shm_comm,&shm_rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(shm_comm,&shm_comm_size);CHKERRQ(ierr);

  /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
  if (nthreads == -1) {
    num_packages = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_PACKAGE); if (num_packages <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not determine number of sockets (packages) per compute node\n");
    num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology,HWLOC_OBJ_CORE);    if (num_cores    <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not determine number of cores per compute node\n");
    nthreads     = num_cores/num_packages;
    if (nthreads > shm_comm_size) nthreads = shm_comm_size;
  }

  if (nthreads < 1 || nthreads > shm_comm_size) SETERRQ2(petsc_comm,PETSC_ERR_ARG_OUTOFRANGE,"number of OpenMP threads %D can not be < 1 or > the MPI shared memory communicator size %d\n",nthreads,shm_comm_size);
  if (shm_comm_size % nthreads) { ierr = PetscPrintf(petsc_comm,"Warning: number of OpenMP threads %D is not a factor of the MPI shared memory communicator size %d, which may cause load imbalance!\n",nthreads,shm_comm_size);CHKERRQ(ierr); }

  /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if shm_comm_size=16 and
     nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get color 1, so they are put in two
     omp_comms. Note that petsc_ranks may or may not be consecutive in a shm_comm, but shm_ranks
     always run from 0 to shm_comm_size-1. Use 0 as the key so that the rank ordering does not
     change in the new comm.
   */
  color = shm_rank / nthreads;
  ierr  = MPI_Comm_split(shm_comm,color,0/*key*/,&ctrl->omp_comm);CHKERRQ(ierr);

  /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
  ierr = MPI_Comm_rank(ctrl->omp_comm,&omp_rank);CHKERRQ(ierr);
  if (!omp_rank) {
    ctrl->is_omp_master = PETSC_TRUE;  /* master */
    color = 0;
  } else {
    ctrl->is_omp_master = PETSC_FALSE; /* slave */
    color = MPI_UNDEFINED;             /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
  }
  ierr = MPI_Comm_split(petsc_comm,color,0/*key*/,&ctrl->omp_master_comm);CHKERRQ(ierr); /* rank 0 in omp_master_comm is rank 0 in petsc_comm */

  /*=================================================================================
    Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
    slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
    and run them on the idle CPUs.
    ==================================================================================*/
  ierr = PetscOmpCtrlCreateBarrier(ctrl);CHKERRQ(ierr);

  /*=================================================================================
    The omp master records its cpu binding (i.e., cpu set) and computes a new binding
    that is the union of the bindings of all ranks in the omp_comm
    =================================================================================*/

  ctrl->cpuset = hwloc_bitmap_alloc(); if (!ctrl->cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
  ierr = hwloc_get_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);

  /* hwloc's main developer said new APIs hwloc_bitmap_{nr,to,from}_ulongs will be added in 2.1 to simplify the following bitmap pack/unpack code */
  nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology))+sizeof(unsigned long)*8)/sizeof(unsigned long)/8;
  ierr = PetscMalloc1(nr_cpu_ulongs,&cpu_ulongs);CHKERRQ(ierr);
  if (nr_cpu_ulongs == 1) {
    cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
  } else {
    for (i=0; i<nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset,(unsigned)i);
  }

  /* bitwise-OR the packed cpu sets of all ranks in omp_comm onto the master */
  ierr = MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs,cpu_ulongs,nr_cpu_ulongs,MPI_UNSIGNED_LONG,MPI_BOR,0,ctrl->omp_comm);CHKERRQ(ierr);

  if (ctrl->is_omp_master) {
    ctrl->omp_cpuset = hwloc_bitmap_alloc(); if (!ctrl->omp_cpuset) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"hwloc_bitmap_alloc() failed\n");
    if (nr_cpu_ulongs == 1) {
#if HWLOC_API_VERSION >= 0x00020000
      ierr = hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);CHKERRQ(ierr);
#else
      hwloc_bitmap_from_ulong(ctrl->omp_cpuset,cpu_ulongs[0]);
#endif
    } else {
      for (i=0; i<nr_cpu_ulongs; i++) {
#if HWLOC_API_VERSION >= 0x00020000
        ierr = hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);CHKERRQ(ierr);
#else
        hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset,(unsigned)i,cpu_ulongs[i]);
#endif
      }
    }
  }

  ierr   = PetscFree(cpu_ulongs);CHKERRQ(ierr);
  *pctrl = ctrl;
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

    Input Parameter:
.   pctrl - a PETSc OpenMP controller

    Level: developer

.seealso PetscOmpCtrlCreate()
@*/
PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
{
  PetscErrorCode ierr;
  PetscOmpCtrl   ctrl = *pctrl;

  PetscFunctionBegin;
  hwloc_bitmap_free(ctrl->cpuset);
  hwloc_topology_destroy(ctrl->topology);
  PetscOmpCtrlDestroyBarrier(ctrl);
  ierr = MPI_Comm_free(&ctrl->omp_comm);CHKERRQ(ierr);
  if (ctrl->is_omp_master) {
    hwloc_bitmap_free(ctrl->omp_cpuset);
    ierr = MPI_Comm_free(&ctrl->omp_master_comm);CHKERRQ(ierr);
  }
  ierr = PetscFree(ctrl);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlGetOmpComms - Get the MPI communicators from a PETSc OMP controller

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Output Parameters:
+   omp_comm        - a communicator that includes a master rank and slave ranks where the master spawns threads
.   omp_master_comm - on master ranks, a communicator that includes the master ranks of each omp_comm;
                      on slave ranks, MPI_COMM_NULL is returned
-   is_omp_master   - true if the calling process is an OMP master rank

    Notes: any output parameter can be NULL, in which case it is ignored

    Level: developer
@*/
PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl,MPI_Comm *omp_comm,MPI_Comm *omp_master_comm,PetscBool *is_omp_master)
{
  PetscFunctionBegin;
  if (omp_comm)        *omp_comm        = ctrl->omp_comm;
  if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
  if (is_omp_master)   *is_omp_master   = ctrl->is_omp_master;
  PetscFunctionReturn(0);
}
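
/*
   A sketch of how the controller is typically driven (illustrative only; petsc_comm stands for
   the communicator of the PETSc objects handed to the third party library). The pattern mirrors
   the snippet in the PetscOmpCtrlBarrier() notes below:

     PetscOmpCtrl ctrl;
     PetscBool    is_master;

     ierr = PetscOmpCtrlCreate(petsc_comm,-1,&ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlGetOmpComms(ctrl,NULL,NULL,&is_master);CHKERRQ(ierr);
     if (is_master) {
       ierr = PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);CHKERRQ(ierr);
       (call the third party library, which may spawn OpenMP threads on the idled CPUs)
       ierr = PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);CHKERRQ(ierr);
     }
     ierr = PetscOmpCtrlBarrier(ctrl);CHKERRQ(ierr);
     ierr = PetscOmpCtrlDestroy(&ctrl);CHKERRQ(ierr);
*/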

/*@C
    PetscOmpCtrlBarrier - Barrier on the MPI ranks in the omp_comm contained in the PETSc OMP controller (to let slave ranks free their CPUs)

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    This is a pthread barrier on MPI processes. Using MPI_Barrier instead would be conceptually correct, but the MPI
    standard does not require processes blocked in MPI_Barrier to free their CPUs to let other processes progress.
    In practice, to minimize latency, MPI processes stuck in MPI_Barrier keep polling and do not free their CPUs.
    In contrast, a pthread barrier does have this requirement.

    Code using PetscOmpCtrlBarrier() typically looks like this:

       if (is_omp_master) {
         PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
         Call the library using OpenMP
         PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
       }
       PetscOmpCtrlBarrier(ctrl);

    Level: developer

.seealso PetscOmpCtrlOmpRegionOnMasterBegin(), PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = pthread_barrier_wait(ctrl->barrier);
  if (ierr && ierr != PTHREAD_BARRIER_SERIAL_THREAD) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d\n",ierr);
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to determine whether this is a master rank.
    This function changes the CPU binding of master ranks and the number of threads used by the OpenMP runtime.

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterEnd()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->omp_cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
  PetscFunctionReturn(0);
}

/*@C
    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

    Input Parameter:
.   ctrl - a PETSc OMP controller

    Notes:
    Only master ranks can call this function. Call PetscOmpCtrlGetOmpComms() to determine whether this is a master rank.
    This function restores the CPU binding of master ranks and sets the number of threads used by the OpenMP runtime back to 1.

    Level: developer

.seealso: PetscOmpCtrlOmpRegionOnMasterBegin()
@*/
PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = hwloc_set_cpubind(ctrl->topology,ctrl->cpuset,HWLOC_CPUBIND_PROCESS);CHKERRQ(ierr);
  omp_set_num_threads(1);
  PetscFunctionReturn(0);
}

#undef USE_MMAP_ALLOCATE_SHARED_MEMORY
#endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */