import config.package
import os

class Configure(config.package.Package):
  '''Configure support for the CUDA toolkit: runtime, math libraries (cuBLAS/cuFFT/cuSPARSE/cuSOLVER/cuRAND),
     NVML, Thrust, and the driver-API stub library. Handles system, CUDAToolkit and NVHPC directory layouts.'''
  def __init__(self, framework):
    config.package.Package.__init__(self, framework)
    self.minversion       = '7.5'
    self.versionname      = 'CUDA_VERSION'
    self.versioninclude   = 'cuda.h'
    self.requiresversion  = 1
    self.functions        = ['cublasInit','cufftDestroy','nvmlInit_v2']
    self.includes         = ['cublas.h','cufft.h','cusparse.h','cusolverDn.h','curand.h','thrust/version.h','nvml.h']
    self.basicliblist     = [['libcudart.a','libnvtx3interop.a'],['libcudart.a','libnvToolsExt.a']]
    self.mathliblist      = [['libcufft.a','libcublas.a','libcusparse.a','libcusolver.a','libcurand.a']]
    # CUDA provides 2 variants of libcuda.so (for access to CUDA driver API):
    # - fully functional compile, runtime libraries installed with the GPU driver
    #   (for ex:) /usr/lib64/libcuda.so (compile), libcuda.so.1 (runtime)
    # - stub library - usable only for compiles
    #   (for ex:) /usr/local/cuda/lib64/stubs/libcuda.so (without corresponding libcuda.so.1 for runtime)
    # We are preferring this stub library - as it enables compiles on non-GPU nodes (for ex: login nodes).
    # Using RPATH to this stub location is not appropriate - so skipping via libraries.rpathSkipDirs()
    # Note: PETSc does not use CUDA driver API (as of Sep 29, 2021), but external package for ex: Kokkos does.
    #
    # see more at https://stackoverflow.com/a/52784819
    self.stubliblist      = [['libcuda.so','libnvidia-ml.so']]
    self.liblist          = 'dummy' # existence of self.liblist is used by package.py to determine if --with-cuda-lib must be provided
    self.precisions       = ['single','double']
    self.buildLanguages   = ['CUDA']
    self.functionsDefine  = ['cusolverDnDpotri']
    self.isnvhpc          = 0
    self.devicePackage    = 1
    self.skipMPIDependency = 1
    return

  def setupHelp(self, help):
    '''Register the --with-cuda-arch configure option'''
    import nargs
    config.package.Package.setupHelp(self, help)
    help.addArgument(
      'CUDA', '-with-cuda-arch',
      nargs.ArgString(
        None, None,
        'Cuda architecture for code generation, for example 70 (this may be used by external '
        'packages). A comma-separated list can be passed to target multiple architectures (e.g. '
        'for distribution). When using the nvcc compiler, other possible options include "all", '
        '"all-major", and "native" (see documentation of the nvcc "--gpu-architecture" flag)'
      )
    )
    return

  def __str__(self):
    '''Append CUDA arch and underlying-compiler information to the standard package summary'''
    output = config.package.Package.__str__(self)
    if hasattr(self,'cudaArch'):
      output += '  CUDA SM '+self.cudaArch+'\n'
    if hasattr(self.setCompilers,'CUDA_CXX'):
      output += '  CUDA underlying compiler: CUDA_CXX=' + self.setCompilers.CUDA_CXX + '\n'
    if hasattr(self.setCompilers,'CUDA_CXXFLAGS'):
      output += '  CUDA underlying compiler flags: CUDA_CXXFLAGS=' + self.setCompilers.CUDA_CXXFLAGS + '\n'
    if hasattr(self.setCompilers,'CUDA_CXXLIBS'):
      output += '  CUDA underlying linker libraries: CUDA_CXXLIBS=' + self.setCompilers.CUDA_CXXLIBS + '\n'
    return output

  def cudaArchIsVersionList(self):
    "whether the CUDA arch is a list of version numbers (vs a string like 'all')"
    try:
      self.cudaArchList()
    except RuntimeError:
      return False
    else:
      return True

  def cudaArchList(self):
    '''
    a list of the given cuda arch numbers.
    raises RuntimeError if cuda arch is not a list of version numbers
    '''
    if not hasattr(self,'cudaArch'):
      raise RuntimeError('cudaArch is not set') from None

    arch_list = self.cudaArch.split(',')

    # every entry must be an integer version number (e.g. 70); keywords like 'all' are rejected
    try:
      for v in arch_list:
        int(v)
    except ValueError as e:
      msg = 'only explicit cuda arch version numbers supported for this package '
      msg += '(got "'+self.cudaArch+'")'
      raise RuntimeError(msg) from None

    return arch_list

  def cudaArchSingle(self):
    '''
    Returns the single given CUDA arch, or raises RuntimeError if something else was specified
    (like a list of numbers or "all")
    '''
    arch_list = self.cudaArchList()
    if len(arch_list) > 1:
      raise RuntimeError('this package can only be compiled to target a single explicit CUDA arch '
                         'version (got "'+self.cudaArch+'")')
    return arch_list[0]

  def nvccArchFlags(self):
    '''Return nvcc flags selecting the requested CUDA architecture(s)'''
    if not self.cudaArchIsVersionList():
      # a keyword such as 'all', 'all-major' or 'native': pass it through verbatim
      return ' -arch='+self.cudaArch

    # win32fe cannot forward '--flag value' pairs, so glue flag and value with '='
    if self.setCompilers.isCygwin(self.log):
      arg_sep = '='
    else:
      arg_sep = ' '

    # generate both SASS and PTX for the arch, see https://stackoverflow.com/a/35657430/3447299
    # e.g., '-arch=sm_50' is equivalent to '-arch=compute_50 -code=sm_50,compute_50'.
    # nvcc accepts '--gpu-architecture' (-arch) only once, so for (possibly multiple) explicit
    # archs emit one '--generate-code' option per arch instead
    return ''.join(' --generate-code'+arg_sep+'arch=compute_{0},code=[compute_{0},sm_{0}]'.format(gen) for gen in self.cudaArchList())

  def clangArchFlags(self):
    '''Return clang flags selecting the requested CUDA architecture(s); clang has no 'all'-style keywords'''
    if not self.cudaArchIsVersionList():
      raise RuntimeError('clang only supports cuda archs specified as version number(s) (got "'+self.cudaArch+'")')
    return ''.join(' --cuda-gpu-arch=sm_'+gen for gen in self.cudaArchList())

  def getCmakeCUDAArchFlag(self):
    '''Return a list with the CMake CUDA arch definition, or an empty list when no arch was determined'''
    # CMake supports 'all', 'all-major', 'native', and a semicolon-separated list of numbers
    if hasattr(self,'cudaArch'):
      return ['-DCMAKE_CUDA_ARCHITECTURES:STRING="{}"'.format(self.cudaArch.replace(',', ';'))]
    else:
      return []

  def setupDependencies(self, framework):
    config.package.Package.setupDependencies(self, framework)
    self.scalarTypes = framework.require('PETSc.options.scalarTypes',self)
    self.compilers   = framework.require('config.compilers',self)
    self.thrust      = framework.require('config.packages.Thrust',self)
    self.libraries   = framework.require('config.libraries', self)
    self.odeps       = [self.thrust] # if user supplies thrust, install it first
    return

  def getSearchDirectories(self):
    '''Yield candidate CUDA roots, preferring the directory detected in setCudaDir()'''
    if hasattr(self, 'cudaDir'):
      yield self.cudaDir
    for i in config.package.Package.getSearchDirectories(self):
      yield i
    return

  def getIncludeDirs(self, prefix, includeDir):
    '''Generate cuda include dirs'''
    # See comments below in generateLibList() for different prefix formats.
    # format A, prefix = /path/cuda-11.4.0/, includeDir = 'include'. The superclass's method handles this well.
    incDirs = config.package.Package.getIncludeDirs(self, prefix, includeDir)

    if not isinstance(incDirs, list):
      incDirs = [incDirs]

    # format B and C, prefix = /path/nvhpc/Linux_x86_64/21.7/compilers or /path/nvhpc/Linux_x86_64/21.7/cuda
    nvhpcDir        = os.path.dirname(prefix) # /path/nvhpc/Linux_x86_64/21.7
    nvhpcCudaIncDir = os.path.join(nvhpcDir,'cuda','include')
    nvhpcMathIncDir = os.path.join(nvhpcDir,'math_libs','include')
    if os.path.isdir(nvhpcCudaIncDir) and os.path.isdir(nvhpcMathIncDir):
      incDirs.extend([nvhpcCudaIncDir,nvhpcMathIncDir])

    # format D, prefix = /path/nvhpc/Linux_x86_64/21.7/cuda/11.4
    nvhpcDir           = os.path.dirname(os.path.dirname(prefix)) # /path/nvhpc/Linux_x86_64/21.7
    ver                = os.path.basename(prefix)                 # 11.4
    nvhpcCudaVerIncDir = os.path.join(nvhpcDir,'cuda',ver,'include')
    nvhpcMathVerIncDir = os.path.join(nvhpcDir,'math_libs',ver,'include')
    if os.path.isdir(nvhpcCudaVerIncDir) and os.path.isdir(nvhpcMathVerIncDir):
      incDirs.extend([nvhpcCudaVerIncDir,nvhpcMathVerIncDir])
    return incDirs

  def generateLibList(self, directory):
    '''Generate cuda liblist. The difficulty comes from that cuda can be in different directory structures through system, CUDAToolkit or NVHPC'''

    def fixWinLib(liblist):
      # libfoo.a / libfoo.so -> foo.lib: strip the 'lib' prefix and the Unix extension
      return [os.path.splitext(lib)[0][3:]+'.lib' for lib in liblist]

    if self.setCompilers.isCygwin(self.log):
      self.basicliblist = [fixWinLib(lib) for lib in self.basicliblist]
      self.mathliblist  = [fixWinLib(lib) for lib in self.mathliblist]
      self.stubliblist  = [fixWinLib(lib) for lib in self.stubliblist]

    # 1) From system installation (ex. Ubuntu 21.10), all libraries are on the compiler (nvcc)'s default search paths
    #    /usr/bin/nvcc
    #    /usr/include
    #    /usr/lib/x86_64-linux-gnu/{libcudart.so,..,libcuda.so,..,stubs}. See https://wiki.ubuntu.com/MultiarchSpec for info on this new directory structure
    #
    # 2) CUDAToolkit, with a directory structure like
    #    /path/cuda-11.4.0/{lib64, lib64/stubs}, here lib64/ contains all basic and math libraries
    #                     +/include
    #                     +/bin/{nvcc,..}
    #
    # 3) NVHPC, with a directory structure like
    #    /path/nvhpc/Linux_x86_64/21.7/compilers/bin/{nvcc,nvc,nvc++}
    #                                 +/cuda/{include,bin/nvcc,lib64,lib64/stubs}, just symbol links to what in cuda/11.4
    #                                 +/cuda/11.4/{include,bin/nvcc,lib64,lib64/stubs}
    #                                 +/math_libs/{include,lib64,lib64/stubs}, just symbol links to what in math_libs/11.4
    #                                 +/math_libs/11.4/{include,lib64,lib64/stubs}
    #                                 +/comm_libs/mpi/bin/{mpicc,mpicxx,mpifort}
    #
    # The input argument 'directory' could be in these formats:
    # 0) ''  We are checking if the compiler by default supports the libraries
    # A) /path/cuda-11.4.0/lib64, by loading a CUDAToolkit or --with-cuda-dir=/path/cuda-11.4.0
    # B) /path/nvhpc/Linux_x86_64/21.7/compilers/lib64, by loading a NVHPC module
    # C) /path/nvhpc/Linux_x86_64/21.7/cuda/lib64, by --with-cuda-dir=/path/Linux_x86_64/21.7/cuda/
    # D) /path/nvhpc/Linux_x86_64/21.7/cuda/11.4/lib64, by --with-cuda-dir=/path/Linux_x86_64/21.7/cuda/11.4

    # directory is None (''). Test if the compiler by default supports all libraries including the stub
    if not directory and not self.isnvhpc:
      self.liblist = [basicliblist+mathliblist+stubliblist for basicliblist in self.basicliblist for mathliblist in self.mathliblist for stubliblist in self.stubliblist]
      liblist = config.package.Package.generateLibList(self, directory)
      return liblist

    # 'directory' is in format A, with basic and math libraries in one directory.
    liblist = [] # initialize
    if not self.isnvhpc:
      toolkitCudaLibDir = directory
      toolkitStubLibDir = os.path.join(toolkitCudaLibDir,'stubs')
      if os.path.isdir(toolkitCudaLibDir) and os.path.isdir(toolkitStubLibDir):
        self.libraries.addRpathSkipDir(toolkitStubLibDir)
        self.liblist = [basicliblist+mathliblist for basicliblist in self.basicliblist for mathliblist in self.mathliblist]
        cudaliblist  = config.package.Package.generateLibList(self, toolkitCudaLibDir)
        self.liblist = self.stubliblist
        stubliblist  = config.package.Package.generateLibList(self,toolkitStubLibDir)
        liblist      = [cudalib+stublib for cudalib in cudaliblist for stublib in stubliblist]

    # 'directory' is in format B or C, and we peel 'directory' two times.
    nvhpcDir        = os.path.dirname(os.path.dirname(directory)) # /path/nvhpc/Linux_x86_64/21.7
    nvhpcCudaLibDir = os.path.join(nvhpcDir,'cuda','lib64')
    nvhpcMathLibDir = os.path.join(nvhpcDir,'math_libs','lib64')
    nvhpcStubLibDir = os.path.join(nvhpcDir,'cuda','lib64','stubs')
    if os.path.isdir(nvhpcCudaLibDir) and os.path.isdir(nvhpcMathLibDir) and os.path.isdir(nvhpcStubLibDir):
      self.libraries.addRpathSkipDir(nvhpcStubLibDir)
      self.liblist = self.basicliblist
      basicliblist = config.package.Package.generateLibList(self, nvhpcCudaLibDir)
      self.liblist = self.mathliblist
      mathliblist  = config.package.Package.generateLibList(self, nvhpcMathLibDir)
      self.liblist = self.stubliblist
      stubliblist  = config.package.Package.generateLibList(self, nvhpcStubLibDir)
      liblist     += [basiclib+mathlib+stublib for basiclib in basicliblist for mathlib in mathliblist for stublib in stubliblist]
      self.math_libs_dir = os.path.join(nvhpcDir,'math_libs') # might be used by Kokkos-Kernels

    # 'directory' is in format D, and we peel 'directory' three times.
    # We preserve the version info in case a NVHPC installation provides multiple cuda versions and we'd like to respect user's choice
    nvhpcDir           = os.path.dirname(os.path.dirname(os.path.dirname(directory))) # /path/nvhpc/Linux_x86_64/21.7
    ver                = os.path.basename(os.path.dirname(directory))                 # 11.4
    nvhpcCudaVerLibDir = os.path.join(nvhpcDir,'cuda',ver,'lib64')
    nvhpcMathVerLibDir = os.path.join(nvhpcDir,'math_libs',ver,'lib64')
    nvhpcStubVerLibDir = os.path.join(nvhpcDir,'cuda',ver,'lib64','stubs')
    if os.path.isdir(nvhpcCudaVerLibDir) and os.path.isdir(nvhpcMathVerLibDir) and os.path.isdir(nvhpcStubVerLibDir):
      self.libraries.addRpathSkipDir(nvhpcStubVerLibDir)
      self.liblist = self.basicliblist
      basicliblist = config.package.Package.generateLibList(self, nvhpcCudaVerLibDir)
      self.liblist = self.mathliblist
      mathliblist  = config.package.Package.generateLibList(self, nvhpcMathVerLibDir)
      self.liblist = self.stubliblist
      stubliblist  = config.package.Package.generateLibList(self, nvhpcStubVerLibDir)
      liblist     += [basiclib+mathlib+stublib for basiclib in basicliblist for mathlib in mathliblist for stublib in stubliblist]
      self.math_libs_dir = os.path.join(nvhpcDir,'math_libs',ver)
    return liblist

  def checkSizeofVoidP(self):
    '''Checks if the CUDA compiler agrees with the C compiler on what size of void * should be'''
    self.log.write('Checking if sizeof(void*) in CUDA is the same as with regular compiler\n')
    size = self.types.checkSizeof('void *', (8, 4), lang='CUDA', save=False)
    if size != self.types.sizes['void-p']:
      raise RuntimeError('CUDA Error: sizeof(void*) with CUDA compiler is ' + str(size) + ' which differs from sizeof(void*) with C compiler')
    return

  def checkThrustVersion(self,minVer):
    '''Check if thrust version is >= minVer '''
    include = '#include <thrust/version.h> \n#if THRUST_VERSION < ' + str(minVer) + '\n#error "thrust version is too low"\n#endif\n'
    self.pushLanguage('CUDA')
    valid = self.checkCompile(include)
    self.popLanguage()
    return valid

  def configureTypes(self):
    '''Verify precision/complex support of the CUDA toolchain'''
    import config.setCompilers
    if not self.getDefaultPrecision() in ['double', 'single']:
      raise RuntimeError('Must use either single or double precision with CUDA')
    self.checkSizeofVoidP()
    # if no user-supplied thrust, check the system's complex ability
    if not self.thrust.found and self.scalarTypes.scalartype == 'complex':
      if not self.checkThrustVersion(100908):
        raise RuntimeError('CUDA Error: The thrust library is too low to support PetscComplex. Use --download-thrust or --with-thrust-dir to give a thrust >= 1.9.8')
    return

  def versionToStandardForm(self,ver):
    '''Converts from CUDA 7050 notation to standard notation 7.5'''
    return ".".join(map(str,[int(ver)//1000, int(ver)//10%10]))

  def checkNVCCDoubleAlign(self):
    '''Verify that the CUDA compiler and the C compiler agree on struct alignment for doubles'''
    if 'known-cuda-align-double' in self.argDB:
      if not self.argDB['known-cuda-align-double']:
        raise RuntimeError('CUDA error: PETSc currently requires that CUDA double alignment match the C compiler')
    else:
      typedef   = 'typedef struct {double a; int b;} teststruct;\n'
      cuda_size = self.types.checkSizeof('teststruct', (16, 12), lang='CUDA', codeBegin=typedef, save=False)
      c_size    = self.types.checkSizeof('teststruct', (16, 12), lang='C', codeBegin=typedef, save=False)
      if c_size != cuda_size:
        raise RuntimeError('CUDA compiler error: memory alignment doesn\'t match C compiler (try adding -malign-double to compiler options)')
    return

  def setCudaDir(self):
    '''Detect a conventional CUDA installation directory (self.cudaDir) and whether the compiler is clang or from NVHPC'''
    self.pushLanguage('CUDA')
    petscNvcc = self.getCompiler()
    self.cudaclang = self.setCompilers.isClang(petscNvcc, self.log)
    self.popLanguage()

    # The presence of the cudaDir attribute means that PETSc has detected a conventional installation of CUDA.
    # This seems to be needed by some external packages that can build against it.
    # PETSc can be built when the various components (cudaruntime, cublas, etc) are scattered in different locations,
    # like in the case of NVIDIA packages from pip.
    if 'with-cuda-dir' in self.argDB and os.path.exists(os.path.join(self.argDB['with-cuda-dir'],'include','cuda.h')):
      self.cudaDir = self.argDB['with-cuda-dir']
    if self.setCompilers.isCygwin(self.log): # Handle win32fe nvcc as the compiler name
      petscNvcc = petscNvcc.split(' ')[1]

    self.getExecutable(petscNvcc,getFullPath=1,resultName='systemNvcc')
    if hasattr(self,'systemNvcc') and not hasattr(self, 'cudaDir'):
      if self.cudaclang:
        # clang reports the CUDA installation it found on stderr of 'clang -v'
        (out, err, ret) = Configure.executeShellCommand(petscNvcc + ' -v 2>&1 | grep "Found CUDA installation"',timeout = 60, log = self.log, threads = 1)
        self.cudaDir = out.split()[3].replace(',','')
      else:
        nvccDir = os.path.dirname(self.systemNvcc) # /path/bin
        d       = os.path.dirname(nvccDir)         # /path
        # d might be /to/Linux_x86_64/21.7/cuda or /to/Linux_x86_64/21.7/cuda/12.2, check if math_libs exist. If yes, we are using NVHPC
        if os.path.exists(os.path.join(d,'..','math_libs')) or os.path.exists(os.path.join(d,'..','..','math_libs')):
          self.isnvhpc = 1
        if os.path.exists(os.path.join(d,'include','cuda.h')): # CUDAToolkit with a structure /path/{bin/nvcc, include/cuda.h}
          self.cudaDir = d
        elif os.path.exists(os.path.normpath(os.path.join(d,'..','cuda','include','cuda.h'))): # could be NVHPC
          self.cudaDir = os.path.normpath(os.path.join(d,'..','cuda')) # get rid of .. in path, getting /path/Linux_x86_64/21.5/cuda

  def configureLibrary(self):
    '''Locate CUDA, determine the target arch, validate arch flags, and record make macros and defines'''
    import re

    self.setCudaDir()
    # skip this because it does not properly set self.lib and self.include if they have already been set
    if not self.found: config.package.Package.configureLibrary(self)
    self.checkNVCCDoubleAlign()
    self.configureTypes()
    # includes from --download-thrust should override the prepackaged version in cuda - so list thrust.include before cuda.include on the compile command.
    if self.thrust.found:
      self.log.write('Overriding the thrust library in CUDAToolkit with a user-specified one\n')
      self.include = self.thrust.include+self.include

    self.pushLanguage('CUDA')
    petscNvcc = self.getCompiler()
    self.popLanguage()

    # Handle CUDA arch
    if 'with-cuda-arch' in self.framework.argDB:
      self.cudaArch = self.argDB['with-cuda-arch']
    elif hasattr(self, 'cudaDir'):
      # try the NVIDIA deviceQuery demo utility shipped with the toolkit
      dq = os.path.join(self.cudaDir,'extras','demo_suite')
      self.getExecutable('deviceQuery',path = dq)
      if hasattr(self,'deviceQuery'):
        try:
          (out, err, ret) = Configure.executeShellCommand(self.deviceQuery + ' | grep "CUDA Capability"',timeout = 60, log = self.log, threads = 1)
        except Exception as e:
          self.log.write('NVIDIA utility deviceQuery failed '+str(e)+'\n')
        else:
          try:
            out = out.split('\n')[0]
            sm  = out[-3:] # e.g. '7.0' at the end of the Capability line
            self.cudaArch = str(int(10*float(sm)))
          except:
            self.log.write('Unable to parse the CUDA Capability output from the NVIDIA utility deviceQuery\n')

    if not hasattr(self,'cudaArch') and not self.argDB['with-batch']:
      # last resort: compile and run our own device query
      includes = '''#include <stdio.h>
                    #include <cuda_runtime.h>
                    #include <cuda_runtime_api.h>
                    #include <cuda_device_runtime_api.h>'''
      body = '''cudaError_t cerr;
                cudaDeviceProp dp;
                cerr = cudaGetDeviceProperties(&dp, 0);
                if (cerr) {
                #if (CUDART_VERSION >= 8000)
                  printf("Error calling cudaGetDeviceProperties with CUDA error %d (%s) : %s\\n", (int)cerr, cudaGetErrorName(cerr), cudaGetErrorString(cerr));
                #else
                  printf("Error calling cudaGetDeviceProperties with CUDA error %d\\n", (int)cerr);
                #endif
                }
                else printf("%d\\n",10*dp.major+dp.minor);
                return(cerr);'''
      self.pushLanguage('CUDA')
      try:
        (output,status) = self.outputRun(includes, body)
      except Exception as e:
        self.log.write('petsc-supplied CUDA device query test failed: '+str(e)+'\n')
        self.popLanguage()
      else:
        self.popLanguage()
        self.log.write('petsc-supplied CUDA device query test output: '+output+', status: '+str(status)+'\n')
        if not status:
          try:
            gen = int(output)
          except:
            pass
          else:
            self.log.write('petsc-supplied CUDA device query test found the CUDA Capability is '+str(gen)+'\n')
            self.cudaArch = str(gen)
    # Store min cuda arch at configure time for later error diagnosis
    if self.cudaArchIsVersionList():
      # compare numerically (key=int): string min() would rank '100' below '90'
      self.addDefine('PKG_CUDA_MIN_ARCH', min(self.cudaArchList(), key=int))

    # Check flags validity
    if hasattr(self,'cudaArch'):
      self.pushLanguage('CUDA')
      if self.cudaclang:
        self.setCompilers.CUDAFLAGS += self.clangArchFlags()
      else: # assuming nvcc
        self.setCompilers.CUDAFLAGS += self.nvccArchFlags()

      try:
        valid = self.checkCompile()
      except Exception as e:
        self.log.write('checkCompile on CUDA compile with gencode failed '+str(e)+'\n')
        self.popLanguage()
        valid = False
      else:
        self.log.write('Flag from checkCompile on CUDA compile with gencode '+str(valid)+'\n')
        self.popLanguage()

      if not valid:
        raise RuntimeError('CUDA compile failed with arch flags "'+self.setCompilers.CUDAFLAGS+'"'
                           ' generated from "--with-cuda-arch='+self.cudaArch+'"')

    self.addDefine('HAVE_CUPM','1') # Have either CUDA or HIP
    if not self.version_tuple:
      self.checkVersion() # set version_tuple
    if self.version_tuple[0] > 12 or (self.version_tuple[0] == 12 and self.version_tuple[1] >= 2):
      self.addDefine('HAVE_CUDA_VERSION_12_2PLUS','1')
    if self.version_tuple[0] >= 11:
      self.addDefine('HAVE_CUDA_VERSION_11PLUS','1')
    if self.cudaclang:
      self.addDefine('HAVE_CUDA_CLANG','1') # code compilation in aijdevice and landau is broken

    # determine the compiler used by nvcc
    # '-ccbin mpicxx' might be in by self.setCompilers.CUDAFLAGS
    if not self.cudaclang:
      (out, err, ret) = Configure.executeShellCommand(petscNvcc + ' ' + self.setCompilers.CUDAFLAGS + ' --dryrun dummy.cu 2>&1 | grep D__CUDACC__ | head -1 | cut -f2 -d" "')
      if out:
        # MPI.py adds its include paths and libraries to these lists and saves them again
        self.setCompilers.CUDA_CXX      = out
        self.setCompilers.CUDA_CXXFLAGS = ''
        self.setCompilers.CUDA_CXXLIBS  = ''
        self.logPrint('Determined the compiler nvcc uses is ' + out)
        self.logPrint('PETSc C compiler '+self.compilers.CC)
        self.logPrint('PETSc C++ compiler '+self.compilers.CXX)

        # TODO: How to handle MPI compiler wrapper as opposed to its underlying compiler
        if out == self.compilers.CXX:
          # nvcc will say it is using gcc as its compiler, it pass a flag when using to
          # treat it as a C++ compiler
          newFlags = self.setCompilers.CXXPPFLAGS.split()+self.setCompilers.CXXFLAGS.split()
          # need to remove the std flag from the list, nvcc will already have its own flag set
          # With IBM XL compilers, we also need to remove -+
          # Remove -O since the optimization level is already set by CUDAC_FLAGS, otherwise Kokkos nvcc_wrapper will complain
          # "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the last
          # is used because nvcc can only accept a single optimization setting."
          self.setCompilers.CUDA_CXXFLAGS = ' '.join([flg for flg in newFlags if not flg.startswith(('-std=c++','-std=gnu++','-+','-O'))])
        else:
          # only add any -I arguments since compiler arguments may not work
          flags = self.setCompilers.CPPFLAGS.split(' ')+self.setCompilers.CXXFLAGS.split(' ')
          for i in flags:
            if i.startswith('-I'):
              self.setCompilers.CUDA_CXXFLAGS += ' '+i
        # set compiler flags for compiler called by nvcc
        if self.setCompilers.CUDA_CXXFLAGS:
          self.addMakeMacro('CUDA_CXXFLAGS',self.setCompilers.CUDA_CXXFLAGS)
        else:
          self.logPrint('No CUDA_CXXFLAGS available')
        self.addMakeMacro('CUDA_CXX',self.setCompilers.CUDA_CXX)

        # Intel compiler environment breaks GNU compilers, fix it just enough to allow g++ to run
        if self.setCompilers.CUDA_CXX == 'gcc' and config.setCompilers.Configure.isIntel(self.compilers.CXX,self.log):
          self.logPrint('''Removing Intel's CPLUS_INCLUDE_PATH when using nvcc since it breaks g++''')
          self.delMakeMacro('CUDAC')
          self.addMakeMacro('CUDAC','CPLUS_INCLUDE_PATH="" '+petscNvcc)
      else:
        self.logPrint('nvcc --dryrun failed, unable to determine CUDA_CXX and CUDA_CXXFLAGS')

    if not self.cudaclang:
      self.addMakeMacro('CUDA_HOSTFLAGS','--compiler-options="$(CXXCPPFLAGS) $(CUDA_CXXFLAGS)"')
      self.addMakeMacro('CUDA_PETSC_GENDEPS','$(call quiet,CUDAC,.dep) --generate-dependencies --output-directory=$(@D) $(MPICXX_INCLUDES) $(CUDAC_FLAGS) --compiler-options="$(CXXCPPFLAGS) $(CUDA_CXXFLAGS)"')
    else:
      self.addMakeMacro('CUDA_HOSTFLAGS','$(CXXCPPFLAGS) $(CUDA_CXXFLAGS) $(CUDA_DEPFLAGS) $(PETSC_CC_INCLUDES)')
      self.addMakeMacro('CUDA_PETSC_GENDEPS','true')
    return

  def checkKnownBadCUDAHostCompilerCombo(self):
    """
    Check for nvcc + host compiler combinations that are unable to compile or have some other known
    defect and prints a warning to the user. Has no other effect.

    For example:
    1. CUDA 11.5 + gcc 11.3.0 produces

       /usr/include/c++/11/bits/std_function.h:435:145: error: parameter packs not expanded with '...':
       435 |         function(_Functor&& __f)
           |                                                                                                                                                 ^

    """
    if not self.argDB['with-cuda']:
      return

    assert self.version_tuple
    assert isinstance(self.version_tuple, tuple)
    assert isinstance(self.version_tuple[0], int)
    if self.version_tuple[:2] == (11, 5):
      # CUDA 11.5.X
      cxx = self.setCompilers.CXX
      if self.setCompilers.isGNU(cxx, self.log):
        output, _, _      = self.executeShellCommand(cxx + ' -dumpfullversion', log=self.log)
        gcc_version       = output.strip().split('.')
        gcc_version_tuple = tuple(map(int, gcc_version))
        if gcc_version_tuple[:3] == (11, 3, 0):
          mess = """
You appear to be using CUDA {} and GCC {}. If you get compile errors along the lines of:

/usr/include/c++/11/bits/std_function.h:435:145: error: parameter packs not expanded with '...':
435 |         function(_Functor&& __f)

This is a bug that crops up with exactly the combination of CUDA 11.5.X + GCC 11.3.0.
It is a bug in nvcc itself, and the file (C++ standard header <function>) originates from
within CUDA headers. There is no way to work around it in software.

Your only options are:
- Use a newer nvcc version
- Use an older gcc version
""".format('.'.join(map(str, self.version_tuple)), '.'.join(gcc_version)) # dotted strings, not a raw list repr
          self.logPrintWarning(mess)
    return

  def configure(self, *args, **kwargs):
    super().configure(*args, **kwargs)
    self.executeTest(self.checkKnownBadCUDAHostCompilerCombo)
    return