xref: /petsc/config/BuildSystem/config/packages/CUDA.py (revision 5fa70555f2cfa5f8527759fb2fd8b5523acdf153)
1import config.package
2import os
3
4class Configure(config.package.Package):
  def __init__(self, framework):
    '''Set up CUDA package metadata: version requirements, headers, libraries, and language settings.'''
    config.package.Package.__init__(self, framework)
    self.minversion        = '7.5'          # oldest CUDA toolkit version accepted
    self.versionname       = 'CUDA_VERSION' # macro (in cuda.h) that holds the version number
    self.versioninclude    = 'cuda.h'
    self.requiresversion   = 1
    self.functions         = ['cublasInit','cufftDestroy','nvmlInit_v2']
    self.includes          = ['cublas.h','cufft.h','cusparse.h','cusolverDn.h','curand.h','thrust/version.h','nvml.h']
    # runtime/tools libraries; the two alternatives cover different CUDA versions (nvtx3 vs. the older nvToolsExt)
    self.basicliblist      = [['libcudart.a','libnvtx3interop.a'],['libcudart.a','libnvToolsExt.a']]
    self.mathliblist       = [['libcufft.a', 'libcublas.a','libcusparse.a','libcusolver.a','libcurand.a']]
    # CUDA provides 2 variants of libcuda.so (for access to the CUDA driver API):
    # - fully functional compile and runtime libraries installed with the GPU driver
    #   (for ex:) /usr/lib64/libcuda.so (compile), libcuda.so.1 (runtime)
    # - stub library - usable only for compiles
    #   (for ex:) /usr/local/cuda/lib64/stubs/libcuda.so (without a corresponding libcuda.so.1 for runtime)
    # We prefer this stub library - as it enables compiles on non-GPU nodes (for ex: login nodes).
    # Using RPATH to this stub location is not appropriate - so it is skipped via libraries.rpathSkipDirs()
    # Note: PETSc does not use the CUDA driver API (as of Sep 29, 2021), but external packages, for ex: Kokkos, do.
    #
    # see more at https://stackoverflow.com/a/52784819
    self.stubliblist       = [['libcuda.so','libnvidia-ml.so']]
    self.liblist           = 'dummy' # existence of self.liblist is used by package.py to determine if --with-cuda-lib must be provided
    self.precisions        = ['single','double']
    self.buildLanguages    = ['CUDA']
    self.functionsDefine   = ['cusolverDnDpotri'] # optional function; a HAVE_ define is emitted when present
    self.isnvhpc           = 0 # set to 1 by setCudaDir() when the toolkit comes from an NVHPC distribution
    self.devicePackage     = 1
    self.skipMPIDependency = 1
    return
34
  def setupHelp(self, help):
    '''Register CUDA-specific command line arguments (currently only -with-cuda-arch).'''
    import nargs
    config.package.Package.setupHelp(self, help)
    help.addArgument(
      'CUDA', '-with-cuda-arch',
      nargs.ArgString(
        None, None,
        'Cuda architecture for code generation, for example 70 (this may be used by external '
        'packages). A comma-separated list can be passed to target multiple architectures (e.g. '
        'for distribution). When using the nvcc compiler, other possible options include "all", '
        '"all-major", and "native" (see documentation of the nvcc "--gpu-architecture" flag)'
      )
    )
    return
49
50  def __str__(self):
51    output  = config.package.Package.__str__(self)
52    if hasattr(self,'cudaArch'):
53      output += '  CUDA SM '+self.cudaArch+'\n'
54    if hasattr(self.setCompilers,'CUDA_CXX'):
55      output += '  CUDA underlying compiler: CUDA_CXX=' + self.setCompilers.CUDA_CXX + '\n'
56    if hasattr(self.setCompilers,'CUDA_CXXFLAGS'):
57      output += '  CUDA underlying compiler flags: CUDA_CXXFLAGS=' + self.setCompilers.CUDA_CXXFLAGS + '\n'
58    if hasattr(self.setCompilers,'CUDA_CXXLIBS'):
59      output += '  CUDA underlying linker libraries: CUDA_CXXLIBS=' + self.setCompilers.CUDA_CXXLIBS + '\n'
60    return output
61
62  def cudaArchIsVersionList(self):
63    "whether the CUDA arch is a list of version numbers (vs a string like 'all')"
64    try:
65      self.cudaArchList()
66    except RuntimeError:
67      return False
68    else:
69      return True
70
71  def cudaArchList(self):
72    '''
73    a list of the given cuda arch numbers.
74    raises RuntimeError if cuda arch is not a list of version numbers
75    '''
76    if not hasattr(self,'cudaArch'):
77      raise RuntimeError('cudaArch is not set') from None
78
79    arch_list = self.cudaArch.split(',')
80
81    try:
82      for v in arch_list:
83        int(v)
84    except ValueError as e:
85      msg = 'only explicit cuda arch version numbers supported for this package '
86      msg += '(got "'+self.cudaArch+'")'
87      raise RuntimeError(msg) from None
88
89    return arch_list
90
91  def cudaArchSingle(self):
92    '''
93    Returns the single given CUDA arch, or raises RuntimeError if something else was specified
94    (like a list of numbers or "all")
95    '''
96    arch_list = self.cudaArchList()
97    if len(arch_list) > 1:
98      raise RuntimeError('this package can only be compiled to target a single explicit CUDA arch '
99                         'version (got "'+self.cudaArch+'")')
100    return arch_list[0]
101
102  def nvccArchFlags(self):
103    if not self.cudaArchIsVersionList():
104      return ' -arch='+self.cudaArch
105
106    if self.setCompilers.isCygwin(self.log):
107      arg_sep = '='
108    else:
109      arg_sep = ' '
110
111    # generate both SASS and PTX for the arch, see https://stackoverflow.com/a/35657430/3447299
112    # e.g., '-arch=sm_50' is equivalent to '-arch=compute_50 -code=sm_50,compute_50'.
113    return ''.join(' -arch=sm_'+gen for gen in self.cudaArchList())
114
115  def clangArchFlags(self):
116    if not self.cudaArchIsVersionList():
117      raise RuntimeError('clang only supports cuda archs specified as version number(s) (got "'+self.cudaArch+'")')
118    return ''.join(' --cuda-gpu-arch=sm_'+gen for gen in self.cudaArchList())
119
120  def getCmakeCUDAArchFlag(self):
121    # CMake supports 'all', 'all-major', 'native', and a semicolon-separated list of numbers
122    if hasattr(self,'cudaArch'):
123      return ['-DCMAKE_CUDA_ARCHITECTURES:STRING="{}"'.format(self.cudaArch.replace(',', ';'))]
124    else:
125      return []
126
  def setupDependencies(self, framework):
    '''Declare the configure objects CUDA depends on.'''
    config.package.Package.setupDependencies(self, framework)
    self.scalarTypes  = framework.require('PETSc.options.scalarTypes',self) # real vs. complex, used by configureTypes()
    self.compilers    = framework.require('config.compilers',self)
    self.thrust       = framework.require('config.packages.Thrust',self)
    self.libraries    = framework.require('config.libraries', self)
    self.odeps        = [self.thrust] # if user supplies thrust, install it first
    return
135
136  def getSearchDirectories(self):
137    if hasattr(self, 'cudaDir'):
138      yield self.cudaDir
139    for i in config.package.Package.getSearchDirectories(self): yield i
140    return
141
142  def getIncludeDirs(self, prefix, includeDir):
143    ''' Generate cuda include dirs'''
144    # See comments below in generateLibList() for different prefix formats.
145    # format A, prefix = /path/cuda-11.4.0/, includeDir = 'include'. The superclass's method handles this well.
146    incDirs = config.package.Package.getIncludeDirs(self, prefix, includeDir)
147
148    if not isinstance(incDirs, list):
149      incDirs = [incDirs]
150
151    # format B and C, prefix = /path/nvhpc/Linux_x86_64/21.7/compilers or  /path/nvhpc/Linux_x86_64/21.7/cuda
152    nvhpcDir        = os.path.dirname(prefix) # /path/nvhpc/Linux_x86_64/21.7
153    nvhpcCudaIncDir = os.path.join(nvhpcDir,'cuda','include')
154    nvhpcMathIncDir = os.path.join(nvhpcDir,'math_libs','include')
155    if os.path.isdir(nvhpcCudaIncDir) and os.path.isdir(nvhpcMathIncDir):
156      incDirs.extend([nvhpcCudaIncDir,nvhpcMathIncDir])
157
158    # format D, prefix = /path/nvhpc/Linux_x86_64/21.7/cuda/11.4
159    nvhpcDir           = os.path.dirname(os.path.dirname(prefix))  # /path/nvhpc/Linux_x86_64/21.7
160    ver                = os.path.basename(prefix) # 11.4
161    nvhpcCudaVerIncDir = os.path.join(nvhpcDir,'cuda',ver,'include')
162    nvhpcMathVerIncDir = os.path.join(nvhpcDir,'math_libs',ver,'include')
163    if os.path.isdir(nvhpcCudaVerIncDir) and os.path.isdir(nvhpcMathVerIncDir):
164      incDirs.extend([nvhpcCudaVerIncDir,nvhpcMathVerIncDir])
165    return incDirs
166
167  def fixWinLib(liblist):
168    # libfoo.a -> foo.lib
169    winliblist = []
170    for lib in liblist:
171      winliblist.append(lib[3:-1]+'lib')
172      return winliblist
173
174  def generateLibList(self, directory):
175    ''' Generate cuda liblist. The difficulty comes from that cuda can be in different directory structures through system, CUDAToolkit or NVHPC'''
176
177    if self.setCompilers.isCygwin(self.log):
178      self.basicliblist = [fixWinLib(lib) for lib in self.basicliblist]
179      self.mathliblist = [fixWinLib(lib) for lib in self.mathliblist]
180      self.stubliblist = [fixWinLib(lib) for lib in self.stubliblist]
181
182    # 1) From system installation (ex. Ubuntu 21.10), all libraries are on the compiler (nvcc)'s default search paths
183    #   /usr/bin/nvcc
184    #   /usr/include
185    #   /usr/lib/x86_64-linux-gnu/{libcudart.so,..,libcuda.so,..,stubs}.  See https://wiki.ubuntu.com/MultiarchSpec for info on this new directory structure
186    #
187    # 2) CUDAToolkit, with a directory structure like
188    #   /path/cuda-11.4.0/{lib64, lib64/stubs}, here lib64/ contains all basic and math libraries
189    #                   +/include
190    #                   +/bin/{nvcc,..}
191    #
192    # 3) NVHPC, with a directory structure like
193    # /path/nvhpc/Linux_x86_64/21.7/compilers/bin/{nvcc,nvc,nvc++}
194    #                             +/cuda/{include,bin/nvcc,lib64,lib64/stubs}, just symbol links to what in cuda/11.4
195    #                             +/cuda/11.4/{include,bin/nvcc,lib64,lib64/stubs}
196    #                             +/math_libs/{include,lib64,lib64/stubs}, just symbol links to what in math_libs/11.4
197    #                             +/math_libs/11.4/{include,lib64,lib64/stubs}
198    #                             +/comm_libs/mpi/bin/{mpicc,mpicxx,mpifort}
199    #
200    # The input argument 'directory' could be in these formats:
201    # 0) ''                                             We are checking if the compiler by default supports the libraries
202    # A) /path/cuda-11.4.0/lib64,                       by loading a CUDAToolkit or --with-cuda-dir=/path/cuda-11.4.0
203    # B) /path/nvhpc/Linux_x86_64/21.7/compilers/lib64, by loading a NVHPC module
204    # C) /path/nvhpc/Linux_x86_64/21.7/cuda/lib64,      by --with-cuda-dir=/path/Linux_x86_64/21.7/cuda/
205    # D) /path/nvhpc/Linux_x86_64/21.7/cuda/11.4/lib64, by --with-cuda-dir=/path/Linux_x86_64/21.7/cuda/11.4
206
207    # directory is None (''). Test if the compiler by default supports all libraries including the stub
208    if not directory and not self.isnvhpc:
209      self.liblist = [basicliblist+mathliblist+stubliblist for basicliblist in self.basicliblist for mathliblist in self.mathliblist for stubliblist in self.stubliblist]
210      liblist      = config.package.Package.generateLibList(self, directory)
211      return liblist
212
213    # 'directory' is in format A, with basic and math libraries in one directory.
214    liblist           = [] # initialize
215    if not self.isnvhpc:
216      toolkitCudaLibDir = directory
217      toolkitStubLibDir = os.path.join(toolkitCudaLibDir,'stubs')
218      if os.path.isdir(toolkitCudaLibDir) and os.path.isdir(toolkitStubLibDir):
219        self.libraries.addRpathSkipDir(toolkitStubLibDir)
220        self.liblist = [basicliblist+mathliblist for basicliblist in self.basicliblist for mathliblist in self.mathliblist]
221        cudaliblist  = config.package.Package.generateLibList(self, toolkitCudaLibDir)
222        self.liblist = self.stubliblist
223        stubliblist  = config.package.Package.generateLibList(self,toolkitStubLibDir)
224        liblist      = [cudalib+stublib for cudalib in cudaliblist for stublib in stubliblist]
225
226    # 'directory' is in format B or C, and we peel 'directory' two times.
227    nvhpcDir        = os.path.dirname(os.path.dirname(directory)) # /path/nvhpc/Linux_x86_64/21.7
228    nvhpcCudaLibDir = os.path.join(nvhpcDir,'cuda','lib64')
229    nvhpcMathLibDir = os.path.join(nvhpcDir,'math_libs','lib64')
230    nvhpcStubLibDir = os.path.join(nvhpcDir,'cuda','lib64','stubs')
231    if os.path.isdir(nvhpcCudaLibDir) and os.path.isdir(nvhpcMathLibDir) and os.path.isdir(nvhpcStubLibDir):
232      self.libraries.addRpathSkipDir(nvhpcStubLibDir)
233      self.liblist = self.basicliblist
234      basicliblist  = config.package.Package.generateLibList(self, nvhpcCudaLibDir)
235      self.liblist = self.mathliblist
236      mathliblist  = config.package.Package.generateLibList(self, nvhpcMathLibDir)
237      self.liblist = self.stubliblist
238      stubliblist  = config.package.Package.generateLibList(self, nvhpcStubLibDir)
239      liblist += [basiclib+mathlib+stublib for basiclib in basicliblist for mathlib in mathliblist for stublib in stubliblist]
240      self.math_libs_dir = os.path.join(nvhpcDir,'math_libs') # might be used by Kokkos-Kernels
241
242    # 'directory' is in format D, and we peel 'directory' three times.
243    # We preserve the version info in case a NVHPC installation provides multiple cuda versions and we'd like to respect user's choice
244    nvhpcDir           = os.path.dirname(os.path.dirname(os.path.dirname(directory))) # /path/nvhpc/Linux_x86_64/21.7
245    ver                = os.path.basename(os.path.dirname(directory)) # 11.4
246    nvhpcCudaVerLibDir = os.path.join(nvhpcDir,'cuda',ver,'lib64')
247    nvhpcMathVerLibDir = os.path.join(nvhpcDir,'math_libs',ver,'lib64')
248    nvhpcStubVerLibDir = os.path.join(nvhpcDir,'cuda',ver,'lib64','stubs')
249    if os.path.isdir(nvhpcCudaVerLibDir) and os.path.isdir(nvhpcMathVerLibDir) and os.path.isdir(nvhpcStubVerLibDir):
250      self.libraries.addRpathSkipDir(nvhpcStubVerLibDir)
251      self.liblist = self.basicliblist
252      basicliblist  = config.package.Package.generateLibList(self, nvhpcCudaVerLibDir)
253      self.liblist = self.mathliblist
254      mathliblist  = config.package.Package.generateLibList(self, nvhpcMathVerLibDir)
255      self.liblist = self.stubliblist
256      stubliblist  = config.package.Package.generateLibList(self, nvhpcStubVerLibDir)
257      liblist += [basiclib+mathlib+stublib for basiclib in basicliblist for mathlib in mathliblist for stublib in stubliblist]
258      self.math_libs_dir = os.path.join(nvhpcDir,'math_libs',ver)
259    return liblist
260
261  def checkSizeofVoidP(self):
262    '''Checks if the CUDA compiler agrees with the C compiler on what size of void * should be'''
263    self.log.write('Checking if sizeof(void*) in CUDA is the same as with regular compiler\n')
264    size = self.types.checkSizeof('void *', (8, 4), lang='CUDA', save=False)
265    if size != self.types.sizes['void-p']:
266      raise RuntimeError('CUDA Error: sizeof(void*) with CUDA compiler is ' + str(size) + ' which differs from sizeof(void*) with C compiler')
267    return
268
269  def checkThrustVersion(self,minVer):
270    '''Check if thrust version is >= minVer '''
271    include = '#include <thrust/version.h> \n#if THRUST_VERSION < ' + str(minVer) + '\n#error "thrust version is too low"\n#endif\n'
272    self.pushLanguage('CUDA')
273    valid = self.checkCompile(include)
274    self.popLanguage()
275    return valid
276
277  def configureTypes(self):
278    import config.setCompilers
279    if not self.getDefaultPrecision() in ['double', 'single']:
280      raise RuntimeError('Must use either single or double precision with CUDA')
281    self.checkSizeofVoidP()
282    # if no user-supplied thrust, check the system's complex ability
283    if not self.thrust.found and self.scalarTypes.scalartype == 'complex':
284      if not self.checkThrustVersion(100908):
285        raise RuntimeError('CUDA Error: The thrust library is too low to support PetscComplex. Use --download-thrust or --with-thrust-dir to give a thrust >= 1.9.8')
286    return
287
288  def versionToStandardForm(self,ver):
289    '''Converts from CUDA 7050 notation to standard notation 7.5'''
290    return ".".join(map(str,[int(ver)//1000, int(ver)//10%10]))
291
292  def checkNVCCDoubleAlign(self):
293    if 'known-cuda-align-double' in self.argDB:
294      if not self.argDB['known-cuda-align-double']:
295        raise RuntimeError('CUDA error: PETSc currently requires that CUDA double alignment match the C compiler')
296    else:
297      typedef = 'typedef struct {double a; int b;} teststruct;\n'
298      cuda_size = self.types.checkSizeof('teststruct', (16, 12), lang='CUDA', codeBegin=typedef, save=False)
299      c_size = self.types.checkSizeof('teststruct', (16, 12), lang='C', codeBegin=typedef, save=False)
300      if c_size != cuda_size:
301        raise RuntimeError('CUDA compiler error: memory alignment doesn\'t match C compiler (try adding -malign-double to compiler options)')
302    return
303
  def setCudaDir(self):
    '''Detect a conventional CUDA installation directory and record it as self.cudaDir.

    Also sets self.cudaclang (whether the CUDA compiler is clang) and self.isnvhpc
    (whether the toolkit appears to come from an NVHPC distribution). Leaves
    self.cudaDir unset when no conventional layout is found.
    '''
    import os
    self.pushLanguage('CUDA')
    petscNvcc = self.getCompiler()
    self.cudaclang = self.setCompilers.isClang(petscNvcc, self.log)
    self.popLanguage()

    # The presence of the cudaDir attribute means that PETSc has detected a conventional installation of CUDA.
    # This seems to be needed by some external packages that can build against it.
    # PETSc can be built when the various components (cudaruntime, cublas, etc) are scattered in different locations,
    # like in the case of NVIDIA packages from pip.
    if 'with-cuda-dir' in self.argDB and os.path.exists(os.path.join(self.argDB['with-cuda-dir'],'include','cuda.h')):
      self.cudaDir = self.argDB['with-cuda-dir']
    if self.setCompilers.isCygwin(self.log):  # Handle win32fe nvcc as the compiler name
      # presumably petscNvcc looks like 'win32fe nvcc' here; keep only the real compiler name
      petscNvcc = petscNvcc.split(' ')[1]

    self.getExecutable(petscNvcc,getFullPath=1,resultName='systemNvcc')
    if hasattr(self,'systemNvcc') and not hasattr(self, 'cudaDir'):
      if self.cudaclang:
        # clang reports the toolkit location in its verbose output, e.g. 'Found CUDA installation: /usr/local/cuda, ...'
        (out, err, ret) = Configure.executeShellCommand(petscNvcc + ' -v 2>&1 | grep "Found CUDA installation"',timeout = 60, log = self.log, threads = 1)
        self.cudaDir = out.split()[3].replace(',','')
      else:
        nvccDir = os.path.dirname(self.systemNvcc) # /path/bin
        d = os.path.dirname(nvccDir) # /path
        # d might be /to/Linux_x86_64/21.7/cuda or /to/Linux_x86_64/21.7/cuda/12.2, check if math_libs exist. If yes, we are using NVHPC
        if os.path.exists(os.path.join(d,'..','math_libs')) or os.path.exists(os.path.join(d,'..','..','math_libs')):
          self.isnvhpc = 1
        if os.path.exists(os.path.join(d,'include','cuda.h')): # CUDAToolkit with a structure /path/{bin/nvcc, include/cuda.h}
          self.cudaDir = d
        elif os.path.exists(os.path.normpath(os.path.join(d,'..','cuda','include','cuda.h'))): # could be NVHPC
          self.cudaDir = os.path.normpath(os.path.join(d,'..','cuda')) # get rid of .. in path, getting /path/Linux_x86_64/21.5/cuda
335
336  def configureLibrary(self):
337    import re
338
339    self.setCudaDir()
340    # skip this because it does not properly set self.lib and self.include if they have already been set
341    if not self.found: config.package.Package.configureLibrary(self)
342    self.checkNVCCDoubleAlign()
343    self.configureTypes()
344    # includes from --download-thrust should override the prepackaged version in cuda - so list thrust.include before cuda.include on the compile command.
345    if self.thrust.found:
346      self.log.write('Overriding the thrust library in CUDAToolkit with a user-specified one\n')
347      self.include = self.thrust.include+self.include
348
349    self.pushLanguage('CUDA')
350    petscNvcc = self.getCompiler()
351    self.popLanguage()
352
353    # Handle CUDA arch
354    if 'with-cuda-arch' in self.framework.argDB:
355      self.cudaArch = self.argDB['with-cuda-arch']
356    elif hasattr(self, 'cudaDir'):
357      dq = os.path.join(self.cudaDir,'extras','demo_suite')
358      self.getExecutable('deviceQuery',path = dq)
359      if hasattr(self,'deviceQuery'):
360        try:
361          (out, err, ret) = Configure.executeShellCommand(self.deviceQuery + ' | grep "CUDA Capability"',timeout = 60, log = self.log, threads = 1)
362        except Exception as e:
363          self.log.write('NVIDIA utility deviceQuery failed '+str(e)+'\n')
364        else:
365          try:
366            out = out.split('\n')[0]
367            sm = out[-3:]
368            self.cudaArch = str(int(10*float(sm)))
369          except:
370            self.log.write('Unable to parse the CUDA Capability output from the NVIDIA utility deviceQuery\n')
371
372    if not hasattr(self,'cudaArch') and not self.argDB['with-batch']:
373        includes = '''#include <stdio.h>
374                    #include <cuda_runtime.h>
375                    #include <cuda_runtime_api.h>
376                    #include <cuda_device_runtime_api.h>'''
377        body = '''cudaError_t cerr;
378                cudaDeviceProp dp;
379                cerr = cudaGetDeviceProperties(&dp, 0);
380                if (cerr) {
381              #if (CUDART_VERSION >= 8000)
382                  printf("Error calling cudaGetDeviceProperties with CUDA error %d (%s) : %s\\n", (int)cerr, cudaGetErrorName(cerr), cudaGetErrorString(cerr));
383              #else
384                  printf("Error calling cudaGetDeviceProperties with CUDA error %d\\n", (int)cerr);
385              #endif
386                }
387                else printf("%d\\n",10*dp.major+dp.minor);
388                return(cerr);'''
389        self.pushLanguage('CUDA')
390        try:
391          (output,status) = self.outputRun(includes, body)
392        except Exception as e:
393          self.log.write('petsc-supplied CUDA device query test failed: '+str(e)+'\n')
394          self.popLanguage()
395        else:
396          self.popLanguage()
397          self.log.write('petsc-supplied CUDA device query test output: '+output+', status: '+str(status)+'\n')
398          if not status:
399            try:
400              gen = int(output)
401            except:
402              pass
403            else:
404              self.log.write('petsc-supplied CUDA device query test found the CUDA Capability is '+str(gen)+'\n')
405              self.cudaArch = str(gen)
406    # Store min cuda arch at configure time for later error diagnosis
407    if self.cudaArchIsVersionList():
408      self.addDefine('PKG_CUDA_MIN_ARCH', min(self.cudaArchList()))
409
410    # Check flags validity
411    if hasattr(self,'cudaArch'):
412      self.pushLanguage('CUDA')
413      if self.cudaclang:
414        self.setCompilers.CUDAFLAGS += self.clangArchFlags()
415      else: # assuming nvcc
416        self.setCompilers.CUDAFLAGS += self.nvccArchFlags()
417
418      try:
419        valid = self.checkCompile()
420      except Exception as e:
421        self.log.write('checkCompile on CUDA compile with gencode failed '+str(e)+'\n')
422        self.popLanguage()
423        valid = False
424      else:
425        self.log.write('Flag from checkCompile on CUDA compile with gencode '+str(valid)+'\n')
426        self.popLanguage()
427
428      if not valid:
429        raise RuntimeError('CUDA compile failed with arch flags "'+self.setCompilers.CUDAFLAGS+'"'
430                           ' generated from "--with-cuda-arch='+self.cudaArch+'"')
431
432    self.addDefine('HAVE_CUPM','1') # Have either CUDA or HIP
433    if not self.version_tuple:
434      self.checkVersion(); # set version_tuple
435    if self.version_tuple[0] > 12 or (self.version_tuple[0] == 12 and self.version_tuple[1] >= 2):
436      self.addDefine('HAVE_CUDA_VERSION_12_2PLUS','1')
437    if self.version_tuple[0] >= 11:
438      self.addDefine('HAVE_CUDA_VERSION_11PLUS','1')
439    if self.cudaclang:
440      self.addDefine('HAVE_CUDA_CLANG','1') # code compilation in aijdevice and landau is broken
441
442    # determine the compiler used by nvcc
443    # '-ccbin mpicxx' might be in by self.setCompilers.CUDAFLAGS
444    if not self.cudaclang:
445      (out, err, ret) = Configure.executeShellCommand(petscNvcc + ' ' + self.setCompilers.CUDAFLAGS + ' --dryrun dummy.cu 2>&1 | grep D__CUDACC__ | head -1 | cut -f2 -d" "')
446      if out:
447        # MPI.py adds its include paths and libraries to these lists and saves them again
448        self.setCompilers.CUDA_CXX = out
449        self.setCompilers.CUDA_CXXFLAGS = ''
450        self.setCompilers.CUDA_CXXLIBS = ''
451        self.logPrint('Determined the compiler nvcc uses is ' + out);
452        self.logPrint('PETSc C compiler '+self.compilers.CC)
453        self.logPrint('PETSc C++ compiler '+self.compilers.CXX)
454
455        # TODO: How to handle MPI compiler wrapper as opposed to its underlying compiler
456        if out == self.compilers.CXX:
457          # nvcc will say it is using gcc as its compiler, it pass a flag when using to
458          # treat it as a C++ compiler
459          newFlags = self.setCompilers.CXXPPFLAGS.split()+self.setCompilers.CXXFLAGS.split()
460          # need to remove the std flag from the list, nvcc will already have its own flag set
461          # With IBM XL compilers, we also need to remove -+
462          # Remove -O since the optimization level is already set by CUDAC_FLAGS, otherwise Kokkos nvcc_wrapper will complain
463          #   "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the last
464          #    is used because nvcc can only accept a single optimization setting."
465          self.setCompilers.CUDA_CXXFLAGS = ' '.join([flg for flg in newFlags if not flg.startswith(('-std=c++','-std=gnu++','-+','-O'))])
466        else:
467          # only add any -I arguments since compiler arguments may not work
468          flags = self.setCompilers.CPPFLAGS.split(' ')+self.setCompilers.CXXFLAGS.split(' ')
469          for i in flags:
470            if i.startswith('-I'):
471              self.setCompilers.CUDA_CXXFLAGS += ' '+i
472        # set compiler flags for compiler called by nvcc
473        if self.setCompilers.CUDA_CXXFLAGS:
474          self.addMakeMacro('CUDA_CXXFLAGS',self.setCompilers.CUDA_CXXFLAGS)
475        else:
476          self.logPrint('No CUDA_CXXFLAGS available')
477        self.addMakeMacro('CUDA_CXX',self.setCompilers.CUDA_CXX)
478
479        # Intel compiler environment breaks GNU compilers, fix it just enough to allow g++ to run
480        if self.setCompilers.CUDA_CXX == 'gcc' and config.setCompilers.Configure.isIntel(self.compilers.CXX,self.log):
481          self.logPrint('''Removing Intel's CPLUS_INCLUDE_PATH when using nvcc since it breaks g++''')
482          self.delMakeMacro('CUDAC')
483          self.addMakeMacro('CUDAC','CPLUS_INCLUDE_PATH="" '+petscNvcc)
484      else:
485        self.logPrint('nvcc --dryrun failed, unable to determine CUDA_CXX and CUDA_CXXFLAGS')
486
487    if not self.cudaclang:
488      self.addMakeMacro('CUDA_HOSTFLAGS','--compiler-options="$(CXXCPPFLAGS) $(CUDA_CXXFLAGS)"')
489      self.addMakeMacro('CUDA_PETSC_GENDEPS','$(call quiet,CUDAC,.dep) --generate-dependencies --output-directory=$(@D) $(MPICXX_INCLUDES) $(CUDAC_FLAGS) --compiler-options="$(CXXCPPFLAGS) $(CUDA_CXXFLAGS)"')
490    else:
491      self.addMakeMacro('CUDA_HOSTFLAGS','$(CXXCPPFLAGS) $(CUDA_CXXFLAGS) $(CUDA_DEPFLAGS) $(PETSC_CC_INCLUDES)')
492      self.addMakeMacro('CUDA_PETSC_GENDEPS','true')
493    return
494
495  def checkKnownBadCUDAHostCompilerCombo(self):
496    """
497    Check for nvcc + host compiler combinations that are unable to compile or have some other known
498    defect and prints a warning to the user. Has no other effect.
499
500    For example:
501    1. CUDA 11.5 + gcc 11.3.0 produces
502
503    /usr/include/c++/11/bits/std_function.h:435:145: error: parameter packs not expanded with '...':
504  435 |         function(_Functor&& __f)
505      |         ^
506
507    """
508    if not self.argDB['with-cuda']:
509      return
510
511    assert self.version_tuple
512    assert isinstance(self.version_tuple, tuple)
513    assert isinstance(self.version_tuple[0], int)
514    if self.version_tuple[:2] == (11, 5):
515      # CUDA 11.5.X
516      cxx = self.setCompilers.CXX
517      if self.setCompilers.isGNU(cxx, self.log):
518        output, _, _      = self.executeShellCommand(cxx + ' -dumpfullversion', log=self.log)
519        gcc_version       = output.strip().split('.')
520        gcc_version_tuple = tuple(map(int, gcc_version))
521        if gcc_version_tuple[:3] == (11, 3, 0):
522          mess = """
523          You appear to be using CUDA {} and GCC {}. If you get compile errors along the lines of:
524
525          /usr/include/c++/11/bits/std_function.h:435:145: error: parameter packs not expanded with '...':
526          435 |         function(_Functor&& __f)
527
528          This is a bug that crops up with exactly the combination of CUDA 11.5.X + GCC 11.3.0.
529          It is a bug in nvcc itself, and the file (C++ standard header <function>) originates from
530          within CUDA headers. There is no way to work around it in software.
531
532          Your only options are:
533          - Use a newer nvcc version
534          - Use an older gcc version
535          """.format('.'.join(map(str, self.version_tuple)), gcc_version)
536          self.logPrintWarning(mess)
537    return
538
  def configure(self, *args, **kwargs):
    '''Run the standard package configure, then check for known-bad nvcc + host compiler combinations.'''
    super().configure(*args, **kwargs)
    self.executeTest(self.checkKnownBadCUDAHostCompilerCombo)
    return
543