import config.package
import os

class Configure(config.package.Package):
  def __init__(self, framework):
    '''Declare the static description of the CUDA package (minimum version, probe functions, headers and candidate library lists) after the generic Package initialisation.'''
    config.package.Package.__init__(self, framework)
    # NOTE(review): the version attributes below are presumably consumed by the generic version check of config.package.Package - confirm there
    self.minversion        = '7.5'
    self.versionname       = 'CUDA_VERSION'
    self.versioninclude    = 'cuda.h'
    self.requiresversion   = 1
    # Symbols and headers that identify a usable installation (cuBLAS, cuFFT, NVML, ...)
    self.functions         = ['cublasInit','cufftDestroy','nvmlInit_v2']
    self.includes          = ['cublas.h','cufft.h','cusparse.h','cusolverDn.h','curand.h','thrust/version.h','nvml.h']
    # Alternative sets of runtime libraries; generateLibList() combines each alternative with the math and stub lists
    self.basicliblist      = [['libcudart.a','libnvtx3interop.a'],['libcudart.a','libnvToolsExt.a']]
    self.mathliblist       = [['libcufft.a', 'libcublas.a','libcusparse.a','libcusolver.a','libcurand.a']]
    # CUDA provides 2 variants of libcuda.so (for access to the CUDA driver API):
    # - a fully functional library for compile and runtime, installed with the GPU driver
    #   (for example) /usr/lib64/libcuda.so (compile), libcuda.so.1 (runtime)
    # - a stub library, usable only for compiles
    #   (for example) /usr/local/cuda/lib64/stubs/libcuda.so (without a corresponding libcuda.so.1 for runtime)
    # We prefer the stub library, as it enables compiles on non-GPU nodes (for example login nodes).
    # Using RPATH to the stub location is not appropriate, so it is skipped via libraries.addRpathSkipDir()
    # Note: PETSc does not use the CUDA driver API (as of Sep 29, 2021), but external packages, for example Kokkos, do.
    #
    # see more at https://stackoverflow.com/a/52784819
    self.stubliblist       = [['libcuda.so','libnvidia-ml.so']]
    self.liblist           = 'dummy' # existence of self.liblist is used by package.py to determine if --with-cuda-lib must be provided
    self.precisions        = ['single','double']
    self.buildLanguages    = ['CUDA']
    self.functionsDefine   = ['cusolverDnDpotri']
    # Set to 1 by setCudaDir() when a math_libs directory is found next to the nvcc installation (NVHPC layout)
    self.isnvhpc           = 0
    self.devicePackage     = 1
    self.skipMPIDependency = 1
    return

  def setupHelp(self, help):
    '''Register the generic package options, then the CUDA-specific -with-cuda-arch option.'''
    import nargs
    config.package.Package.setupHelp(self, help)
    archDescription = (
      'Cuda architecture for code generation, for example 70 (this may be used by external '
      'packages). A comma-separated list can be passed to target multiple architectures (e.g. '
      'for distribution). When using the nvcc compiler, other possible options include "all", '
      '"all-major", and "native" (see documentation of the nvcc "--gpu-architecture" flag)'
    )
    archArgument = nargs.ArgString(None, None, archDescription)
    help.addArgument('CUDA', '-with-cuda-arch', archArgument)
    return

  def __str__(self):
    '''Return the generic package summary followed by the CUDA arch and, when known, the host compiler settings used by the CUDA compiler.'''
    pieces = [config.package.Package.__str__(self)]
    if hasattr(self,'cudaArch'):
      pieces.append('  CUDA SM '+self.cudaArch+'\n')
    # (attribute on setCompilers, label printed in front of its value)
    hostCompilerLines = (
      ('CUDA_CXX',      '  CUDA underlying compiler: CUDA_CXX='),
      ('CUDA_CXXFLAGS', '  CUDA underlying compiler flags: CUDA_CXXFLAGS='),
      ('CUDA_CXXLIBS',  '  CUDA underlying linker libraries: CUDA_CXXLIBS='),
    )
    for attribute, label in hostCompilerLines:
      if hasattr(self.setCompilers, attribute):
        pieces.append(label + getattr(self.setCompilers, attribute) + '\n')
    return ''.join(pieces)

  def cudaArchIsVersionList(self):
    "whether the CUDA arch is a list of version numbers (vs a string like 'all')"
    try:
      self.cudaArchList()
    except RuntimeError:
      return False
    else:
      return True

  def cudaArchList(self):
    '''
    a list of the given cuda arch numbers.
    raises RuntimeError if cuda arch is not a list of version numbers
    '''
    if not hasattr(self,'cudaArch'):
      raise RuntimeError('cudaArch is not set') from None

    arch_list = self.cudaArch.split(',')

    try:
      for v in arch_list:
        int(v)
    except ValueError as e:
      msg = 'only explicit cuda arch version numbers supported for this package '
      msg += '(got "'+self.cudaArch+'")'
      raise RuntimeError(msg) from None

    return arch_list

  def cudaArchSingle(self):
    '''
    Returns the single given CUDA arch, or raises RuntimeError if something else was specified
    (like a list of numbers or "all")
    '''
    arch_list = self.cudaArchList()
    if len(arch_list) > 1:
      raise RuntimeError('this package can only be compiled to target a single explicit CUDA arch '
                         'version (got "'+self.cudaArch+'")')
    return arch_list[0]

  def nvccArchFlags(self):
    if not self.cudaArchIsVersionList():
      return ' -arch='+self.cudaArch

    if self.setCompilers.isCygwin(self.log):
      arg_sep = '='
    else:
      arg_sep = ' '

    # generate both SASS and PTX for the arch, see https://stackoverflow.com/a/35657430/3447299
    # e.g., '-arch=sm_50' is equivalent to '-arch=compute_50 -code=sm_50,compute_50'.
    return ''.join(' -arch=sm_'+gen for gen in self.cudaArchList())

  def clangArchFlags(self):
    if not self.cudaArchIsVersionList():
      raise RuntimeError('clang only supports cuda archs specified as version number(s) (got "'+self.cudaArch+'")')
    return ''.join(' --cuda-gpu-arch=sm_'+gen for gen in self.cudaArchList())

  def getCmakeCUDAArchFlag(self):
    # CMake supports 'all', 'all-major', 'native', and a semicolon-separated list of numbers
    if hasattr(self,'cudaArch'):
      return ['-DCMAKE_CUDA_ARCHITECTURES:STRING="{}"'.format(self.cudaArch.replace(',', ';'))]
    else:
      return []

  def setupDependencies(self, framework):
    '''Request the configure modules this package reads (scalar types, compilers, Thrust, libraries) from the framework.'''
    config.package.Package.setupDependencies(self, framework)
    self.scalarTypes  = framework.require('PETSc.options.scalarTypes',self)
    self.compilers    = framework.require('config.compilers',self)
    self.thrust       = framework.require('config.packages.Thrust',self)
    self.libraries    = framework.require('config.libraries', self)
    self.odeps        = [self.thrust] # if user supplies thrust, install it first
    return

  def getSearchDirectories(self):
    '''Yield the detected CUDA directory first (when setCudaDir() found one), then the generic package search directories.'''
    if hasattr(self, 'cudaDir'):
      yield self.cudaDir
    yield from config.package.Package.getSearchDirectories(self)

  def getIncludeDirs(self, prefix, includeDir):
    '''Return the include directories for prefix: the generic ones plus, for NVHPC layouts, the cuda and math_libs include directories.'''
    # See the comments in generateLibList() for the different prefix formats.
    # format A, prefix = /path/cuda-11.4.0/, includeDir = 'include'. The superclass's method handles this well.
    dirs = config.package.Package.getIncludeDirs(self, prefix, includeDir)
    if not isinstance(dirs, list):
      dirs = [dirs]

    oneUp   = os.path.dirname(prefix)   # formats B and C: /path/nvhpc/Linux_x86_64/21.7
    twoUp   = os.path.dirname(oneUp)    # format D:        /path/nvhpc/Linux_x86_64/21.7
    version = os.path.basename(prefix)  # format D:        11.4
    layouts = (
      # format B and C, prefix = /path/nvhpc/Linux_x86_64/21.7/compilers or /path/nvhpc/Linux_x86_64/21.7/cuda
      [os.path.join(oneUp,component,'include') for component in ('cuda','math_libs')],
      # format D, prefix = /path/nvhpc/Linux_x86_64/21.7/cuda/11.4
      [os.path.join(twoUp,component,version,'include') for component in ('cuda','math_libs')],
    )
    for pair in layouts:
      # a layout only counts when both its cuda and math_libs include directories exist
      if all(os.path.isdir(path) for path in pair):
        dirs.extend(pair)
    return dirs

  def fixWinLib(liblist):
    # libfoo.a -> foo.lib
    winliblist = []
    for lib in liblist:
      winliblist.append(lib[3:-1]+'lib')
      return winliblist

  def generateLibList(self, directory):
    ''' Generate cuda liblist. The difficulty comes from that cuda can be in different directory structures through system, CUDAToolkit or NVHPC'''

    if self.setCompilers.isCygwin(self.log):
      self.basicliblist = [fixWinLib(lib) for lib in self.basicliblist]
      self.mathliblist = [fixWinLib(lib) for lib in self.mathliblist]
      self.stubliblist = [fixWinLib(lib) for lib in self.stubliblist]

    # 1) From system installation (ex. Ubuntu 21.10), all libraries are on the compiler (nvcc)'s default search paths
    #   /usr/bin/nvcc
    #   /usr/include
    #   /usr/lib/x86_64-linux-gnu/{libcudart.so,..,libcuda.so,..,stubs}.  See https://wiki.ubuntu.com/MultiarchSpec for info on this new directory structure
    #
    # 2) CUDAToolkit, with a directory structure like
    #   /path/cuda-11.4.0/{lib64, lib64/stubs}, here lib64/ contains all basic and math libraries
    #                   +/include
    #                   +/bin/{nvcc,..}
    #
    # 3) NVHPC, with a directory structure like
    # /path/nvhpc/Linux_x86_64/21.7/compilers/bin/{nvcc,nvc,nvc++}
    #                             +/cuda/{include,bin/nvcc,lib64,lib64/stubs}, just symbol links to what in cuda/11.4
    #                             +/cuda/11.4/{include,bin/nvcc,lib64,lib64/stubs}
    #                             +/math_libs/{include,lib64,lib64/stubs}, just symbol links to what in math_libs/11.4
    #                             +/math_libs/11.4/{include,lib64,lib64/stubs}
    #                             +/comm_libs/mpi/bin/{mpicc,mpicxx,mpifort}
    #
    # The input argument 'directory' could be in these formats:
    # 0) ''                                             We are checking if the compiler by default supports the libraries
    # A) /path/cuda-11.4.0/lib64,                       by loading a CUDAToolkit or --with-cuda-dir=/path/cuda-11.4.0
    # B) /path/nvhpc/Linux_x86_64/21.7/compilers/lib64, by loading a NVHPC module
    # C) /path/nvhpc/Linux_x86_64/21.7/cuda/lib64,      by --with-cuda-dir=/path/Linux_x86_64/21.7/cuda/
    # D) /path/nvhpc/Linux_x86_64/21.7/cuda/11.4/lib64, by --with-cuda-dir=/path/Linux_x86_64/21.7/cuda/11.4

    # directory is None (''). Test if the compiler by default supports all libraries including the stub
    if not directory and not self.isnvhpc:
      self.liblist = [basicliblist+mathliblist+stubliblist for basicliblist in self.basicliblist for mathliblist in self.mathliblist for stubliblist in self.stubliblist]
      liblist      = config.package.Package.generateLibList(self, directory)
      return liblist

    # 'directory' is in format A, with basic and math libraries in one directory.
    liblist           = [] # initialize
    if not self.isnvhpc:
      toolkitCudaLibDir = directory
      toolkitStubLibDir = os.path.join(toolkitCudaLibDir,'stubs')
      if os.path.isdir(toolkitCudaLibDir) and os.path.isdir(toolkitStubLibDir):
        self.libraries.addRpathSkipDir(toolkitStubLibDir)
        self.liblist = [basicliblist+mathliblist for basicliblist in self.basicliblist for mathliblist in self.mathliblist]
        cudaliblist  = config.package.Package.generateLibList(self, toolkitCudaLibDir)
        self.liblist = self.stubliblist
        stubliblist  = config.package.Package.generateLibList(self,toolkitStubLibDir)
        liblist      = [cudalib+stublib for cudalib in cudaliblist for stublib in stubliblist]

    # 'directory' is in format B or C, and we peel 'directory' two times.
    nvhpcDir        = os.path.dirname(os.path.dirname(directory)) # /path/nvhpc/Linux_x86_64/21.7
    nvhpcCudaLibDir = os.path.join(nvhpcDir,'cuda','lib64')
    nvhpcMathLibDir = os.path.join(nvhpcDir,'math_libs','lib64')
    nvhpcStubLibDir = os.path.join(nvhpcDir,'cuda','lib64','stubs')
    if os.path.isdir(nvhpcCudaLibDir) and os.path.isdir(nvhpcMathLibDir) and os.path.isdir(nvhpcStubLibDir):
      self.libraries.addRpathSkipDir(nvhpcStubLibDir)
      self.liblist = self.basicliblist
      basicliblist  = config.package.Package.generateLibList(self, nvhpcCudaLibDir)
      self.liblist = self.mathliblist
      mathliblist  = config.package.Package.generateLibList(self, nvhpcMathLibDir)
      self.liblist = self.stubliblist
      stubliblist  = config.package.Package.generateLibList(self, nvhpcStubLibDir)
      liblist += [basiclib+mathlib+stublib for basiclib in basicliblist for mathlib in mathliblist for stublib in stubliblist]
      self.math_libs_dir = os.path.join(nvhpcDir,'math_libs') # might be used by Kokkos-Kernels

    # 'directory' is in format D, and we peel 'directory' three times.
    # We preserve the version info in case a NVHPC installation provides multiple cuda versions and we'd like to respect user's choice
    nvhpcDir           = os.path.dirname(os.path.dirname(os.path.dirname(directory))) # /path/nvhpc/Linux_x86_64/21.7
    ver                = os.path.basename(os.path.dirname(directory)) # 11.4
    nvhpcCudaVerLibDir = os.path.join(nvhpcDir,'cuda',ver,'lib64')
    nvhpcMathVerLibDir = os.path.join(nvhpcDir,'math_libs',ver,'lib64')
    nvhpcStubVerLibDir = os.path.join(nvhpcDir,'cuda',ver,'lib64','stubs')
    if os.path.isdir(nvhpcCudaVerLibDir) and os.path.isdir(nvhpcMathVerLibDir) and os.path.isdir(nvhpcStubVerLibDir):
      self.libraries.addRpathSkipDir(nvhpcStubVerLibDir)
      self.liblist = self.basicliblist
      basicliblist  = config.package.Package.generateLibList(self, nvhpcCudaVerLibDir)
      self.liblist = self.mathliblist
      mathliblist  = config.package.Package.generateLibList(self, nvhpcMathVerLibDir)
      self.liblist = self.stubliblist
      stubliblist  = config.package.Package.generateLibList(self, nvhpcStubVerLibDir)
      liblist += [basiclib+mathlib+stublib for basiclib in basicliblist for mathlib in mathliblist for stublib in stubliblist]
      self.math_libs_dir = os.path.join(nvhpcDir,'math_libs',ver)
    return liblist

  def checkSizeofVoidP(self):
    '''Raise RuntimeError unless sizeof(void*) measured with the CUDA compiler equals the size recorded for the C compiler'''
    self.log.write('Checking if sizeof(void*) in CUDA is the same as with regular compiler\n')
    cudaPointerSize = self.types.checkSizeof('void *', (8, 4), lang='CUDA', save=False)
    if cudaPointerSize == self.types.sizes['void-p']:
      return
    raise RuntimeError('CUDA Error: sizeof(void*) with CUDA compiler is ' + str(cudaPointerSize) + ' which differs from sizeof(void*) with C compiler')

  def checkThrustVersion(self,minVer):
    '''Return the result of compiling, with the CUDA compiler, a snippet that errors out when THRUST_VERSION < minVer'''
    probe = ''.join([
      '#include <thrust/version.h> \n',
      '#if THRUST_VERSION < ', str(minVer), '\n',
      '#error "thrust version is too low"\n',
      '#endif\n',
    ])
    self.pushLanguage('CUDA')
    compiled = self.checkCompile(probe)
    self.popLanguage()
    return compiled

  def configureTypes(self):
    '''Validate precision, pointer size and (for complex scalars without a user-supplied thrust) the bundled thrust version; raises RuntimeError on any mismatch.'''
    import config.setCompilers
    if self.getDefaultPrecision() not in ('double', 'single'):
      raise RuntimeError('Must use either single or double precision with CUDA')
    self.checkSizeofVoidP()
    # if no user-supplied thrust, check the system's complex ability
    relyOnBundledThrust = not self.thrust.found and self.scalarTypes.scalartype == 'complex'
    if relyOnBundledThrust and not self.checkThrustVersion(100908):
      raise RuntimeError('CUDA Error: The thrust library is too low to support PetscComplex. Use --download-thrust or --with-thrust-dir to give a thrust >= 1.9.8')
    return

  def versionToStandardForm(self,ver):
    '''Converts from CUDA 7050 notation to standard notation 7.5'''
    return ".".join(map(str,[int(ver)//1000, int(ver)//10%10]))

  def checkNVCCDoubleAlign(self):
    if 'known-cuda-align-double' in self.argDB:
      if not self.argDB['known-cuda-align-double']:
        raise RuntimeError('CUDA error: PETSc currently requires that CUDA double alignment match the C compiler')
    else:
      typedef = 'typedef struct {double a; int b;} teststruct;\n'
      cuda_size = self.types.checkSizeof('teststruct', (16, 12), lang='CUDA', codeBegin=typedef, save=False)
      c_size = self.types.checkSizeof('teststruct', (16, 12), lang='C', codeBegin=typedef, save=False)
      if c_size != cuda_size:
        raise RuntimeError('CUDA compiler error: memory alignment doesn\'t match C compiler (try adding -malign-double to compiler options)')
    return

  def setCudaDir(self):
    '''
    Detect the CUDA installation directory and the flavour of the CUDA compiler.

    Sets self.cudaclang (whether the CUDA compiler is clang), may set self.cudaDir (from
    --with-cuda-dir, from clang's verbose output, or from the location of nvcc) and sets
    self.isnvhpc = 1 when a math_libs directory is found one or two levels above the nvcc prefix.
    self.cudaDir is left unset when none of the probes succeeds.
    '''
    import os
    self.pushLanguage('CUDA')
    petscNvcc = self.getCompiler()
    self.cudaclang = self.setCompilers.isClang(petscNvcc, self.log)
    self.popLanguage()

    # The presence of the cudaDir attribute means that PETSc has detected a conventional installation of CUDA.
    # This seems to be needed by some external packages that can build against it.
    # PETSc can be built when the various components (cudaruntime, cublas, etc) are scattered in different locations,
    # like in the case of NVIDIA packages from pip.
    if 'with-cuda-dir' in self.argDB and os.path.exists(os.path.join(self.argDB['with-cuda-dir'],'include','cuda.h')):
      self.cudaDir = self.argDB['with-cuda-dir']
    if self.setCompilers.isCygwin(self.log):  # Handle win32fe nvcc as the compiler name
      # keep the second space-separated word of the compiler command
      petscNvcc = petscNvcc.split(' ')[1]

    # resolve the compiler to a full path, stored as self.systemNvcc when found
    self.getExecutable(petscNvcc,getFullPath=1,resultName='systemNvcc')
    if hasattr(self,'systemNvcc') and not hasattr(self, 'cudaDir'):
      if self.cudaclang:
        # the fourth whitespace-separated token of clang's "Found CUDA installation" line is taken as the directory, commas removed
        (out, err, ret) = Configure.executeShellCommand(petscNvcc + ' -v 2>&1 | grep "Found CUDA installation"',timeout = 60, log = self.log, threads = 1)
        self.cudaDir = out.split()[3].replace(',','')
      else:
        nvccDir = os.path.dirname(self.systemNvcc) # /path/bin
        d = os.path.dirname(nvccDir) # /path
        # d might be /to/Linux_x86_64/21.7/cuda or /to/Linux_x86_64/21.7/cuda/12.2, check if math_libs exist. If yes, we are using NVHPC
        if os.path.exists(os.path.join(d,'..','math_libs')) or os.path.exists(os.path.join(d,'..','..','math_libs')):
          self.isnvhpc = 1
        if os.path.exists(os.path.join(d,'include','cuda.h')): # CUDAToolkit with a structure /path/{bin/nvcc, include/cuda.h}
          self.cudaDir = d
        elif os.path.exists(os.path.normpath(os.path.join(d,'..','cuda','include','cuda.h'))): # could be NVHPC
          self.cudaDir = os.path.normpath(os.path.join(d,'..','cuda')) # get rid of .. in path, getting /path/Linux_x86_64/21.5/cuda

  def configureLibrary(self):
    '''
    Configure CUDA: locate the installation, validate alignment/types, determine the target
    architecture, verify the resulting arch flags compile, and record defines and make macros.

    The CUDA arch comes from --with-cuda-arch when given; otherwise from NVIDIA's deviceQuery
    utility, and failing that (unless --with-batch) from a small test program run on device 0.
    Raises RuntimeError when the CUDA compiler rejects the generated arch flags (and whatever the
    called checks raise).
    '''
    self.setCudaDir()
    # skip this because it does not properly set self.lib and self.include if they have already been set
    if not self.found: config.package.Package.configureLibrary(self)
    self.checkNVCCDoubleAlign()
    self.configureTypes()
    # includes from --download-thrust should override the prepackaged version in cuda - so list thrust.include before cuda.include on the compile command.
    if self.thrust.found:
      self.log.write('Overriding the thrust library in CUDAToolkit with a user-specified one\n')
      self.include = self.thrust.include+self.include

    self.pushLanguage('CUDA')
    petscNvcc = self.getCompiler()
    self.popLanguage()

    # Handle CUDA arch
    if 'with-cuda-arch' in self.framework.argDB:
      self.cudaArch = self.argDB['with-cuda-arch']
    elif hasattr(self, 'cudaDir'):
      dq = os.path.join(self.cudaDir,'extras','demo_suite')
      self.getExecutable('deviceQuery',path = dq)
      if hasattr(self,'deviceQuery'):
        try:
          (out, err, ret) = Configure.executeShellCommand(self.deviceQuery + ' | grep "CUDA Capability"',timeout = 60, log = self.log, threads = 1)
        except Exception as e:
          self.log.write('NVIDIA utility deviceQuery failed '+str(e)+'\n')
        else:
          try:
            # the last three characters of the first matching line are read as "major.minor"
            out = out.split('\n')[0]
            sm = out[-3:]
            self.cudaArch = str(int(10*float(sm)))
          except Exception:
            # best effort: fall through to the compiled device query below
            self.log.write('Unable to parse the CUDA Capability output from the NVIDIA utility deviceQuery\n')

    if not hasattr(self,'cudaArch') and not self.argDB['with-batch']:
      includes = '''#include <stdio.h>
                    #include <cuda_runtime.h>
                    #include <cuda_runtime_api.h>
                    #include <cuda_device_runtime_api.h>'''
      body = '''cudaError_t cerr;
                cudaDeviceProp dp;
                cerr = cudaGetDeviceProperties(&dp, 0);
                if (cerr) {
              #if (CUDART_VERSION >= 8000)
                  printf("Error calling cudaGetDeviceProperties with CUDA error %d (%s) : %s\\n", (int)cerr, cudaGetErrorName(cerr), cudaGetErrorString(cerr));
              #else
                  printf("Error calling cudaGetDeviceProperties with CUDA error %d\\n", (int)cerr);
              #endif
                }
                else printf("%d\\n",10*dp.major+dp.minor);
                return(cerr);'''
      self.pushLanguage('CUDA')
      try:
        (output,status) = self.outputRun(includes, body)
      except Exception as e:
        self.log.write('petsc-supplied CUDA device query test failed: '+str(e)+'\n')
        self.popLanguage()
      else:
        self.popLanguage()
        self.log.write('petsc-supplied CUDA device query test output: '+output+', status: '+str(status)+'\n')
        if not status:
          try:
            gen = int(output)
          except ValueError:
            # output was not a plain number; leave cudaArch unset
            pass
          else:
            self.log.write('petsc-supplied CUDA device query test found the CUDA Capability is '+str(gen)+'\n')
            self.cudaArch = str(gen)
    # Store min cuda arch at configure time for later error diagnosis
    if self.cudaArchIsVersionList():
      # the list holds strings: compare numerically, otherwise '100' would sort below '90'
      self.addDefine('PKG_CUDA_MIN_ARCH', min(self.cudaArchList(), key=int))

    # Check flags validity
    if hasattr(self,'cudaArch'):
      self.pushLanguage('CUDA')
      if self.cudaclang:
        self.setCompilers.CUDAFLAGS += self.clangArchFlags()
      else: # assuming nvcc
        self.setCompilers.CUDAFLAGS += self.nvccArchFlags()

      try:
        valid = self.checkCompile()
      except Exception as e:
        self.log.write('checkCompile on CUDA compile with gencode failed '+str(e)+'\n')
        self.popLanguage()
        valid = False
      else:
        self.log.write('Flag from checkCompile on CUDA compile with gencode '+str(valid)+'\n')
        self.popLanguage()

      if not valid:
        raise RuntimeError('CUDA compile failed with arch flags "'+self.setCompilers.CUDAFLAGS+'"'
                           ' generated from "--with-cuda-arch='+self.cudaArch+'"')

    self.addDefine('HAVE_CUPM','1') # Have either CUDA or HIP
    if not self.version_tuple:
      self.checkVersion() # set version_tuple
    if self.version_tuple[0] > 12 or (self.version_tuple[0] == 12 and self.version_tuple[1] >= 2):
      self.addDefine('HAVE_CUDA_VERSION_12_2PLUS','1')
    if self.version_tuple[0] >= 11:
      self.addDefine('HAVE_CUDA_VERSION_11PLUS','1')
    if self.cudaclang:
      self.addDefine('HAVE_CUDA_CLANG','1') # code compilation in aijdevice and landau is broken

    # determine the compiler used by nvcc
    # '-ccbin mpicxx' might be in by self.setCompilers.CUDAFLAGS
    if not self.cudaclang:
      (out, err, ret) = Configure.executeShellCommand(petscNvcc + ' ' + self.setCompilers.CUDAFLAGS + ' --dryrun dummy.cu 2>&1 | grep D__CUDACC__ | head -1 | cut -f2 -d" "')
      if out:
        # MPI.py adds its include paths and libraries to these lists and saves them again
        self.setCompilers.CUDA_CXX = out
        self.setCompilers.CUDA_CXXFLAGS = ''
        self.setCompilers.CUDA_CXXLIBS = ''
        self.logPrint('Determined the compiler nvcc uses is ' + out)
        self.logPrint('PETSc C compiler '+self.compilers.CC)
        self.logPrint('PETSc C++ compiler '+self.compilers.CXX)

        # TODO: How to handle MPI compiler wrapper as opposed to its underlying compiler
        if out == self.compilers.CXX:
          # nvcc will say it is using gcc as its compiler, it pass a flag when using to
          # treat it as a C++ compiler
          newFlags = self.setCompilers.CXXPPFLAGS.split()+self.setCompilers.CXXFLAGS.split()
          # need to remove the std flag from the list, nvcc will already have its own flag set
          # With IBM XL compilers, we also need to remove -+
          # Remove -O since the optimization level is already set by CUDAC_FLAGS, otherwise Kokkos nvcc_wrapper will complain
          #   "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the last
          #    is used because nvcc can only accept a single optimization setting."
          self.setCompilers.CUDA_CXXFLAGS = ' '.join([flg for flg in newFlags if not flg.startswith(('-std=c++','-std=gnu++','-+','-O'))])
        else:
          # only add any -I arguments since compiler arguments may not work
          flags = self.setCompilers.CPPFLAGS.split(' ')+self.setCompilers.CXXFLAGS.split(' ')
          for i in flags:
            if i.startswith('-I'):
              self.setCompilers.CUDA_CXXFLAGS += ' '+i
        # set compiler flags for compiler called by nvcc
        if self.setCompilers.CUDA_CXXFLAGS:
          self.addMakeMacro('CUDA_CXXFLAGS',self.setCompilers.CUDA_CXXFLAGS)
        else:
          self.logPrint('No CUDA_CXXFLAGS available')
        self.addMakeMacro('CUDA_CXX',self.setCompilers.CUDA_CXX)

        # Intel compiler environment breaks GNU compilers, fix it just enough to allow g++ to run
        if self.setCompilers.CUDA_CXX == 'gcc' and config.setCompilers.Configure.isIntel(self.compilers.CXX,self.log):
          self.logPrint('''Removing Intel's CPLUS_INCLUDE_PATH when using nvcc since it breaks g++''')
          self.delMakeMacro('CUDAC')
          self.addMakeMacro('CUDAC','CPLUS_INCLUDE_PATH="" '+petscNvcc)
      else:
        self.logPrint('nvcc --dryrun failed, unable to determine CUDA_CXX and CUDA_CXXFLAGS')

    if not self.cudaclang:
      self.addMakeMacro('CUDA_HOSTFLAGS','--compiler-options="$(CXXCPPFLAGS) $(CUDA_CXXFLAGS)"')
      self.addMakeMacro('CUDA_PETSC_GENDEPS','$(call quiet,CUDAC,.dep) --generate-dependencies --output-directory=$(@D) $(MPICXX_INCLUDES) $(CUDAC_FLAGS) --compiler-options="$(CXXCPPFLAGS) $(CUDA_CXXFLAGS)"')
    else:
      self.addMakeMacro('CUDA_HOSTFLAGS','$(CXXCPPFLAGS) $(CUDA_CXXFLAGS) $(CUDA_DEPFLAGS) $(PETSC_CC_INCLUDES)')
      self.addMakeMacro('CUDA_PETSC_GENDEPS','true')
    return

  def checkKnownBadCUDAHostCompilerCombo(self):
    """
    Check for nvcc + host compiler combinations that are unable to compile or have some other known
    defect and prints a warning to the user. Has no other effect.

    For example:
    1. CUDA 11.5 + gcc 11.3.0 produces

    /usr/include/c++/11/bits/std_function.h:435:145: error: parameter packs not expanded with '...':
  435 |         function(_Functor&& __f)
      |         ^

    """
    if not self.argDB['with-cuda']:
      return

    assert self.version_tuple
    assert isinstance(self.version_tuple, tuple)
    assert isinstance(self.version_tuple[0], int)
    if self.version_tuple[:2] == (11, 5):
      # CUDA 11.5.X
      cxx = self.setCompilers.CXX
      if self.setCompilers.isGNU(cxx, self.log):
        output, _, _      = self.executeShellCommand(cxx + ' -dumpfullversion', log=self.log)
        gcc_version       = output.strip().split('.')
        gcc_version_tuple = tuple(map(int, gcc_version))
        if gcc_version_tuple[:3] == (11, 3, 0):
          mess = """
          You appear to be using CUDA {} and GCC {}. If you get compile errors along the lines of:

          /usr/include/c++/11/bits/std_function.h:435:145: error: parameter packs not expanded with '...':
          435 |         function(_Functor&& __f)

          This is a bug that crops up with exactly the combination of CUDA 11.5.X + GCC 11.3.0.
          It is a bug in nvcc itself, and the file (C++ standard header <function>) originates from
          within CUDA headers. There is no way to work around it in software.

          Your only options are:
          - Use a newer nvcc version
          - Use an older gcc version
          """.format('.'.join(map(str, self.version_tuple)), gcc_version)
          self.logPrintWarning(mess)
    return

  def configure(self, *args, **kwargs):
    '''Run the inherited configure with the given arguments, then run checkKnownBadCUDAHostCompilerCombo through executeTest.'''
    super().configure(*args, **kwargs)
    self.executeTest(self.checkKnownBadCUDAHostCompilerCombo)
    return