1 #pragma once 2 3 #include <petscdevice.h> 4 #include <petscpkg_version.h> 5 6 /* MANSEC = Sys */ 7 8 #if defined(__NVCC__) || defined(__CUDACC__) 9 #define PETSC_USING_NVCC 1 10 #endif 11 12 #if PetscDefined(HAVE_CUDA) 13 #include <cuda.h> 14 #include <cuda_runtime.h> 15 #include <cublas_v2.h> 16 #define DISABLE_CUSPARSE_DEPRECATED 17 #include <cusparse.h> 18 #include <cusolverDn.h> 19 #include <cusolverSp.h> 20 #include <cufft.h> 21 #include <curand.h> 22 23 /* cuBLAS does not have cublasGetErrorName(). We create one on our own. */ 24 PETSC_EXTERN const char *PetscCUBLASGetErrorName(cublasStatus_t); /* PETSC_EXTERN since it is exposed by the CHKERRCUBLAS macro */ 25 PETSC_EXTERN const char *PetscCUSolverGetErrorName(cusolverStatus_t); 26 PETSC_EXTERN const char *PetscCUFFTGetErrorName(cufftResult); 27 28 /* REMOVE ME */ 29 #define WaitForCUDA() cudaDeviceSynchronize() 30 31 /* CUDART_VERSION = 1000 x major + 10 x minor version */ 32 33 /* Could not find exactly which CUDART_VERSION introduced cudaGetErrorName. At least it was in CUDA 8.0 (Sep. 2016) */ 34 #if PETSC_PKG_CUDA_VERSION_GE(8, 0, 0) 35 #define PetscCallCUDAVoid(...) \ 36 do { \ 37 const cudaError_t _p_cuda_err__ = __VA_ARGS__; \ 38 PetscCheckAbort(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d (%s) : %s", (PetscErrorCode)_p_cuda_err__, cudaGetErrorName(_p_cuda_err__), cudaGetErrorString(_p_cuda_err__)); \ 39 } while (0) 40 41 #define PetscCallCUDA(...) \ 42 do { \ 43 const cudaError_t _p_cuda_err__ = __VA_ARGS__; \ 44 PetscCheck(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d (%s) : %s", (PetscErrorCode)_p_cuda_err__, cudaGetErrorName(_p_cuda_err__), cudaGetErrorString(_p_cuda_err__)); \ 45 } while (0) 46 #else /* PETSC_PKG_CUDA_VERSION_GE(8,0,0) */ 47 #define PetscCallCUDA(...) \ 48 do { \ 49 const cudaError_t _p_cuda_err__ = __VA_ARGS__; \ 50 PetscCheck(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d", (PetscErrorCode)_p_cuda_err__); \ 51 } while (0) 52 53 #define PetscCallCUDAVoid(...) \ 54 do { \ 55 const cudaError_t _p_cuda_err__ = __VA_ARGS__; \ 56 PetscCheckAbort(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d", (PetscErrorCode)_p_cuda_err__); \ 57 } while (0) 58 #endif /* PETSC_PKG_CUDA_VERSION_GE(8,0,0) */ 59 #define CHKERRCUDA(...) PetscCallCUDA(__VA_ARGS__) 60 61 #define PetscCUDACheckLaunch \ 62 do { \ 63 /* Check synchronous errors, i.e. pre-launch */ \ 64 PetscCallCUDA(cudaGetLastError()); \ 65 /* Check asynchronous errors, i.e. kernel failed (ULF) */ \ 66 PetscCallCUDA(cudaDeviceSynchronize()); \ 67 } while (0) 68 69 #define PetscCallCUBLAS(...) \ 70 do { \ 71 const cublasStatus_t _p_cublas_stat__ = __VA_ARGS__; \ 72 if (PetscUnlikely(_p_cublas_stat__ != CUBLAS_STATUS_SUCCESS)) { \ 73 const char *name = PetscCUBLASGetErrorName(_p_cublas_stat__); \ 74 if (((_p_cublas_stat__ == CUBLAS_STATUS_NOT_INITIALIZED) || (_p_cublas_stat__ == CUBLAS_STATUS_ALLOC_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \ 75 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \ 76 "cuBLAS error %d (%s). " \ 77 "Reports not initialized or alloc failed; " \ 78 "this indicates the GPU may have run out resources", \ 79 (PetscErrorCode)_p_cublas_stat__, name); \ 80 } else { \ 81 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuBLAS error %d (%s)", (PetscErrorCode)_p_cublas_stat__, name); \ 82 } \ 83 } \ 84 } while (0) 85 #define CHKERRCUBLAS(...) PetscCallCUBLAS(__VA_ARGS__) 86 87 #if (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) /* According to cuda/10.1.168 on OLCF Summit */ 88 #define PetscCallCUSPARSE(...) \ 89 do { \ 90 const cusparseStatus_t _p_cusparse_stat__ = __VA_ARGS__; \ 91 if (PetscUnlikely(_p_cusparse_stat__)) { \ 92 const char *name = cusparseGetErrorName(_p_cusparse_stat__); \ 93 const char *descr = cusparseGetErrorString(_p_cusparse_stat__); \ 94 PetscCheck((_p_cusparse_stat__ != CUSPARSE_STATUS_NOT_INITIALIZED) && (_p_cusparse_stat__ != CUSPARSE_STATUS_ALLOC_FAILED), PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \ 95 "cuSPARSE errorcode %d (%s) : %s.; " \ 96 "this indicates the GPU has run out resources", \ 97 (int)_p_cusparse_stat__, name, descr); \ 98 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSPARSE errorcode %d (%s) : %s", (int)_p_cusparse_stat__, name, descr); \ 99 } \ 100 } while (0) 101 #else /* (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) */ 102 #define PetscCallCUSPARSE(...) \ 103 do { \ 104 const cusparseStatus_t _p_cusparse_stat__ = __VA_ARGS__; \ 105 PetscCheck(_p_cusparse_stat__ == CUSPARSE_STATUS_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSPARSE errorcode %d", (PetscErrorCode)_p_cusparse_stat__); \ 106 } while (0) 107 #endif /* (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) */ 108 #define CHKERRCUSPARSE(...) PetscCallCUSPARSE(__VA_ARGS__) 109 110 #define PetscCallCUSOLVER(...) \ 111 do { \ 112 const cusolverStatus_t _p_cusolver_stat__ = __VA_ARGS__; \ 113 if (PetscUnlikely(_p_cusolver_stat__ != CUSOLVER_STATUS_SUCCESS)) { \ 114 const char *name = PetscCUSolverGetErrorName(_p_cusolver_stat__); \ 115 if (((_p_cusolver_stat__ == CUSOLVER_STATUS_NOT_INITIALIZED) || (_p_cusolver_stat__ == CUSOLVER_STATUS_ALLOC_FAILED) || (_p_cusolver_stat__ == CUSOLVER_STATUS_INTERNAL_ERROR)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \ 116 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \ 117 "cuSolver error %d (%s). " \ 118 "This indicates the GPU may have run out resources", \ 119 (PetscErrorCode)_p_cusolver_stat__, name); \ 120 } else { \ 121 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSolver error %d (%s)", (PetscErrorCode)_p_cusolver_stat__, name); \ 122 } \ 123 } \ 124 } while (0) 125 #define CHKERRCUSOLVER(...) PetscCallCUSOLVER(__VA_ARGS__) 126 127 #define PetscCallCUFFT(...) \ 128 do { \ 129 const cufftResult_t _p_cufft_stat__ = __VA_ARGS__; \ 130 if (PetscUnlikely(_p_cufft_stat__ != CUFFT_SUCCESS)) { \ 131 const char *name = PetscCUFFTGetErrorName(_p_cufft_stat__); \ 132 if (((_p_cufft_stat__ == CUFFT_SETUP_FAILED) || (_p_cufft_stat__ == CUFFT_ALLOC_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \ 133 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \ 134 "cuFFT error %d (%s). " \ 135 "Reports not initialized or alloc failed; " \ 136 "this indicates the GPU has run out resources", \ 137 (PetscErrorCode)_p_cufft_stat__, name); \ 138 } else { \ 139 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuFFT error %d (%s)", (PetscErrorCode)_p_cufft_stat__, name); \ 140 } \ 141 } \ 142 } while (0) 143 #define CHKERRCUFFT(...) PetscCallCUFFT(__VA_ARGS__) 144 145 #define PetscCallCURAND(...) \ 146 do { \ 147 const curandStatus_t _p_curand_stat__ = __VA_ARGS__; \ 148 if (PetscUnlikely(_p_curand_stat__ != CURAND_STATUS_SUCCESS)) { \ 149 if (((_p_curand_stat__ == CURAND_STATUS_INITIALIZATION_FAILED) || (_p_curand_stat__ == CURAND_STATUS_ALLOCATION_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \ 150 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \ 151 "cuRAND error %d. " \ 152 "Reports not initialized or alloc failed; " \ 153 "this indicates the GPU has run out resources", \ 154 (PetscErrorCode)_p_curand_stat__); \ 155 } else { \ 156 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuRand error %d", (PetscErrorCode)_p_curand_stat__); \ 157 } \ 158 } \ 159 } while (0) 160 #define CHKERRCURAND(...) PetscCallCURAND(__VA_ARGS__) 161 162 PETSC_EXTERN cudaStream_t PetscDefaultCudaStream; // The default stream used by PETSc 163 PETSC_EXTERN PetscErrorCode PetscCUBLASGetHandle(cublasHandle_t *); 164 PETSC_EXTERN PetscErrorCode PetscCUSOLVERDnGetHandle(cusolverDnHandle_t *); 165 PETSC_EXTERN PetscErrorCode PetscGetCurrentCUDAStream(cudaStream_t *); 166 167 #endif // PETSC_HAVE_CUDA 168 169 // these can also be defined in petscdevice_hip.h so we undef and define them *only* if the 170 // current compiler is NVCC. In this case if petscdevice_hip.h is included first, the macros 171 // would already be defined, but they would be empty since we cannot be using HCC at the same 172 // time. 173 #if PetscDefined(USING_NVCC) 174 #undef PETSC_HOST_DECL 175 #undef PETSC_DEVICE_DECL 176 #undef PETSC_KERNEL_DECL 177 #undef PETSC_SHAREDMEM_DECL 178 #undef PETSC_FORCEINLINE 179 #undef PETSC_CONSTMEM_DECL 180 181 #define PETSC_HOST_DECL __host__ 182 #define PETSC_DEVICE_DECL __device__ 183 #define PETSC_KERNEL_DECL __global__ 184 #define PETSC_SHAREDMEM_DECL __shared__ 185 #define PETSC_FORCEINLINE __forceinline__ 186 #define PETSC_CONSTMEM_DECL __constant__ 187 #endif 188 189 #ifndef PETSC_HOST_DECL // use HOST_DECL as canary 190 #define PETSC_HOST_DECL 191 #define PETSC_DEVICE_DECL 192 #define PETSC_KERNEL_DECL 193 #define PETSC_SHAREDMEM_DECL 194 #define PETSC_FORCEINLINE inline 195 #define PETSC_CONSTMEM_DECL 196 #endif 197 198 #ifndef PETSC_DEVICE_DEFINED_DECLS_PRIVATE 199 #define PETSC_DEVICE_DEFINED_DECLS_PRIVATE 200 #define PETSC_HOSTDEVICE_DECL PETSC_HOST_DECL PETSC_DEVICE_DECL 201 #define PETSC_DEVICE_INLINE_DECL PETSC_DEVICE_DECL PETSC_FORCEINLINE 202 #define PETSC_HOSTDEVICE_INLINE_DECL PETSC_HOSTDEVICE_DECL PETSC_FORCEINLINE 203 #endif 204