xref: /petsc/include/petscdevice_cuda.h (revision 9d47de495d3c23378050c1b4a410c12a375cb6c6)
1 #pragma once
2 
3 #include <petscdevice.h>
4 #include <petscpkg_version.h>
5 
6 /* MANSEC = Sys */
7 
8 #if defined(__NVCC__) || defined(__CUDACC__)
9   #define PETSC_USING_NVCC 1
10 #endif
11 
12 #if PetscDefined(HAVE_CUDA)
13   #include <cuda.h>
14   #include <cuda_runtime.h>
15   #include <cublas_v2.h>
16   #define DISABLE_CUSPARSE_DEPRECATED
17   #include <cusparse.h>
18   #include <cusolverDn.h>
19   #include <cusolverSp.h>
20   #include <cufft.h>
21   #include <curand.h>
22   #include <nvml.h> // NVML comes with the NVIDIA GPU driver
23 
24 /* cuBLAS does not have cublasGetErrorName(). We create one on our own. */
25 PETSC_EXTERN const char *PetscCUBLASGetErrorName(cublasStatus_t); /* PETSC_EXTERN since it is exposed by the CHKERRCUBLAS macro */
26 PETSC_EXTERN const char *PetscCUSolverGetErrorName(cusolverStatus_t);
27 PETSC_EXTERN const char *PetscCUFFTGetErrorName(cufftResult);
28 
29   /* REMOVE ME */
30   #define WaitForCUDA() cudaDeviceSynchronize()
31 
32   /* CUDART_VERSION = 1000 x major + 10 x minor version */
33 
34   /* Could not find exactly which CUDART_VERSION introduced cudaGetErrorName. At least it was in CUDA 8.0 (Sep. 2016) */
35   #if PETSC_PKG_CUDA_VERSION_GE(8, 0, 0)
36     #define PetscCallCUDAVoid(...) \
37       do { \
38         const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
39         PetscCheckAbort(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d (%s) : %s", (PetscErrorCode)_p_cuda_err__, cudaGetErrorName(_p_cuda_err__), cudaGetErrorString(_p_cuda_err__)); \
40       } while (0)
41 
42     #define PetscCallCUDA(...) \
43       do { \
44         const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
45         PetscCheck(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d (%s) : %s", (PetscErrorCode)_p_cuda_err__, cudaGetErrorName(_p_cuda_err__), cudaGetErrorString(_p_cuda_err__)); \
46       } while (0)
47   #else /* PETSC_PKG_CUDA_VERSION_GE(8,0,0) */
48     #define PetscCallCUDA(...) \
49       do { \
50         const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
51         PetscCheck(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d", (PetscErrorCode)_p_cuda_err__); \
52       } while (0)
53 
54     #define PetscCallCUDAVoid(...) \
55       do { \
56         const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
57         PetscCheckAbort(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d", (PetscErrorCode)_p_cuda_err__); \
58       } while (0)
59   #endif /* PETSC_PKG_CUDA_VERSION_GE(8,0,0) */
60   #define CHKERRCUDA(...) PetscCallCUDA(__VA_ARGS__)
61 
62   #define PetscCUDACheckLaunch \
63     do { \
64       /* Check synchronous errors, i.e. pre-launch */ \
65       PetscCallCUDA(cudaGetLastError()); \
66       /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
67       PetscCallCUDA(cudaDeviceSynchronize()); \
68     } while (0)
69 
70   #define PetscCallCUBLAS(...) \
71     do { \
72       const cublasStatus_t _p_cublas_stat__ = __VA_ARGS__; \
73       if (PetscUnlikely(_p_cublas_stat__ != CUBLAS_STATUS_SUCCESS)) { \
74         const char *name = PetscCUBLASGetErrorName(_p_cublas_stat__); \
75         if (((_p_cublas_stat__ == CUBLAS_STATUS_NOT_INITIALIZED) || (_p_cublas_stat__ == CUBLAS_STATUS_ALLOC_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
76           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
77                   "cuBLAS error %d (%s). " \
78                   "Reports not initialized or alloc failed; " \
79                   "this indicates the GPU may have run out resources", \
80                   (PetscErrorCode)_p_cublas_stat__, name); \
81         } else { \
82           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuBLAS error %d (%s)", (PetscErrorCode)_p_cublas_stat__, name); \
83         } \
84       } \
85     } while (0)
86   #define CHKERRCUBLAS(...) PetscCallCUBLAS(__VA_ARGS__)
87 
88   #if (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) /* According to cuda/10.1.168 on OLCF Summit */
89     #define PetscCallCUSPARSE(...) \
90       do { \
91         const cusparseStatus_t _p_cusparse_stat__ = __VA_ARGS__; \
92         if (PetscUnlikely(_p_cusparse_stat__)) { \
93           const char *name  = cusparseGetErrorName(_p_cusparse_stat__); \
94           const char *descr = cusparseGetErrorString(_p_cusparse_stat__); \
95           PetscCheck((_p_cusparse_stat__ != CUSPARSE_STATUS_NOT_INITIALIZED) && (_p_cusparse_stat__ != CUSPARSE_STATUS_ALLOC_FAILED), PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
96                      "cuSPARSE errorcode %d (%s) : %s.; " \
97                      "this indicates the GPU has run out resources", \
98                      (int)_p_cusparse_stat__, name, descr); \
99           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSPARSE errorcode %d (%s) : %s", (int)_p_cusparse_stat__, name, descr); \
100         } \
101       } while (0)
102   #else /* (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) */
103     #define PetscCallCUSPARSE(...) \
104       do { \
105         const cusparseStatus_t _p_cusparse_stat__ = __VA_ARGS__; \
106         PetscCheck(_p_cusparse_stat__ == CUSPARSE_STATUS_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSPARSE errorcode %d", (PetscErrorCode)_p_cusparse_stat__); \
107       } while (0)
108   #endif /* (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) */
109   #define CHKERRCUSPARSE(...) PetscCallCUSPARSE(__VA_ARGS__)
110 
111   #define PetscCallCUSOLVER(...) \
112     do { \
113       const cusolverStatus_t _p_cusolver_stat__ = __VA_ARGS__; \
114       if (PetscUnlikely(_p_cusolver_stat__ != CUSOLVER_STATUS_SUCCESS)) { \
115         const char *name = PetscCUSolverGetErrorName(_p_cusolver_stat__); \
116         if (((_p_cusolver_stat__ == CUSOLVER_STATUS_NOT_INITIALIZED) || (_p_cusolver_stat__ == CUSOLVER_STATUS_ALLOC_FAILED) || (_p_cusolver_stat__ == CUSOLVER_STATUS_INTERNAL_ERROR)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
117           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
118                   "cuSolver error %d (%s). " \
119                   "This indicates the GPU may have run out resources", \
120                   (PetscErrorCode)_p_cusolver_stat__, name); \
121         } else { \
122           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSolver error %d (%s)", (PetscErrorCode)_p_cusolver_stat__, name); \
123         } \
124       } \
125     } while (0)
126   #define CHKERRCUSOLVER(...) PetscCallCUSOLVER(__VA_ARGS__)
127 
128   #define PetscCallCUFFT(...) \
129     do { \
130       const cufftResult_t _p_cufft_stat__ = __VA_ARGS__; \
131       if (PetscUnlikely(_p_cufft_stat__ != CUFFT_SUCCESS)) { \
132         const char *name = PetscCUFFTGetErrorName(_p_cufft_stat__); \
133         if (((_p_cufft_stat__ == CUFFT_SETUP_FAILED) || (_p_cufft_stat__ == CUFFT_ALLOC_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
134           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
135                   "cuFFT error %d (%s). " \
136                   "Reports not initialized or alloc failed; " \
137                   "this indicates the GPU has run out resources", \
138                   (PetscErrorCode)_p_cufft_stat__, name); \
139         } else { \
140           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuFFT error %d (%s)", (PetscErrorCode)_p_cufft_stat__, name); \
141         } \
142       } \
143     } while (0)
144   #define CHKERRCUFFT(...) PetscCallCUFFT(__VA_ARGS__)
145 
146   #define PetscCallCURAND(...) \
147     do { \
148       const curandStatus_t _p_curand_stat__ = __VA_ARGS__; \
149       if (PetscUnlikely(_p_curand_stat__ != CURAND_STATUS_SUCCESS)) { \
150         if (((_p_curand_stat__ == CURAND_STATUS_INITIALIZATION_FAILED) || (_p_curand_stat__ == CURAND_STATUS_ALLOCATION_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
151           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
152                   "cuRAND error %d. " \
153                   "Reports not initialized or alloc failed; " \
154                   "this indicates the GPU has run out resources", \
155                   (PetscErrorCode)_p_curand_stat__); \
156         } else { \
157           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuRand error %d", (PetscErrorCode)_p_curand_stat__); \
158         } \
159       } \
160     } while (0)
161   #define CHKERRCURAND(...) PetscCallCURAND(__VA_ARGS__)
162 
163 PETSC_EXTERN cudaStream_t   PetscDefaultCudaStream; // The default stream used by PETSc
164 PETSC_EXTERN PetscErrorCode PetscCUBLASGetHandle(cublasHandle_t *);
165 PETSC_EXTERN PetscErrorCode PetscCUSOLVERDnGetHandle(cusolverDnHandle_t *);
166 PETSC_EXTERN PetscErrorCode PetscGetCurrentCUDAStream(cudaStream_t *);
167 
168 #endif // PETSC_HAVE_CUDA
169 
170 // these can also be defined in petscdevice_hip.h so we undef and define them *only* if the
171 // current compiler is NVCC. In this case if petscdevice_hip.h is included first, the macros
172 // would already be defined, but they would be empty since we cannot be using HCC at the same
173 // time.
174 #if PetscDefined(USING_NVCC)
175   #undef PETSC_HOST_DECL
176   #undef PETSC_DEVICE_DECL
177   #undef PETSC_KERNEL_DECL
178   #undef PETSC_SHAREDMEM_DECL
179   #undef PETSC_FORCEINLINE
180   #undef PETSC_CONSTMEM_DECL
181 
182   #define PETSC_HOST_DECL      __host__
183   #define PETSC_DEVICE_DECL    __device__
184   #define PETSC_KERNEL_DECL    __global__
185   #define PETSC_SHAREDMEM_DECL __shared__
186   #define PETSC_FORCEINLINE    __forceinline__
187   #define PETSC_CONSTMEM_DECL  __constant__
188 #endif
189 
190 #if !defined(PETSC_HOST_DECL) // use HOST_DECL as canary
191   #define PETSC_HOST_DECL
192   #define PETSC_DEVICE_DECL
193   #define PETSC_KERNEL_DECL
194   #define PETSC_SHAREDMEM_DECL
195   #define PETSC_FORCEINLINE inline
196   #define PETSC_CONSTMEM_DECL
197 #endif
198 
199 #if !defined(PETSC_DEVICE_DEFINED_DECLS_PRIVATE)
200   #define PETSC_DEVICE_DEFINED_DECLS_PRIVATE
201   #define PETSC_HOSTDEVICE_DECL        PETSC_HOST_DECL PETSC_DEVICE_DECL
202   #define PETSC_DEVICE_INLINE_DECL     PETSC_DEVICE_DECL PETSC_FORCEINLINE
203   #define PETSC_HOSTDEVICE_INLINE_DECL PETSC_HOSTDEVICE_DECL PETSC_FORCEINLINE
204 #endif
205