xref: /petsc/include/petscdevice_cuda.h (revision 6dd63270497ad23dcf16ae500a87ff2b2a0b7474)
1 #pragma once
2 
3 #include <petscdevice.h>
4 #include <petscpkg_version.h>
5 
6 #if defined(__NVCC__) || defined(__CUDACC__)
7   #define PETSC_USING_NVCC 1
8 #endif
9 
10 #if PetscDefined(HAVE_CUDA)
11 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
12   #include <cuda.h>
13   #include <cuda_runtime.h>
14   #include <cublas_v2.h>
15   #include <cusolverDn.h>
16   #include <cusolverSp.h>
17   #include <cufft.h>
18 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
19 
20 /* cuBLAS does not have cublasGetErrorName(). We create one on our own. */
21 PETSC_EXTERN const char *PetscCUBLASGetErrorName(cublasStatus_t); /* PETSC_EXTERN since it is exposed by the CHKERRCUBLAS macro */
22 PETSC_EXTERN const char *PetscCUSolverGetErrorName(cusolverStatus_t);
23 PETSC_EXTERN const char *PetscCUFFTGetErrorName(cufftResult);
24 
25   /* REMOVE ME */
26   #define WaitForCUDA() cudaDeviceSynchronize()
27 
28   /* CUDART_VERSION = 1000 x major + 10 x minor version */
29 
30   /* Could not find exactly which CUDART_VERSION introduced cudaGetErrorName. At least it was in CUDA 8.0 (Sep. 2016) */
31   #if PETSC_PKG_CUDA_VERSION_GE(8, 0, 0)
32     #define PetscCallCUDAVoid(...) \
33       do { \
34         const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
35         PetscCheckAbort(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d (%s) : %s", (PetscErrorCode)_p_cuda_err__, cudaGetErrorName(_p_cuda_err__), cudaGetErrorString(_p_cuda_err__)); \
36       } while (0)
37 
38     #define PetscCallCUDA(...) \
39       do { \
40         const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
41         PetscCheck(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d (%s) : %s", (PetscErrorCode)_p_cuda_err__, cudaGetErrorName(_p_cuda_err__), cudaGetErrorString(_p_cuda_err__)); \
42       } while (0)
43   #else /* PETSC_PKG_CUDA_VERSION_GE(8,0,0) */
44     #define PetscCallCUDA(...) \
45       do { \
46         const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
47         PetscCheck(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d", (PetscErrorCode)_p_cuda_err__); \
48       } while (0)
49 
50     #define PetscCallCUDAVoid(...) \
51       do { \
52         const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
53         PetscCheckAbort(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d", (PetscErrorCode)_p_cuda_err__); \
54       } while (0)
55   #endif /* PETSC_PKG_CUDA_VERSION_GE(8,0,0) */
56   #define CHKERRCUDA(...) PetscCallCUDA(__VA_ARGS__)
57 
58   #define PetscCUDACheckLaunch \
59     do { \
60       /* Check synchronous errors, i.e. pre-launch */ \
61       PetscCallCUDA(cudaGetLastError()); \
62       /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
63       PetscCallCUDA(cudaDeviceSynchronize()); \
64     } while (0)
65 
66   #define PetscCallCUBLAS(...) \
67     do { \
68       const cublasStatus_t _p_cublas_stat__ = __VA_ARGS__; \
69       if (PetscUnlikely(_p_cublas_stat__ != CUBLAS_STATUS_SUCCESS)) { \
70         const char *name = PetscCUBLASGetErrorName(_p_cublas_stat__); \
71         if (((_p_cublas_stat__ == CUBLAS_STATUS_NOT_INITIALIZED) || (_p_cublas_stat__ == CUBLAS_STATUS_ALLOC_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
72           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
73                   "cuBLAS error %d (%s). " \
74                   "Reports not initialized or alloc failed; " \
75                   "this indicates the GPU may have run out resources", \
76                   (PetscErrorCode)_p_cublas_stat__, name); \
77         } else { \
78           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuBLAS error %d (%s)", (PetscErrorCode)_p_cublas_stat__, name); \
79         } \
80       } \
81     } while (0)
82   #define CHKERRCUBLAS(...) PetscCallCUBLAS(__VA_ARGS__)
83 
84   #if (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) /* According to cuda/10.1.168 on OLCF Summit */
85     #define PetscCallCUSPARSE(...) \
86       do { \
87         const cusparseStatus_t _p_cusparse_stat__ = __VA_ARGS__; \
88         if (PetscUnlikely(_p_cusparse_stat__)) { \
89           const char *name  = cusparseGetErrorName(_p_cusparse_stat__); \
90           const char *descr = cusparseGetErrorString(_p_cusparse_stat__); \
91           PetscCheck((_p_cusparse_stat__ != CUSPARSE_STATUS_NOT_INITIALIZED) && (_p_cusparse_stat__ != CUSPARSE_STATUS_ALLOC_FAILED), PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
92                      "cuSPARSE errorcode %d (%s) : %s.; " \
93                      "this indicates the GPU has run out resources", \
94                      (int)_p_cusparse_stat__, name, descr); \
95           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSPARSE errorcode %d (%s) : %s", (int)_p_cusparse_stat__, name, descr); \
96         } \
97       } while (0)
98   #else /* (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) */
99     #define PetscCallCUSPARSE(...) \
100       do { \
101         const cusparseStatus_t _p_cusparse_stat__ = __VA_ARGS__; \
102         PetscCheck(_p_cusparse_stat__ == CUSPARSE_STATUS_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSPARSE errorcode %d", (PetscErrorCode)_p_cusparse_stat__); \
103       } while (0)
104   #endif /* (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) */
105   #define CHKERRCUSPARSE(...) PetscCallCUSPARSE(__VA_ARGS__)
106 
107   #define PetscCallCUSOLVER(...) \
108     do { \
109       const cusolverStatus_t _p_cusolver_stat__ = __VA_ARGS__; \
110       if (PetscUnlikely(_p_cusolver_stat__ != CUSOLVER_STATUS_SUCCESS)) { \
111         const char *name = PetscCUSolverGetErrorName(_p_cusolver_stat__); \
112         if (((_p_cusolver_stat__ == CUSOLVER_STATUS_NOT_INITIALIZED) || (_p_cusolver_stat__ == CUSOLVER_STATUS_ALLOC_FAILED) || (_p_cusolver_stat__ == CUSOLVER_STATUS_INTERNAL_ERROR)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
113           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
114                   "cuSolver error %d (%s). " \
115                   "This indicates the GPU may have run out resources", \
116                   (PetscErrorCode)_p_cusolver_stat__, name); \
117         } else { \
118           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSolver error %d (%s)", (PetscErrorCode)_p_cusolver_stat__, name); \
119         } \
120       } \
121     } while (0)
122   #define CHKERRCUSOLVER(...) PetscCallCUSOLVER(__VA_ARGS__)
123 
124   #define PetscCallCUFFT(...) \
125     do { \
126       const cufftResult_t _p_cufft_stat__ = __VA_ARGS__; \
127       if (PetscUnlikely(_p_cufft_stat__ != CUFFT_SUCCESS)) { \
128         const char *name = PetscCUFFTGetErrorName(_p_cufft_stat__); \
129         if (((_p_cufft_stat__ == CUFFT_SETUP_FAILED) || (_p_cufft_stat__ == CUFFT_ALLOC_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
130           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
131                   "cuFFT error %d (%s). " \
132                   "Reports not initialized or alloc failed; " \
133                   "this indicates the GPU has run out resources", \
134                   (PetscErrorCode)_p_cufft_stat__, name); \
135         } else { \
136           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuFFT error %d (%s)", (PetscErrorCode)_p_cufft_stat__, name); \
137         } \
138       } \
139     } while (0)
140   #define CHKERRCUFFT(...) PetscCallCUFFT(__VA_ARGS__)
141 
142   #define PetscCallCURAND(...) \
143     do { \
144       const curandStatus_t _p_curand_stat__ = __VA_ARGS__; \
145       if (PetscUnlikely(_p_curand_stat__ != CURAND_STATUS_SUCCESS)) { \
146         if (((_p_curand_stat__ == CURAND_STATUS_INITIALIZATION_FAILED) || (_p_curand_stat__ == CURAND_STATUS_ALLOCATION_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
147           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
148                   "cuRAND error %d. " \
149                   "Reports not initialized or alloc failed; " \
150                   "this indicates the GPU has run out resources", \
151                   (PetscErrorCode)_p_curand_stat__); \
152         } else { \
153           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuRand error %d", (PetscErrorCode)_p_curand_stat__); \
154         } \
155       } \
156     } while (0)
157   #define CHKERRCURAND(...) PetscCallCURAND(__VA_ARGS__)
158 
159 PETSC_EXTERN cudaStream_t   PetscDefaultCudaStream; // The default stream used by PETSc
160 PETSC_EXTERN PetscErrorCode PetscCUBLASGetHandle(cublasHandle_t *);
161 PETSC_EXTERN PetscErrorCode PetscCUSOLVERDnGetHandle(cusolverDnHandle_t *);
162 PETSC_EXTERN PetscErrorCode PetscGetCurrentCUDAStream(cudaStream_t *);
163 
164 #endif // PETSC_HAVE_CUDA
165 
166 // these can also be defined in petscdevice_hip.h so we undef and define them *only* if the
167 // current compiler is NVCC. In this case if petscdevice_hip.h is included first, the macros
168 // would already be defined, but they would be empty since we cannot be using HCC at the same
169 // time.
170 #if PetscDefined(USING_NVCC)
171   #undef PETSC_HOST_DECL
172   #undef PETSC_DEVICE_DECL
173   #undef PETSC_KERNEL_DECL
174   #undef PETSC_SHAREDMEM_DECL
175   #undef PETSC_FORCEINLINE
176   #undef PETSC_CONSTMEM_DECL
177 
178   #define PETSC_HOST_DECL      __host__
179   #define PETSC_DEVICE_DECL    __device__
180   #define PETSC_KERNEL_DECL    __global__
181   #define PETSC_SHAREDMEM_DECL __shared__
182   #define PETSC_FORCEINLINE    __forceinline__
183   #define PETSC_CONSTMEM_DECL  __constant__
184 #endif
185 
186 #ifndef PETSC_HOST_DECL // use HOST_DECL as canary
187   #define PETSC_HOST_DECL
188   #define PETSC_DEVICE_DECL
189   #define PETSC_KERNEL_DECL
190   #define PETSC_SHAREDMEM_DECL
191   #define PETSC_FORCEINLINE inline
192   #define PETSC_CONSTMEM_DECL
193 #endif
194 
195 #ifndef PETSC_DEVICE_DEFINED_DECLS_PRIVATE
196   #define PETSC_DEVICE_DEFINED_DECLS_PRIVATE
197   #define PETSC_HOSTDEVICE_DECL        PETSC_HOST_DECL PETSC_DEVICE_DECL
198   #define PETSC_DEVICE_INLINE_DECL     PETSC_DEVICE_DECL PETSC_FORCEINLINE
199   #define PETSC_HOSTDEVICE_INLINE_DECL PETSC_HOSTDEVICE_DECL PETSC_FORCEINLINE
200 #endif
201