xref: /petsc/src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h (revision d71ae5a4db6382e7f06317b8d368875286fe9008)
1 #if !defined(CUSPARSEMATIMPL)
2   #define CUSPARSEMATIMPL
3 
4   #include <petscpkg_version.h>
5   #include <petsc/private/cudavecimpl.h>
6   #include <petscaijdevice.h>
7 
8   #include <cusparse_v2.h>
9 
10   #include <algorithm>
11   #include <vector>
12 
13   #include <thrust/device_vector.h>
14   #include <thrust/device_ptr.h>
15   #include <thrust/device_malloc_allocator.h>
16   #include <thrust/transform.h>
17   #include <thrust/functional.h>
18   #include <thrust/sequence.h>
19   #include <thrust/system/system_error.h>
20 
21   #define PetscCallThrust(body) \
22     do { \
23       try { \
24         body; \
25       } catch (thrust::system_error & e) { \
26         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in Thrust %s", e.what()); \
27       } \
28     } while (0)
29 
30   #if defined(PETSC_USE_COMPLEX)
31     #if defined(PETSC_USE_REAL_SINGLE)
32 const cuComplex PETSC_CUSPARSE_ONE  = {1.0f, 0.0f};
33 const cuComplex PETSC_CUSPARSE_ZERO = {0.0f, 0.0f};
34       #define cusparseXcsrilu02_bufferSize(a, b, c, d, e, f, g, h, i)  cusparseCcsrilu02_bufferSize(a, b, c, d, (cuComplex *)e, f, g, h, i)
35       #define cusparseXcsrilu02_analysis(a, b, c, d, e, f, g, h, i, j) cusparseCcsrilu02_analysis(a, b, c, d, (cuComplex *)e, f, g, h, i, j)
36       #define cusparseXcsrilu02(a, b, c, d, e, f, g, h, i, j)          cusparseCcsrilu02(a, b, c, d, (cuComplex *)e, f, g, h, i, j)
37       #define cusparseXcsric02_bufferSize(a, b, c, d, e, f, g, h, i)   cusparseCcsric02_bufferSize(a, b, c, d, (cuComplex *)e, f, g, h, i)
38       #define cusparseXcsric02_analysis(a, b, c, d, e, f, g, h, i, j)  cusparseCcsric02_analysis(a, b, c, d, (cuComplex *)e, f, g, h, i, j)
39       #define cusparseXcsric02(a, b, c, d, e, f, g, h, i, j)           cusparseCcsric02(a, b, c, d, (cuComplex *)e, f, g, h, i, j)
40     #elif defined(PETSC_USE_REAL_DOUBLE)
41 const cuDoubleComplex PETSC_CUSPARSE_ONE  = {1.0, 0.0};
42 const cuDoubleComplex PETSC_CUSPARSE_ZERO = {0.0, 0.0};
43       #define cusparseXcsrilu02_bufferSize(a, b, c, d, e, f, g, h, i)  cusparseZcsrilu02_bufferSize(a, b, c, d, (cuDoubleComplex *)e, f, g, h, i)
44       #define cusparseXcsrilu02_analysis(a, b, c, d, e, f, g, h, i, j) cusparseZcsrilu02_analysis(a, b, c, d, (cuDoubleComplex *)e, f, g, h, i, j)
45       #define cusparseXcsrilu02(a, b, c, d, e, f, g, h, i, j)          cusparseZcsrilu02(a, b, c, d, (cuDoubleComplex *)e, f, g, h, i, j)
46       #define cusparseXcsric02_bufferSize(a, b, c, d, e, f, g, h, i)   cusparseZcsric02_bufferSize(a, b, c, d, (cuDoubleComplex *)e, f, g, h, i)
47       #define cusparseXcsric02_analysis(a, b, c, d, e, f, g, h, i, j)  cusparseZcsric02_analysis(a, b, c, d, (cuDoubleComplex *)e, f, g, h, i, j)
48       #define cusparseXcsric02(a, b, c, d, e, f, g, h, i, j)           cusparseZcsric02(a, b, c, d, (cuDoubleComplex *)e, f, g, h, i, j)
49     #endif
50   #else
51 const PetscScalar PETSC_CUSPARSE_ONE  = 1.0;
52 const PetscScalar PETSC_CUSPARSE_ZERO = 0.0;
53     #if defined(PETSC_USE_REAL_SINGLE)
54       #define cusparseXcsrilu02_bufferSize cusparseScsrilu02_bufferSize
55       #define cusparseXcsrilu02_analysis   cusparseScsrilu02_analysis
56       #define cusparseXcsrilu02            cusparseScsrilu02
57       #define cusparseXcsric02_bufferSize  cusparseScsric02_bufferSize
58       #define cusparseXcsric02_analysis    cusparseScsric02_analysis
59       #define cusparseXcsric02             cusparseScsric02
60     #elif defined(PETSC_USE_REAL_DOUBLE)
61       #define cusparseXcsrilu02_bufferSize cusparseDcsrilu02_bufferSize
62       #define cusparseXcsrilu02_analysis   cusparseDcsrilu02_analysis
63       #define cusparseXcsrilu02            cusparseDcsrilu02
64       #define cusparseXcsric02_bufferSize  cusparseDcsric02_bufferSize
65       #define cusparseXcsric02_analysis    cusparseDcsric02_analysis
66       #define cusparseXcsric02             cusparseDcsric02
67     #endif
68   #endif
69 
70   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
71     #define csrsvInfo_t              csrsv2Info_t
72     #define cusparseCreateCsrsvInfo  cusparseCreateCsrsv2Info
73     #define cusparseDestroyCsrsvInfo cusparseDestroyCsrsv2Info
74     #if defined(PETSC_USE_COMPLEX)
75       #if defined(PETSC_USE_REAL_SINGLE)
76         #define cusparseXcsrsv_buffsize(a, b, c, d, e, f, g, h, i, j)          cusparseCcsrsv2_bufferSize(a, b, c, d, e, (cuComplex *)(f), g, h, i, j)
77         #define cusparseXcsrsv_analysis(a, b, c, d, e, f, g, h, i, j, k)       cusparseCcsrsv2_analysis(a, b, c, d, e, (const cuComplex *)(f), g, h, i, j, k)
78         #define cusparseXcsrsv_solve(a, b, c, d, e, f, g, h, i, j, k, l, m, n) cusparseCcsrsv2_solve(a, b, c, d, (const cuComplex *)(e), f, (const cuComplex *)(g), h, i, j, (const cuComplex *)(k), (cuComplex *)(l), m, n)
79       #elif defined(PETSC_USE_REAL_DOUBLE)
80         #define cusparseXcsrsv_buffsize(a, b, c, d, e, f, g, h, i, j)          cusparseZcsrsv2_bufferSize(a, b, c, d, e, (cuDoubleComplex *)(f), g, h, i, j)
81         #define cusparseXcsrsv_analysis(a, b, c, d, e, f, g, h, i, j, k)       cusparseZcsrsv2_analysis(a, b, c, d, e, (const cuDoubleComplex *)(f), g, h, i, j, k)
82         #define cusparseXcsrsv_solve(a, b, c, d, e, f, g, h, i, j, k, l, m, n) cusparseZcsrsv2_solve(a, b, c, d, (const cuDoubleComplex *)(e), f, (const cuDoubleComplex *)(g), h, i, j, (const cuDoubleComplex *)(k), (cuDoubleComplex *)(l), m, n)
83       #endif
84     #else /* not complex */
85       #if defined(PETSC_USE_REAL_SINGLE)
86         #define cusparseXcsrsv_buffsize cusparseScsrsv2_bufferSize
87         #define cusparseXcsrsv_analysis cusparseScsrsv2_analysis
88         #define cusparseXcsrsv_solve    cusparseScsrsv2_solve
89       #elif defined(PETSC_USE_REAL_DOUBLE)
90         #define cusparseXcsrsv_buffsize cusparseDcsrsv2_bufferSize
91         #define cusparseXcsrsv_analysis cusparseDcsrsv2_analysis
92         #define cusparseXcsrsv_solve    cusparseDcsrsv2_solve
93       #endif
94     #endif
95   #else
96     #define csrsvInfo_t              cusparseSolveAnalysisInfo_t
97     #define cusparseCreateCsrsvInfo  cusparseCreateSolveAnalysisInfo
98     #define cusparseDestroyCsrsvInfo cusparseDestroySolveAnalysisInfo
99     #if defined(PETSC_USE_COMPLEX)
100       #if defined(PETSC_USE_REAL_SINGLE)
101         #define cusparseXcsrsv_solve(a, b, c, d, e, f, g, h, i, j, k) cusparseCcsrsv_solve((a), (b), (c), (cuComplex *)(d), (e), (cuComplex *)(f), (g), (h), (i), (cuComplex *)(j), (cuComplex *)(k))
102         #define cusparseXcsrsv_analysis(a, b, c, d, e, f, g, h, i)    cusparseCcsrsv_analysis((a), (b), (c), (d), (e), (cuComplex *)(f), (g), (h), (i))
103       #elif defined(PETSC_USE_REAL_DOUBLE)
104         #define cusparseXcsrsv_solve(a, b, c, d, e, f, g, h, i, j, k) cusparseZcsrsv_solve((a), (b), (c), (cuDoubleComplex *)(d), (e), (cuDoubleComplex *)(f), (g), (h), (i), (cuDoubleComplex *)(j), (cuDoubleComplex *)(k))
105         #define cusparseXcsrsv_analysis(a, b, c, d, e, f, g, h, i)    cusparseZcsrsv_analysis((a), (b), (c), (d), (e), (cuDoubleComplex *)(f), (g), (h), (i))
106       #endif
107     #else /* not complex */
108       #if defined(PETSC_USE_REAL_SINGLE)
109         #define cusparseXcsrsv_solve    cusparseScsrsv_solve
110         #define cusparseXcsrsv_analysis cusparseScsrsv_analysis
111       #elif defined(PETSC_USE_REAL_DOUBLE)
112         #define cusparseXcsrsv_solve    cusparseDcsrsv_solve
113         #define cusparseXcsrsv_analysis cusparseDcsrsv_analysis
114       #endif
115     #endif
116   #endif
117 
118   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
119     #define cusparse_csr2csc cusparseCsr2cscEx2
120     #if defined(PETSC_USE_COMPLEX)
121       #if defined(PETSC_USE_REAL_SINGLE)
122         #define cusparse_scalartype                                                             CUDA_C_32F
123         #define cusparse_csr_spgeam(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) cusparseCcsrgeam2(a, b, c, (cuComplex *)d, e, f, (cuComplex *)g, h, i, (cuComplex *)j, k, l, (cuComplex *)m, n, o, p, (cuComplex *)q, r, s, t)
124         #define cusparse_csr_spgeam_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) \
125           cusparseCcsrgeam2_bufferSizeExt(a, b, c, (cuComplex *)d, e, f, (cuComplex *)g, h, i, (cuComplex *)j, k, l, (cuComplex *)m, n, o, p, (cuComplex *)q, r, s, t)
126       #elif defined(PETSC_USE_REAL_DOUBLE)
127         #define cusparse_scalartype CUDA_C_64F
128         #define cusparse_csr_spgeam(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) \
129           cusparseZcsrgeam2(a, b, c, (cuDoubleComplex *)d, e, f, (cuDoubleComplex *)g, h, i, (cuDoubleComplex *)j, k, l, (cuDoubleComplex *)m, n, o, p, (cuDoubleComplex *)q, r, s, t)
130         #define cusparse_csr_spgeam_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) \
131           cusparseZcsrgeam2_bufferSizeExt(a, b, c, (cuDoubleComplex *)d, e, f, (cuDoubleComplex *)g, h, i, (cuDoubleComplex *)j, k, l, (cuDoubleComplex *)m, n, o, p, (cuDoubleComplex *)q, r, s, t)
132       #endif
133     #else /* not complex */
134       #if defined(PETSC_USE_REAL_SINGLE)
135         #define cusparse_scalartype            CUDA_R_32F
136         #define cusparse_csr_spgeam            cusparseScsrgeam2
137         #define cusparse_csr_spgeam_bufferSize cusparseScsrgeam2_bufferSizeExt
138       #elif defined(PETSC_USE_REAL_DOUBLE)
139         #define cusparse_scalartype            CUDA_R_64F
140         #define cusparse_csr_spgeam            cusparseDcsrgeam2
141         #define cusparse_csr_spgeam_bufferSize cusparseDcsrgeam2_bufferSizeExt
142       #endif
143     #endif
144   #else
145     #if defined(PETSC_USE_COMPLEX)
146       #if defined(PETSC_USE_REAL_SINGLE)
147         #define cusparse_csr_spmv(a, b, c, d, e, f, g, h, i, j, k, l, m)                        cusparseCcsrmv((a), (b), (c), (d), (e), (cuComplex *)(f), (g), (cuComplex *)(h), (i), (j), (cuComplex *)(k), (cuComplex *)(l), (cuComplex *)(m))
148         #define cusparse_csr_spmm(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p)               cusparseCcsrmm((a), (b), (c), (d), (e), (f), (cuComplex *)(g), (h), (cuComplex *)(i), (j), (k), (cuComplex *)(l), (m), (cuComplex *)(n), (cuComplex *)(o), (p))
149         #define cusparse_csr2csc(a, b, c, d, e, f, g, h, i, j, k, l)                            cusparseCcsr2csc((a), (b), (c), (d), (cuComplex *)(e), (f), (g), (cuComplex *)(h), (i), (j), (k), (l))
150         #define cusparse_hyb_spmv(a, b, c, d, e, f, g, h)                                       cusparseChybmv((a), (b), (cuComplex *)(c), (d), (e), (cuComplex *)(f), (cuComplex *)(g), (cuComplex *)(h))
151         #define cusparse_csr2hyb(a, b, c, d, e, f, g, h, i, j)                                  cusparseCcsr2hyb((a), (b), (c), (d), (cuComplex *)(e), (f), (g), (h), (i), (j))
152         #define cusparse_hyb2csr(a, b, c, d, e, f)                                              cusparseChyb2csr((a), (b), (c), (cuComplex *)(d), (e), (f))
153         #define cusparse_csr_spgemm(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) cusparseCcsrgemm(a, b, c, d, e, f, g, h, (cuComplex *)i, j, k, l, m, (cuComplex *)n, o, p, q, (cuComplex *)r, s, t)
154         #define cusparse_csr_spgeam(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s)    cusparseCcsrgeam(a, b, c, (cuComplex *)d, e, f, (cuComplex *)g, h, i, (cuComplex *)j, k, l, (cuComplex *)m, n, o, p, (cuComplex *)q, r, s)
155       #elif defined(PETSC_USE_REAL_DOUBLE)
156         #define cusparse_csr_spmv(a, b, c, d, e, f, g, h, i, j, k, l, m) cusparseZcsrmv((a), (b), (c), (d), (e), (cuDoubleComplex *)(f), (g), (cuDoubleComplex *)(h), (i), (j), (cuDoubleComplex *)(k), (cuDoubleComplex *)(l), (cuDoubleComplex *)(m))
157         #define cusparse_csr_spmm(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
158           cusparseZcsrmm((a), (b), (c), (d), (e), (f), (cuDoubleComplex *)(g), (h), (cuDoubleComplex *)(i), (j), (k), (cuDoubleComplex *)(l), (m), (cuDoubleComplex *)(n), (cuDoubleComplex *)(o), (p))
159         #define cusparse_csr2csc(a, b, c, d, e, f, g, h, i, j, k, l)                            cusparseZcsr2csc((a), (b), (c), (d), (cuDoubleComplex *)(e), (f), (g), (cuDoubleComplex *)(h), (i), (j), (k), (l))
160         #define cusparse_hyb_spmv(a, b, c, d, e, f, g, h)                                       cusparseZhybmv((a), (b), (cuDoubleComplex *)(c), (d), (e), (cuDoubleComplex *)(f), (cuDoubleComplex *)(g), (cuDoubleComplex *)(h))
161         #define cusparse_csr2hyb(a, b, c, d, e, f, g, h, i, j)                                  cusparseZcsr2hyb((a), (b), (c), (d), (cuDoubleComplex *)(e), (f), (g), (h), (i), (j))
162         #define cusparse_hyb2csr(a, b, c, d, e, f)                                              cusparseZhyb2csr((a), (b), (c), (cuDoubleComplex *)(d), (e), (f))
163         #define cusparse_csr_spgemm(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) cusparseZcsrgemm(a, b, c, d, e, f, g, h, (cuDoubleComplex *)i, j, k, l, m, (cuDoubleComplex *)n, o, p, q, (cuDoubleComplex *)r, s, t)
164         #define cusparse_csr_spgeam(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s) \
165           cusparseZcsrgeam(a, b, c, (cuDoubleComplex *)d, e, f, (cuDoubleComplex *)g, h, i, (cuDoubleComplex *)j, k, l, (cuDoubleComplex *)m, n, o, p, (cuDoubleComplex *)q, r, s)
166       #endif
167     #else
168       #if defined(PETSC_USE_REAL_SINGLE)
169         #define cusparse_csr_spmv   cusparseScsrmv
170         #define cusparse_csr_spmm   cusparseScsrmm
171         #define cusparse_csr2csc    cusparseScsr2csc
172         #define cusparse_hyb_spmv   cusparseShybmv
173         #define cusparse_csr2hyb    cusparseScsr2hyb
174         #define cusparse_hyb2csr    cusparseShyb2csr
175         #define cusparse_csr_spgemm cusparseScsrgemm
176         #define cusparse_csr_spgeam cusparseScsrgeam
177       #elif defined(PETSC_USE_REAL_DOUBLE)
178         #define cusparse_csr_spmv   cusparseDcsrmv
179         #define cusparse_csr_spmm   cusparseDcsrmm
180         #define cusparse_csr2csc    cusparseDcsr2csc
181         #define cusparse_hyb_spmv   cusparseDhybmv
182         #define cusparse_csr2hyb    cusparseDcsr2hyb
183         #define cusparse_hyb2csr    cusparseDhyb2csr
184         #define cusparse_csr_spgemm cusparseDcsrgemm
185         #define cusparse_csr_spgeam cusparseDcsrgeam
186       #endif
187     #endif
188   #endif
189 
190   #define THRUSTINTARRAY32 thrust::device_vector<int>
191   #define THRUSTINTARRAY   thrust::device_vector<PetscInt>
192   #define THRUSTARRAY      thrust::device_vector<PetscScalar>
193 
194 /* A CSR matrix structure */
195 struct CsrMatrix {
196   PetscInt          num_rows;
197   PetscInt          num_cols;
198   PetscInt          num_entries;
199   THRUSTINTARRAY32 *row_offsets;
200   THRUSTINTARRAY32 *column_indices;
201   THRUSTARRAY      *values;
202 };
203 
204 /* This is struct holding the relevant data needed to a MatSolve */
205 struct Mat_SeqAIJCUSPARSETriFactorStruct {
206   /* Data needed for triangular solve */
207   cusparseMatDescr_t    descr;
208   cusparseOperation_t   solveOp;
209   CsrMatrix            *csrMat;
210   csrsvInfo_t           solveInfo;
211   cusparseSolvePolicy_t solvePolicy; /* whether level information is generated and used */
212   int                   solveBufferSize;
213   void                 *solveBuffer;
214   size_t                csr2cscBufferSize; /* to transpose the triangular factor (only used for CUDA >= 11.0) */
215   void                 *csr2cscBuffer;
216   PetscScalar          *AA_h; /* managed host buffer for moving values to the GPU */
217 };
218 
219 /* This is a larger struct holding all the triangular factors for a solve, transpose solve, and any indices used in a reordering */
220 struct Mat_SeqAIJCUSPARSETriFactors {
221   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorPtr;          /* pointer for lower triangular (factored matrix) on GPU */
222   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorPtr;          /* pointer for upper triangular (factored matrix) on GPU */
223   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorPtrTranspose; /* pointer for lower triangular (factored matrix) on GPU for the transpose (useful for BiCG) */
224   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorPtrTranspose; /* pointer for upper triangular (factored matrix) on GPU for the transpose (useful for BiCG)*/
225   THRUSTINTARRAY                    *rpermIndices;            /* indices used for any reordering */
226   THRUSTINTARRAY                    *cpermIndices;            /* indices used for any reordering */
227   THRUSTARRAY                       *workVector;
228   cusparseHandle_t                   handle;   /* a handle to the cusparse library */
229   PetscInt                           nnz;      /* number of nonzeros ... need this for accurate logging between ICC and ILU */
230   PetscScalar                       *a_band_d; /* GPU data for banded CSR LU factorization matrix diag(L)=1 */
231   int                               *i_band_d; /* this could be optimized away */
232   cudaDeviceProp                     dev_prop;
233   PetscBool                          init_dev_prop;
234 
235   /* csrilu0/csric0 appeared in cusparse-8.0, but we use it along with cusparseSpSV,
236      which first appeared in cusparse-11.5 with cuda-11.3.
237   */
238   PetscBool factorizeOnDevice; /* Do factorization on device or not */
239   #if CUSPARSE_VERSION >= 11500
240   PetscScalar *csrVal;
241   int         *csrRowPtr, *csrColIdx; /* a,i,j of M. Using int since some cusparse APIs only support 32-bit indices */
242 
243   /* Mixed mat descriptor types? yes, different cusparse APIs use different types */
244   cusparseMatDescr_t   matDescr_M;
245   cusparseSpMatDescr_t spMatDescr_L, spMatDescr_U;
246   cusparseSpSVDescr_t  spsvDescr_L, spsvDescr_Lt, spsvDescr_U, spsvDescr_Ut;
247 
248   cusparseDnVecDescr_t dnVecDescr_X, dnVecDescr_Y;
249   PetscScalar         *X, *Y; /* data array of dnVec X and Y */
250 
251   /* Mixed size types? yes, CUDA-11.7.0 declared cusparseDcsrilu02_bufferSizeExt() that returns size_t but did not implement it! */
252   int    factBufferSize_M; /* M ~= LU or LLt */
253   size_t spsvBufferSize_L, spsvBufferSize_Lt, spsvBufferSize_U, spsvBufferSize_Ut;
254   /* cusparse needs various buffers for factorization and solve of L, U, Lt, or Ut.
255      So save memory, we share the factorization buffer with one of spsvBuffer_L/U.
256   */
257   void *factBuffer_M, *spsvBuffer_L, *spsvBuffer_U, *spsvBuffer_Lt, *spsvBuffer_Ut;
258 
259   csrilu02Info_t        ilu0Info_M;
260   csric02Info_t         ic0Info_M;
261   int                   structural_zero, numerical_zero;
262   cusparseSolvePolicy_t policy_M;
263 
264   /* In MatSolveTranspose() for ILU0, we use the two flags to do on-demand solve */
265   PetscBool createdTransposeSpSVDescr;    /* Have we created SpSV descriptors for Lt, Ut? */
266   PetscBool updatedTransposeSpSVAnalysis; /* Have we updated SpSV analysis with the latest L, U values? */
267 
268   PetscLogDouble numericFactFlops; /* Estimated FLOPs in ILU0/ICC0 numeric factorization */
269   #endif
270 };
271 
272 struct Mat_CusparseSpMV {
273   PetscBool initialized;    /* Don't rely on spmvBuffer != NULL to test if the struct is initialized, */
274   size_t    spmvBufferSize; /* since I'm not sure if smvBuffer can be NULL even after cusparseSpMV_bufferSize() */
275   void     *spmvBuffer;
276   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)    /* these are present from CUDA 10.1, but PETSc code makes use of them from CUDA 11 on */
277   cusparseDnVecDescr_t vecXDescr, vecYDescr; /* descriptor for the dense vectors in y=op(A)x */
278   #endif
279 };
280 
281 /* This is struct holding the relevant data needed to a MatMult */
282 struct Mat_SeqAIJCUSPARSEMultStruct {
283   void              *mat;          /* opaque pointer to a matrix. This could be either a cusparseHybMat_t or a CsrMatrix */
284   cusparseMatDescr_t descr;        /* Data needed to describe the matrix for a multiply */
285   THRUSTINTARRAY    *cprowIndices; /* compressed row indices used in the parallel SpMV */
286   PetscScalar       *alpha_one;    /* pointer to a device "scalar" storing the alpha parameter in the SpMV */
287   PetscScalar       *beta_zero;    /* pointer to a device "scalar" storing the beta parameter in the SpMV as zero*/
288   PetscScalar       *beta_one;     /* pointer to a device "scalar" storing the beta parameter in the SpMV as one */
289   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
290   cusparseSpMatDescr_t matDescr;  /* descriptor for the matrix, used by SpMV and SpMM */
291   Mat_CusparseSpMV     cuSpMV[3]; /* different Mat_CusparseSpMV structs for non-transpose, transpose, conj-transpose */
292   Mat_SeqAIJCUSPARSEMultStruct() : matDescr(NULL)
293   {
294     for (int i = 0; i < 3; i++) cuSpMV[i].initialized = PETSC_FALSE;
295   }
296   #endif
297 };
298 
299 /* This is a larger struct holding all the matrices for a SpMV, and SpMV Transpose */
300 struct Mat_SeqAIJCUSPARSE {
301   Mat_SeqAIJCUSPARSEMultStruct *mat;            /* pointer to the matrix on the GPU */
302   Mat_SeqAIJCUSPARSEMultStruct *matTranspose;   /* pointer to the matrix on the GPU (for the transpose ... useful for BiCG) */
303   THRUSTARRAY                  *workVector;     /* pointer to a workvector to which we can copy the relevant indices of a vector we want to multiply */
304   THRUSTINTARRAY32             *rowoffsets_gpu; /* rowoffsets on GPU in non-compressed-row format. It is used to convert CSR to CSC */
305   PetscInt                      nrows;          /* number of rows of the matrix seen by GPU */
306   MatCUSPARSEStorageFormat      format;         /* the storage format for the matrix on the device */
307   PetscBool                     use_cpu_solve;  /* Use AIJ_Seq (I)LU solve */
308   cudaStream_t                  stream;         /* a stream for the parallel SpMV ... this is not owned and should not be deleted */
309   cusparseHandle_t              handle;         /* a handle to the cusparse library ... this may not be owned (if we're working in parallel i.e. multiGPUs) */
310   PetscObjectState              nonzerostate;   /* track nonzero state to possibly recreate the GPU matrix */
311   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
312   size_t               csr2cscBufferSize; /* stuff used to compute the matTranspose above */
313   void                *csr2cscBuffer;     /* This is used as a C struct and is calloc'ed by PetscNewLog() */
314   cusparseCsr2CscAlg_t csr2cscAlg;        /* algorithms can be selected from command line options */
315   cusparseSpMVAlg_t    spmvAlg;
316   cusparseSpMMAlg_t    spmmAlg;
317   #endif
318   THRUSTINTARRAY            *csr2csc_i;
319   PetscSplitCSRDataStructure deviceMat; /* Matrix on device for, eg, assembly */
320 
321   /* Stuff for basic COO support */
322   THRUSTINTARRAY *cooPerm;   /* permutation array that sorts the input coo entris by row and col */
323   THRUSTINTARRAY *cooPerm_a; /* ordered array that indicate i-th nonzero (after sorting) is the j-th unique nonzero */
324 
325   /* Stuff for extended COO support */
326   PetscBool   use_extended_coo; /* Use extended COO format */
327   PetscCount *jmap_d;           /* perm[disp+jmap[i]..disp+jmap[i+1]) gives indices of entries in v[] associated with i-th nonzero of the matrix */
328   PetscCount *perm_d;
329 
330   Mat_SeqAIJCUSPARSE() : use_extended_coo(PETSC_FALSE), perm_d(NULL), jmap_d(NULL) { }
331 };
332 
333 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
334 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat, PetscCount, PetscInt[], PetscInt[]);
335 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat, const PetscScalar[], InsertMode);
336 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat, Mat, MatReuse, Mat *);
337 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *);
338 
339 static inline bool isCudaMem(const void *data)
340 {
341   cudaError_t                  cerr;
342   struct cudaPointerAttributes attr;
343   enum cudaMemoryType          mtype;
344   cerr = cudaPointerGetAttributes(&attr, data); /* Do not check error since before CUDA 11.0, passing a host pointer returns cudaErrorInvalidValue */
345   cudaGetLastError();                           /* Reset the last error */
346   #if (CUDART_VERSION < 10000)
347   mtype = attr.memoryType;
348   #else
349   mtype = attr.type;
350   #endif
351   if (cerr == cudaSuccess && mtype == cudaMemoryTypeDevice) return true;
352   else return false;
353 }
354 
355 #endif
356