xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision b698fc57f0bea7237255b29c1b77df0acc362ffd)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* String tables used with PetscOptionsEnum(): the enum value names in 0-based order,
   followed by the enum type name, the option prefix, and a null sentinel. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Note the SpMM table lists names in integer-value order (4=CSR_ALG1 precedes 5=COO_ALG4),
     and the csr2csc table inserts a placeholder "INVALID" because cuSPARSE has no enum value 0. */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
/* Forward declarations: symbolic/numeric factorization kernels installed on factor matrices */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves (with and without a reordering applied) */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
/* Matrix-vector products; the *Add variants compute zz = yy + op(A)*xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destructors for the GPU-side data structures hanging off A->spptr */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
85 
/*
   MatCUSPARSESetStream - Records the given CUDA stream in the matrix' GPU data
   and attaches it to the matrix' cuSPARSE handle, so subsequent cuSPARSE calls
   for this matrix run on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
97 
/*
   MatCUSPARSESetHandle - Installs an externally created cuSPARSE handle on the
   matrix, destroying any handle the matrix previously held, and configures the
   handle for device pointer mode.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the previously held handle before adopting the new one */
    if (cusp->handle) { cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat); }
    cusp->handle = handle;
  }
  /* scalar arguments (alpha/beta) for cuSPARSE calls will live in device memory */
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
114 
/*
   MatCUSPARSEClearHandle - Forgets (does not destroy) the cuSPARSE handle stored
   in the matrix. A no-op for matrices that are not SEQAIJCUSPARSE or have no
   GPU data attached.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  /* only drop the reference; ownership of the handle stays with whoever set it */
  if (iscusparse && cusp && cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
127 
/* Reports MATSOLVERCUSPARSE ("cusparse") as the solver package that produced this factor matrix. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
134 
135 /*MC
136   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
137   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
139   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
140   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
141   algorithms are not recommended. This class does NOT support direct solver operations.
142 
143   Level: beginner
144 
145 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
146 M*/
147 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates an empty SEQAIJCUSPARSE factor
   matrix of the same (square) size as A and installs the symbolic-factorization
   routines matching the requested factor type. Unsupported factor types error out.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  const PetscInt m = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,m,m,m,m);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* actual allocation happens during the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
173 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetFormat(). A sequential matrix keeps a single storage format, so
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same field; other operations
   are rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations map to the one stored format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
191 
192 /*@
193    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
194    operation. Only the MatMult operation can use different GPU storage formats
195    for MPIAIJCUSPARSE matrices.
196    Not Collective
197 
198    Input Parameters:
199 +  A - Matrix of type SEQAIJCUSPARSE
200 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
201 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
202 
203    Output Parameter:
204 
205    Level: intermediate
206 
207 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
208 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* forward to the type-specific implementation, if the matrix type provides one */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
218 
219 /*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
221 
222    Collective on mat
223 
224    Input Parameters:
225 +  A - Matrix of type SEQAIJCUSPARSE
226 -  transgen - the boolean flag
227 
228    Level: intermediate
229 
230 .seealso: MATSEQAIJCUSPARSE
231 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusparse;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (iscusparse) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    /* when turning the option off, discard any cached transpose so that re-enabling
       the option later cannot pick up stale data */
    if (!transgen) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
251 
/* Processes the -mat_cusparse_* runtime options for a SEQAIJCUSPARSE matrix.
   Options are only consulted for unfactored matrices (MAT_FACTOR_NONE). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* -mat_cusparse_storage_format is parsed after -mat_cusparse_mult_storage_format,
       so when both are given the MAT_CUSPARSE_ALL setting wins */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    /* default CSR2CSC_ALG1; slot 0 of MatCUSPARSECsr2CscAlgorithms[] is a placeholder since cuSPARSE starts this enum at 1 */
    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
295 
/* ILU symbolic factorization: discard stale GPU factors, run the CPU symbolic phase,
   and install the CUSPARSE numeric-phase callback. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* any GPU triangular-factor data from a previous factorization is now invalid */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* numeric phase uploads the factors to the GPU */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
307 
/* LU symbolic factorization: same pattern as the ILU variant — reset GPU state,
   delegate the symbolic work to the CPU AIJ code, install the GPU numeric phase. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
319 
/* ICC symbolic factorization: reset GPU state, run the CPU symbolic phase,
   and install the CUSPARSE Cholesky numeric-phase callback. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
331 
/* Cholesky symbolic factorization: reset GPU state, run the CPU symbolic phase,
   and install the CUSPARSE Cholesky numeric-phase callback. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
343 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds (first call) or refreshes (later calls)
   the GPU CSR copy of the unit-diagonal lower triangular factor L extracted from the
   host-side ILU factorization held in A.

   First build: assembles L's row offsets / column indices / values in pinned host buffers,
   uploads them into thrust device arrays, and runs the cuSPARSE triangular-solve analysis.
   Update path (same nonzero structure): only the values array is rebuilt and re-uploaded.
   No-op when n == 0 or when the data is already resident on the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries stored for rows 1..n-1
         (row 0 of L contributes only its diagonal 1) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host staging buffers so the device uploads below are fast */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;   /* row 0 holds only the unit diagonal */
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* upload the staged CSR arrays to the device (thrust assign copies host -> device) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is retained as AA_h for fast value-only updates; the index staging buffers are freed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
488 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Builds (first call) or refreshes (later calls)
   the GPU CSR copy of the upper triangular factor U extracted from the host-side ILU
   factorization held in A.

   The host ILU stores U's rows via a->diag (adiag): row i's off-diagonal entries live
   between adiag[i+1]+1 and adiag[i] in a->a/a->j, and v[nz] is the pivot, whose
   reciprocal is stored since the solve uses a NON_UNIT diagonal.
   First build: stages CSR arrays in pinned host memory, uploads them, runs the cuSPARSE
   triangular-solve analysis. Update path (same structure): values only are re-uploaded.
   No-op when n == 0 or when the data is already resident on the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host staging buffers so the device uploads below are fast */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix; rows are walked from last to first,
           filling the CSR arrays back-to-front */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];   /* pivot stored inverted for the NON_UNIT-diagonal solve */
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* upload the staged CSR arrays to the device (thrust assign copies host -> device) */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* AAUp is retained as AA_h for fast value-only updates; the index staging buffers are freed */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
632 
/* Pushes both ILU triangular factors of A to the GPU and caches the row and
   column permutation index vectors on the device (only when the corresponding
   ordering is not the identity).  Marks A as resident on both host and device. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS = a->row, colIS = a->icol;
  PetscBool                    rowIdentity, colIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build/copy the two triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device; skipped for the identity ordering
     (presumably consumed by the permuted MatSolve path -- see the solve routines) */
  ierr = ISIdentity(rowIS,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *indices;

    ierr = ISGetIndices(rowIS,&indices);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(indices, indices+n);
    ierr = ISRestoreIndices(rowIS,&indices);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* cache the column permutation on the device; skipped for the identity ordering */
  ierr = ISIdentity(colIS,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *indices;

    ierr = ISGetIndices(colIS,&indices);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(indices, indices+n);
    ierr = ISRestoreIndices(colIS,&indices);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
676 
/* Builds (first call) or refreshes (subsequent calls) the device-side upper and
   lower triangular factors used by the ICC triangular solves.

   First call (neither factor exists yet): the CSR structure (AiUp/AjUp) and the
   values of BOTH factors are assembled on the host in pinned memory, uploaded to
   the GPU, and the cusparse solve analysis is performed for each factor.
   Later calls: only the numerical values are recomputed and re-uploaded into the
   existing device structures.

   Work is done only when the data currently lives on the CPU
   (PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR row offsets / column indices of the upper factor */
  PetscScalar                       *AAUp;          /* pinned host values of the upper factor */
  PetscScalar                       *AALo;          /* pinned host values of the lower factor (same sparsity pattern as upper) */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (above, for a->nz) and as
     Mat_SeqSBAIJ here (for i/j/a) -- assumes the ICC factor uses an
     SBAIJ-compatible layout; confirm against the factor assembly code */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the factor values; uploaded below */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers; v[nz] (the last stored entry of row i) is treated as the diagonal */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: both factors store 1/diag at the diagonal position */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: upper factor keeps the negated values, the lower
               factor additionally divides by the diagonal entry */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* unit diagonal for the solve; the stored diagonal values (1/diag) are
           presumably applied separately -- see the solve kernels */
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 requires a user-provided work buffer; query its size, then allocate */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* the lower factor is solved as the TRANSPOSE of an UPPER-stored matrix
           (see solveOp below), hence the fill mode remains UPPER here */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: the lower factor shares the upper factor's CSR pattern
           (AiUp/AjUp) but carries its own values (AALo) */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Values-only refresh: the device structures already exist, recompute the
           host values and upload them in place. */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
892 
/* Pushes the ICC triangular factors of A to the GPU and, when the ordering is
   not the identity, caches the permutation and its inverse on the device.
   Marks A as resident on both host and device. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm = a->row;
  PetscBool                    isIdentityPerm;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* nnz of the full (symmetric) factor: stored off-diagonals counted twice plus the diagonal */
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the permutation and its inverse on the device; skipped for the identity.
     NOTE(review): unlike the ILU path there is no !rpermIndices guard here --
     confirm this routine cannot be reached twice with a live permutation */
  ierr = ISIdentity(perm,&isIdentityPerm);CHKERRQ(ierr);
  if (!isIdentityPerm) {
    IS             invPerm;
    const PetscInt *invIdx,*idx;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invPerm);CHKERRQ(ierr);
    ierr = ISGetIndices(invPerm,&invIdx);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&idx);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(idx, idx+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(invIdx, invIdx+n);
    ierr = ISRestoreIndices(invPerm,&invIdx);CHKERRQ(ierr);
    ierr = ISDestroy(&invPerm);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
930 
/* Numeric LU factorization for SeqAIJCUSPARSE: the factorization itself is done
   by the CPU SeqAIJ kernel; this wrapper then selects the GPU solve routines
   (natural-ordering variants when both orderings are the identity) and copies
   the triangular factors to the device. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             rowPerm = b->row, colPerm = b->col;
  PetscBool      rowIdentity, colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factor numerically on the CPU; the result is then only valid there */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(rowPerm,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(colPerm,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
960 
/* Numeric Cholesky (ICC) factorization for SeqAIJCUSPARSE: the factorization is
   done by the CPU SeqAIJ kernel; this wrapper then selects the GPU solve
   routines (natural-ordering variants when the ordering is the identity) and
   copies the triangular factors to the device. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             perm = b->row;
  PetscBool      permIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factor numerically on the CPU; the result is then only valid there */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(perm,&permIdentity);CHKERRQ(ierr);
  if (permIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
989 
/* Builds the transposes (CSC form, stored as CSR) of both triangular factors of
   A and runs the cusparse solve analysis on them, so that MatSolveTranspose can
   use non-transposed triangular solves.  Results are stored in
   loTriFactorPtrTranspose / upTriFactorPtrTranspose of A->spptr.

   Fix: the MAT_CUSPARSEGenerateTranspose event was previously *begun* a second
   time after each csr2csc conversion instead of ended, leaving the log event
   unbalanced; both occurrences now call PetscLogEventEnd. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     transposing flips the fill mode, everything else is inherited */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is materialized, so the solve itself is non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11: csr2csc needs an explicit work buffer; query its size first */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  /* csrsv2 requires a user-provided work buffer; query its size, then allocate */
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1191 
/* Build (and cache) the transpose of A on the GPU for MatMultTranspose()-type
   operations.  A no-op unless transpose generation was requested
   (cusparsestruct->transgen) and no cached transpose exists yet
   (cusparsestruct->matTranspose). */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix; matstructT is re-assigned here -- the initializer at the
     top of the function was only needed for the early-return test above */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  /* the transpose inherits the index base of the original matrix */
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalar constants used by SpMV calls
     (pointer mode below is CUSPARSE_POINTER_MODE_DEVICE) */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* dimensions are swapped for the transpose: rows of A^T = cols of A */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* 32-bit device copy of the host row offsets a->i, as required by csr2csc */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA 11+ csr2csc needs an explicit workspace; query its size and allocate it */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            /* pre-11 API takes (cscColInd, cscRowPtr) in this order */
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
                           );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic-API descriptor for the new SpMV/SpMM interface */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    /* HYB/ELL path (CUDA < 11 only): HYB -> CSR -> CSC -> HYB round trip */
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): unlike the CSR branch above, tempT's dimensions are NOT swapped
       (num_rows stays A->rmap->n) and csr2hyb below is also called with (rmap,cmap);
       this looks wrong for rectangular A -- TODO confirm (square matrices unaffected;
       this whole branch is dead for CUDA >= 11) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    /* ELL requests the max (pure-ELL) partition; HYB lets cusparse choose */
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries: the CSR/CSC scratch copies are no longer needed once
       the HYB transpose has been built */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1350 
/* Solve A^T x = b using explicitly transposed copies of the cached triangular
   factors, applying the stored row/column permutations.
   TODO(question from the original author): why do we need to analyze the transposed
   matrix again?  Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in
   MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (built lazily
     on first transpose solve, then cached in the tri-factors struct) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gathers b[rperm[i]] into x, which
     then serves as the rhs for the triangular solves below */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: the factor order is reversed vs. MatSolve since (LU)^T = U^T L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L: reads the intermediate from tempGPU and writes into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1437 
/* Solve A^T x = b for factors stored in natural ordering: identical to
   MatSolveTranspose_SeqAIJCUSPARSE but with no row/column permutations,
   so b feeds the first solve directly and no thrust gather/scatter is needed. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (lazy, cached) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: factor order reversed vs. the forward solve, (LU)^T = U^T L^T;
     result goes to the scratch work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L: scratch in, final solution out to x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1505 
/* Solve A x = b using the cached lower/upper triangular factors on the GPU,
   applying the stored row permutation to b and the column permutation to
   the result.  Pipeline: b --rperm--> temp, L-solve temp -> x,
   U-solve x -> temp, temp --cperm--> x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b[rperm[i]] into the work
     vector (the iterator range is bounded by rpermIndices, not by the base) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: forward substitution, work vector in, x out */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U: back substitution, x in, work vector out */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: gather temp[cperm[i]] into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1580 
/* Solve A x = b for factors stored in natural ordering: identical to
   MatSolve_SeqAIJCUSPARSE but with no row/column permutations, so b feeds the
   L-solve directly and the U-solve writes straight into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: forward substitution, b in, work vector out */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U: back substitution, work vector in, x out */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1640 
/* Download the matrix values from the GPU into the host CSR storage.
   Only the values array is copied (a->nz scalars); the host sparsity pattern
   is left untouched.  After the copy both copies are current, so the offload
   mask becomes PETSC_OFFLOAD_BOTH.  A no-op unless the GPU currently holds
   the only up-to-date copy. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  {
    CsrMatrix    *csr    = (CsrMatrix*)cusp->mat->mat;
    const size_t nbytes  = aij->nz*sizeof(PetscScalar);

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1661 
/* Expose the host values array of the matrix.
   Ensures the host copy is current first, then hands out a->a.  The caller may
   modify the returned array, so the offload mask is downgraded to
   PETSC_OFFLOAD_CPU (device copy considered stale afterwards). */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* bring the values down from the device if the GPU holds the latest copy */
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1673 
1674 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1675 {
1676   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1677   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1678   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1679   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1680   PetscErrorCode               ierr;
1681   cusparseStatus_t             stat;
1682   cudaError_t                  err;
1683 
1684   PetscFunctionBegin;
1685   if (A->boundtocpu) PetscFunctionReturn(0);
1686   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1687     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1688       /* Copy values only */
1689       CsrMatrix *matrix,*matrixT;
1690       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1691 
1692       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1693       matrix->values->assign(a->a, a->a+a->nz);
1694       err  = WaitForCUDA();CHKERRCUDA(err);
1695       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1696       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1697 
1698       /* Update matT when it was built before */
1699       if (cusparsestruct->matTranspose) {
1700         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1701         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1702         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1703         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1704                             A->cmap->n, matrix->num_entries,
1705                             matrix->values->data().get(),
1706                             cusparsestruct->rowoffsets_gpu->data().get(),
1707                             matrix->column_indices->data().get(),
1708                             matrixT->values->data().get(),
1709                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1710                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1711                             CUSPARSE_ACTION_NUMERIC,indexBase,
1712                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1713                           #else
1714                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1715                             CUSPARSE_ACTION_NUMERIC, indexBase
1716                           #endif
1717                            );CHKERRCUSPARSE(stat);
1718         err  = WaitForCUDA();CHKERRCUDA(err);
1719         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1720       }
1721     } else {
1722       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1723       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1724       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1725       delete cusparsestruct->workVector;
1726       delete cusparsestruct->rowoffsets_gpu;
1727       try {
1728         if (a->compressedrow.use) {
1729           m    = a->compressedrow.nrows;
1730           ii   = a->compressedrow.i;
1731           ridx = a->compressedrow.rindex;
1732         } else {
1733           m    = A->rmap->n;
1734           ii   = a->i;
1735           ridx = NULL;
1736         }
1737         cusparsestruct->nrows = m;
1738 
1739         /* create cusparse matrix */
1740         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1741         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1742         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1743         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1744 
1745         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1746         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1747         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1748         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1749         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1750         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1751         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1752 
1753         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1754         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1755           /* set the matrix */
1756           CsrMatrix *mat= new CsrMatrix;
1757           mat->num_rows = m;
1758           mat->num_cols = A->cmap->n;
1759           mat->num_entries = a->nz;
1760           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1761           mat->row_offsets->assign(ii, ii + m+1);
1762 
1763           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1764           mat->column_indices->assign(a->j, a->j+a->nz);
1765 
1766           mat->values = new THRUSTARRAY(a->nz);
1767           mat->values->assign(a->a, a->a+a->nz);
1768 
1769           /* assign the pointer */
1770           matstruct->mat = mat;
1771          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1772           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1773             stat = cusparseCreateCsr(&matstruct->matDescr,
1774                                     mat->num_rows, mat->num_cols, mat->num_entries,
1775                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1776                                     mat->values->data().get(),
1777                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1778                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1779           }
1780          #endif
1781         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1782          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1783           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1784          #else
1785           CsrMatrix *mat= new CsrMatrix;
1786           mat->num_rows = m;
1787           mat->num_cols = A->cmap->n;
1788           mat->num_entries = a->nz;
1789           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1790           mat->row_offsets->assign(ii, ii + m+1);
1791 
1792           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1793           mat->column_indices->assign(a->j, a->j+a->nz);
1794 
1795           mat->values = new THRUSTARRAY(a->nz);
1796           mat->values->assign(a->a, a->a+a->nz);
1797 
1798           cusparseHybMat_t hybMat;
1799           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1800           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1801             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1802           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1803               matstruct->descr, mat->values->data().get(),
1804               mat->row_offsets->data().get(),
1805               mat->column_indices->data().get(),
1806               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1807           /* assign the pointer */
1808           matstruct->mat = hybMat;
1809 
1810           if (mat) {
1811             if (mat->values) delete (THRUSTARRAY*)mat->values;
1812             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1813             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1814             delete (CsrMatrix*)mat;
1815           }
1816          #endif
1817         }
1818 
1819         /* assign the compressed row indices */
1820         if (a->compressedrow.use) {
1821           cusparsestruct->workVector = new THRUSTARRAY(m);
1822           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1823           matstruct->cprowIndices->assign(ridx,ridx+m);
1824           tmp = m;
1825         } else {
1826           cusparsestruct->workVector = NULL;
1827           matstruct->cprowIndices    = NULL;
1828           tmp = 0;
1829         }
1830         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1831 
1832         /* assign the pointer */
1833         cusparsestruct->mat = matstruct;
1834       } catch(char *ex) {
1835         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1836       }
1837       err  = WaitForCUDA();CHKERRCUDA(err);
1838       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1839       cusparsestruct->nonzerostate = A->nonzerostate;
1840     }
1841     A->offloadmask = PETSC_OFFLOAD_BOTH;
1842   }
1843   PetscFunctionReturn(0);
1844 }
1845 
/* Thrust functor applied to a (source, destination) tuple: accumulate the source
   entry into the destination entry, i.e. dst += src. Used below to scatter-add the
   compressed-row work vector into the full-length result vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1855 
/* Thrust functor applied to a (source, destination) tuple: plain copy, dst = src */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1865 
/* Thrust functor applied to a tuple: copy in the direction opposite to VecCUDAEquals,
   i.e. element 0 receives element 1. Used below to gather x values (element 1, a
   permutation iterator over x) into the compressed-row work vector (element 0). */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1875 
/* Product data stored in C->product->data for C = op(A)*op(B) with A SeqAIJCUSPARSE
   and B dense; created in the symbolic phase, reused by the numeric phase, and
   freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool            cisdense;  /* caller's C was MATSEQDENSE (host); the numeric phase converts the result back */
  PetscScalar          *Bt;       /* device buffer holding B^T (pre-CUDA-11 csrmm cannot transpose B); NULL otherwise */
  Mat                  X;         /* intermediate dense product for the PtAP/RARt paths; NULL otherwise */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;     /* cached cusparse dense-matrix descriptor for B */
  cusparseDnMatDescr_t matCDescr;     /* cached cusparse dense-matrix descriptor for C (or X) */
  size_t               spmmBufferSize;
  void                 *spmmBuffer;   /* workspace required by cusparseSpMM() */
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#endif
};
1889 
/* Destructor for the MatMatCusparse product data; installed as C->product->destroy
   by MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA() */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so no guard is needed */
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
  if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
  if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
  if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); /* X is only created for PtAP/RARt; MatDestroy() accepts NULL */
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1908 
1909 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1910 
/* Numeric phase of C = op(A)*op(B) with A SeqAIJCUSPARSE and B dense, including the
   PtAP/RARt variants that first compute an intermediate dense X = A*op(B) and finish
   with a dense-dense product. Uses cusparseSpMM() (CUDA >= 11, descriptors and
   workspace cached in the MatMatCusparse data) or cusparseXcsrmm() (older CUDA,
   where B must be transposed explicitly). The product data was created by
   MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* Pick the stored matrix (A or its cached transpose), the cusparse op realizing
     op(A), and the dimensions m x n of the spmm output */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP first computes X = A*P here; P^t*X is done after the spmm */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* prefer the explicitly stored transpose over the transpose op */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt first computes X = A*R^t here; R*X is done after the spmm */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt the spmm result goes into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    /* a changed leading dimension invalidates the cached dense descriptor */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                              csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                              csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                              csrmat->values->data().get(),
                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                              CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr; /* NOTE: intentionally shadows the outer cudaError_t cerr */

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* explicitly form B^T in the device buffer mmdata->Bt allocated by the symbolic phase */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) { /* finish with C = R*X = R*(A*R^t) */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) { /* finish with C = P^t*X = P^t*(A*P) */
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore C (and B) to the host types the caller originally passed in */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2080 
/* Symbolic phase for C = op(A)*op(B) with A SeqAIJCUSPARSE and B dense: set the
   sizes and (CUDA) type of C and create the MatMatCusparse product data consumed
   by MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA() */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat_SeqAIJCUSPARSE *cusp;
  MatMatCusparse     *mm;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the product result */
  if (product->type == MATPRODUCT_AB) {
    m = A->rmap->n; n = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    m = A->cmap->n; n = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    m = A->rmap->n; n = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    m = B->cmap->n; n = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    m = B->rmap->n; n = B->rmap->n;
  } else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mm);CHKERRQ(ierr);
  mm->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mm->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mm->X);CHKERRQ(ierr);
    ierr = MatSetType(mm->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mm->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2154 
2155 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2156 
/* handles dense B: install the CUDA symbolic routine for the supported product
   types, falling back to the CPU aij*dense kernels when A is bound to the CPU */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) { /* data lives on the host: use the CPU implementation */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    break; /* explicit break: previously fell through into default (harmless, but trips -Wimplicit-fallthrough) */
  default: /* other product types: leave C->ops->productsymbolic unset */
    break;
  }
  PetscFunctionReturn(0);
}
2182 
/* yy = A*xx (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2191 
/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2200 
/* yy = A^H*xx (conjugate transpose; trans and herm flags both set) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2209 
/* zz = A^H*xx + yy (conjugate transpose with add) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2218 
/* yy = A^T*xx (transpose without conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2227 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   yy may be NULL (no add) or alias zz; handles both the compressed-row (zero rows dropped)
   and full-row storage of A, and both the CUDA >= 11 generic SpMV API and the legacy one. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y in y = op(A) x; set in the CSR branches below */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* A has no nonzero rows: op(A) x = 0, so z = y (or 0) without any spmv */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* Choose the stored matrix and cusparse op: either apply a transpose op to A,
     or (when transgen is set) multiply with the explicitly stored transpose */
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        /* gather x[cprowIndices[i]] into workVector[i] via VecCUDAEqualsReverse */
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* cuSpMV[] is indexed by opA, which is assumed to take the values 0,1,2 */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else { /* ELL/HYB storage, only available before CUDA-11 */
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* z[cprowIndices[i]] += workVector[i] via VecCUDAPlusEquals */
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: spmv already used beta=1 when yy aliases zz; only add a distinct yy */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2411 
/* zz = A^T * xx + yy, forwarded to the shared kernel MatMultAddKernel_SeqAIJCUSPARSE.
   The two trailing flags presumably select transpose=PETSC_TRUE and
   hermitian=PETSC_FALSE — confirm against the kernel's parameter list, which
   is defined earlier in this file. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2420 
/* Finish assembly; if a split-CSR device matrix exists (device-side assembly),
   mark the freshly assembled values as living on the GPU. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *devmat = NULL;

  PetscFunctionBegin;
  /* only non-factored matrices keep a Mat_SeqAIJCUSPARSE in spptr */
  if (A->factortype == MAT_FACTOR_NONE) devmat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* does very little if assembled on GPU */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (devmat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2437 
2438 /* --------------------------------------------------------------------------------*/
2439 /*@
2440    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2442    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2443    assembly performance the user should preallocate the matrix storage by setting
2444    the parameter nz (or the array nnz).  By setting these parameters accurately,
2445    performance during matrix assembly can be increased by more than a factor of 50.
2446 
2447    Collective
2448 
2449    Input Parameters:
2450 +  comm - MPI communicator, set to PETSC_COMM_SELF
2451 .  m - number of rows
2452 .  n - number of columns
2453 .  nz - number of nonzeros per row (same for all rows)
2454 -  nnz - array containing the number of nonzeros in the various rows
2455          (possibly different for each row) or NULL
2456 
2457    Output Parameter:
2458 .  A - the matrix
2459 
2460    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2462    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2463 
2464    Notes:
2465    If nnz is given then nz is ignored
2466 
2467    The AIJ format (also called the Yale sparse matrix format or
2468    compressed row storage), is fully compatible with standard Fortran 77
2469    storage.  That is, the stored row and column indices can begin at
2470    either one (as in Fortran) or zero.  See the users' manual for details.
2471 
2472    Specify the preallocated storage with either nz or nnz (not both).
2473    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2474    allocation.  For large problems you MUST preallocate memory or you
2475    will get TERRIBLE performance, see the users' manual chapter on matrices.
2476 
2477    By default, this format uses inodes (identical nodes) when possible, to
2478    improve numerical efficiency of matrix-vector products and solves. We
2479    search for consecutive rows with the same nonzero structure, thereby
2480    reusing matrix information to achieve increased efficiency.
2481 
2482    Level: intermediate
2483 
2484 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2485 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate: per-row counts in nnz take precedence over the uniform nz (see Notes above) */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2497 
/* Destructor: release the GPU-side structures (mult struct or triangular
   factors), free the split-CSR device matrix if one was created, remove the
   composed methods, and finally dispatch to the host SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *devmat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* stash the device-matrix pointer before spptr is torn down */
    devmat          = cusp->deviceMat;
    cusp->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (devmat) {
    Mat_SeqAIJ                 *aij = (Mat_SeqAIJ*)A->data;
    PetscSplitCSRDataStructure h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host so we can free the arrays it points to */
    cerr = cudaMemcpy(&h_mat,devmat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (aij->compressedrow.use) {
      cerr = cudaFree(h_mat.diag.i);CHKERRCUDA(cerr);
    }
    cerr = cudaFree(devmat);CHKERRCUDA(cerr);
  }
  /* clear composed methods so nothing stale can be queried after destruction */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2531 
2532 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2533 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate via the host SeqAIJ path, then convert the copy in place so the
   duplicate is again a SeqAIJCUSPARSE matrix. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2543 
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  PetscBool      flgx,flgy;

  PetscFunctionBegin;
  /* Y = Y + a*X.  Fast path: when both operands are SeqAIJCUSPARSE with the
     same nonzero pattern and both device CSR matrices exist, the update is a
     single cublasXaxpy over the device value arrays; every other case falls
     back to the host MatAXPY_SeqAIJ implementation. */
  if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
  PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
  PetscValidHeaderSpecific(X,MAT_CLASSID,3);
  ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr);
  if (!flgx || !flgy) {
    /* at least one operand is not CUSPARSE: use the CPU kernel */
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE");
  if (str == DIFFERENT_NONZERO_PATTERN) {
    /* the caller may not know the patterns coincide; compare row pointers and
       column indices so we can still take the fast device path below */
    if (x->nz == y->nz) {
      PetscBool e;
      ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
      if (e) {
        ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
        if (e) {
          str = SAME_NONZERO_PATTERN;
        }
      }
    }
  }
  if (str != SAME_NONZERO_PATTERN) {
    /* differing patterns cannot be handled by a flat axpy on the value arrays */
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  } else {
    Mat_SeqAIJCUSPARSE           *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr;
    Mat_SeqAIJCUSPARSE           *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr;
    if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) {
      /* device copies not built yet: sync values to host, do the axpy there, then push both back up */
      if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) {
	ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
      }
      if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) {
	ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr);
      }
      ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
    } else {
      /* both device CSR matrices exist: y->values += a * x->values via cuBLAS */
      cublasHandle_t cublasv2handle;
      cublasStatus_t cberr;
      cudaError_t    err;
      PetscScalar    alpha = a;
      PetscBLASInt   one = 1, bnz = 1;
      CsrMatrix      *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat;
      CsrMatrix      *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat;
      PetscScalar    *aa_y, *aa_x;
      aa_y = matrix_y->values->data().get();
      aa_x = matrix_x->values->data().get();
      ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
      ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
      ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr);
      /* values were modified only on the device, so a BOTH mask degrades to GPU */
      if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU;
      else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state");
      ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
2618 
/* Zero all stored values, on the device (forward and transpose copies, when
   they exist) as well as in the host arrays, and set the offload mask to
   reflect where the zeros are valid. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij  = (Mat_SeqAIJ*)A->data;
  PetscBool      ongpu = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* host values are zeroed directly; MatZeroEntries_SeqAIJ is deliberately bypassed */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
2650 
/* Switch the operation table between the host (flg=PETSC_TRUE) and device
   (flg=PETSC_FALSE) implementations, and (un)install the composed methods
   that only make sense on the device. No-op for factored matrices. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* before handing control to CPU kernels, make sure the host values are current */
  if (flg) {ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);}
  A->ops->axpy                      = flg ? MatAXPY_SeqAIJ             : MatAXPY_SeqAIJCUSPARSE;
  A->ops->zeroentries               = flg ? MatZeroEntries_SeqAIJ      : MatZeroEntries_SeqAIJCUSPARSE;
  A->ops->mult                      = flg ? MatMult_SeqAIJ             : MatMult_SeqAIJCUSPARSE;
  A->ops->multadd                   = flg ? MatMultAdd_SeqAIJ          : MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose             = flg ? MatMultTranspose_SeqAIJ    : MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd          = flg ? MatMultTransposeAdd_SeqAIJ : MatMultTransposeAdd_SeqAIJCUSPARSE;
  A->ops->multhermitiantranspose    = flg ? NULL : MatMultHermitianTranspose_SeqAIJCUSPARSE;
  A->ops->multhermitiantransposeadd = flg ? NULL : MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
  if (flg) {
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
  } else {
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg; /* inode optimization only applies to the CPU kernels */
  PetscFunctionReturn(0);
}
2693 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE (also the tail of MatCreate_SeqAIJCUSPARSE).
   reuse selects whether newmat is created (MAT_INITIAL_MATRIX), overwritten
   (MAT_REUSE_MATRIX), or A itself is converted in place (MAT_INPLACE_MATRIX). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created from this matrix should be CUDA vectors */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    /* allocate the GPU-side structure matching the (non)factored state */
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    /* the device arrays themselves are created lazily on first use */
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the device operation table and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2740 
/* Constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix,
   convert it in place, then process type-specific command line options. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* ensure a CUDA device is usable */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscObjectOptionsBegin() declares the PetscOptionsObject used below */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2754 
2755 /*MC
2756    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2757 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2759    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2760    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2761 
2762    Options Database Keys:
2763 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2764 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2765 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2766 
2767   Level: beginner
2768 
2769 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2770 M*/
2771 
2772 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2773 
2774 
/* Register the cuSPARSE solver package for all four factorization types
   supported by MatGetFactor_seqaijcusparse_cusparse. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2786 
/* Free a Mat_SeqAIJCUSPARSE: both mult structures, the thrust work arrays,
   the cusparse handle, the csr2csc scratch buffer (CUDA >= 11), and the
   struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    /* C++ delete is a no-op on NULL, so unconditional deletes are safe */
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_w;
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    {
      cudaError_t cerr = cudaFree(cusp->csr2cscBuffer);CHKERRCUDA(cerr);
    }
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2810 
/* Free a CsrMatrix and NULL out the caller's pointer; safe on NULL input. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    /* the thrust device vectors own the CSR arrays; delete is NULL-safe */
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
2823 
/* Free one triangular-factor structure: the cusparse descriptor and analysis
   info, the CSR matrix, and any scratch buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

    if (tf->descr)     {stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);}
    if (tf->solveInfo) {stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);}
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    if (tf->solveBuffer) {cudaError_t cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);}
    if (tf->AA_h)        {cudaError_t cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);} /* pinned host buffer */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (tf->csr2cscBuffer) {cudaError_t cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2843 
/* Free a Mat_SeqAIJCUSPARSEMultStruct built for SpMV: the device matrix
   (CSR, or HYB before CUDA 11), the cusparse descriptor, the device-resident
   scalar constants, and (CUDA >= 11) the cached SpMV descriptors/buffers.
   *matstruct is set to NULL on return. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code (it was previously silently dropped) */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are scalars allocated in device memory */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* cuSpMV[] is indexed by cusparse operation (see its use in the SpMV kernel) */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
2887 
/* Reset the triangular-factor container without freeing the container or its
   cusparse handle: destroy all four factor structures, the permutation index
   arrays, and the work vector, leaving the members NULLed. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *tf = *trifactors;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* delete-and-null so a later Reset/Destroy is safe */
    delete tf->rpermIndices;  tf->rpermIndices = NULL;
    delete tf->cpermIndices;  tf->cpermIndices = NULL;
    delete tf->workVector;    tf->workVector   = NULL;
  }
  PetscFunctionReturn(0);
}
2907 
/* Destroy the triangular-factor container: reset all members, destroy the
   cusparse handle the container owns, then free the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* split the former assignment-inside-if into two statements: clearer, and
       avoids the -Wparentheses "assignment used as truth value" hazard */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2924 
/* Strict-weak lexicographic ordering of (row,column) index pairs. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;        /* order by row first */
    return t1.get<1>() < t2.get<1>();    /* break ties by column */
  }
};
2935 
/* True when two (row,column) index pairs are component-wise equal. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
2945 
/* Binary op yielding 0 when the two integers are equal and 1 otherwise. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
2954 
/* Binary op yielding the logical OR of two integers (result is 0 or 1). */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
2963 
2964 #include <thrust/iterator/discard_iterator.h>
/* Set the matrix values from an array v of length n ordered as the COO
   (i,j) pairs given to MatSetPreallocationCOO_SeqAIJCUSPARSE (v == NULL
   means all zeros). cooPerm maps COO positions into the sorted order, and
   cooPerm_a (when present) carries the keys used to combine duplicate
   entries — see the preallocation routine below. All work runs on the GPU
   via thrust. */
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation recorded: nothing to scatter, just flush assembly state */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  n = cusp->cooPerm->size();
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (v) {
    /* upload the user values host -> device */
    cusp->cooPerm_v->assign(v,v+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
  if (imode == ADD_VALUES) {
    if (cusp->cooPerm_a) {
      /* duplicated (i,j) entries: segment-sum the permuted values into cooPerm_w,
         then add that per-nonzero result onto the CSR values */
      if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: add each permuted value directly into the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence)
                              if we are inserting two different values into the same location */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    } else {
      /* unique entries: plain permuted assignment into the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3029 
3030 #include <thrust/binary_search.h>
/*
   MatSetPreallocationCOO_SeqAIJCUSPARSE - build the CSR nonzero structure of A on the GPU
   from n entries given as (coo_i[k], coo_j[k]) coordinate pairs.

   The (i,j) pairs are copied to the device, sorted lexicographically, and compressed
   (duplicate pairs collapsed).  Two permutation arrays are cached on the Mat_SeqAIJCUSPARSE
   struct for later use when the numerical values arrive (see the MatSetValuesCOO routine
   above, which reads them):
     - cusp->cooPerm:   cooPerm[k] is the original (user-order) position of the k-th entry
                        in sorted order;
     - cusp->cooPerm_a: only allocated when duplicate (i,j) pairs exist; maps each sorted
                        COO entry to the index of the compressed (unique) entry it lands in.
   The host CSR arrays a->i and a->j are then filled from the device result; a->a is
   allocated but deliberately left uninitialized (values come later).
*/
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;      /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* If the number of COO entries changed since a previous call, the cached permutation
     arrays no longer apply: discard them so they are rebuilt at the new size below. */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_w;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_v = NULL;
    cusp->cooPerm_w = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);            /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n);    /* per-row cumulative counts (CSR offsets, sans leading 0) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    /* Zip (i,j) into a single key sequence so both arrays are sorted together. */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    /* Sort the (i,j) keys (IJCompare is a file-local functor; presumably lexicographic —
       row-major CSR order), dragging along 0..n-1 so cooPerm records where each sorted
       entry came from in the user's ordering. */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* Snapshot the sorted i (in cooPerm_a) and sorted j (in w) BEFORE unique() below
       compresses d_i/d_j in place. */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    /* Collapse duplicate (i,j) pairs; nekey marks the new logical end of the key range. */
    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* Build, for each sorted entry, the index of the unique (compressed) entry it maps
         to: mark positions where the row (cooPerm_a) or column (w) key changes relative
         to its predecessor, combine the two marks, then prefix-sum the 0/1 transition
         flags into running unique-entry indices.  (IJDiff/IJSum are file-local functors;
         presumably "differs from previous -> 1" and logical-or — confirm against their
         definitions in cusparsematimpl.h.) */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;  /* first entry maps to compressed slot 0 by definition */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* For each row r in [0, rmap->n), count unique entries with row index <= r: since the
       unique keys are sorted by row, upper_bound yields the CSR row offsets (ii[r] = end
       offset of row r). */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());

    /* Replace any previous host CSR arrays with freshly sized ones. */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;  /* a->a, a->i, a->j are allocated separately below */
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];  /* total unique nonzeros = last row offset */
    /* a->a is allocated but NOT initialized here: numerical values are provided later. */
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    /* d_j now holds the compressed column indices (unique() compacted it in place). */
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);        /* count nonempty rows for the compressed-row test */
      a->ilen[i] = a->imax[i] = nnzr;   /* each row is exactly full: no insertion slack */
    }
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
  } else {
    /* No entries: fall back to an empty standard preallocation. */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy garbage to the GPU:
     the offload mask is forced to CPU so MatSeqAIJCUSPARSECopyToGPU rebuilds the
     device CSR struct from the (still uninitialized) host arrays; the device value
     array is then explicitly zeroed below. */
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  {
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    /* Zero the device values so the GPU copy starts from a defined state. */
    thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    /* The nonzero pattern changed, so any cached transpose structure is stale. */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }

  /* NOTE(review): offloadmask is reset to CPU even though the zeroed values live on the
     GPU — presumably intentional so the next value-setting path behaves as if starting
     from host data; confirm against the MatSetValuesCOO routine above. */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->assembled = PETSC_FALSE;       /* a fresh assembly is required after values are set */
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3135