xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision a587d1398bd356a7db20a09283619c724e82d622)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
17 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
45       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
46       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
69 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
70 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
71 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
75 
76 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
84 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
85 
/* Record the given CUDA stream in the matrix' cuSPARSE context and install it on the
   cuSPARSE library handle so subsequent cuSPARSE calls are issued on that stream.
   Errors with PETSC_ERR_COR if the matrix has no cuSPARSE context (spptr). */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
97 
/* Adopt an externally created cuSPARSE handle for this matrix. If a different handle
   is already stored it is destroyed first. The (possibly new) handle is then switched
   to device pointer mode. Errors with PETSC_ERR_COR when no cuSPARSE context exists. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the handle we currently own before adopting the new one */
    if (cusp->handle) {
      stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
    }
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
114 
/* Forget (without destroying) the cuSPARSE handle held by a MATSEQAIJCUSPARSE matrix.
   Silently does nothing for other matrix types or when no cuSPARSE context exists —
   useful when the handle's lifetime is managed by the caller. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  if (!isseqaijcusparse || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0; /* drop the reference only; ownership lies elsewhere */
  PetscFunctionReturn(0);
}
127 
/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the solver package. */
128 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
129 {
130   PetscFunctionBegin;
131   *type = MATSOLVERCUSPARSE;
132   PetscFunctionReturn(0);
133 }
134 
135 /*MC
136   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
137   on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
138   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
139   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
140   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
141   algorithms are not recommended. This class does NOT support direct solver operations.
142 
143   Level: beginner
144 
145 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
146 M*/
147 
/*
   Factory routine registered for MATSOLVERCUSPARSE: creates the (empty, square) factor
   matrix B of type MATSEQAIJCUSPARSE and wires up the symbolic-factorization entry
   points appropriate for the requested factor type. Unsupported factor types error
   with PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* factor data is filled in by the numeric stage; skip the usual AIJ allocation */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
173 
/* Type-specific implementation of MatCUSPARSESetFormat() for sequential matrices.
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single storage format this
   type keeps; any other operation is rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: seq matrices have one format for all ops */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
191 
192 /*@
193    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
194    operation. Only the MatMult operation can use different GPU storage formats
195    for MPIAIJCUSPARSE matrices.
196    Not Collective
197 
198    Input Parameters:
199 +  A - Matrix of type SEQAIJCUSPARSE
200 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
201 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
202 
203    Output Parameter:
204 
205    Level: intermediate
206 
207 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
208 @*/
/* Public entry point: validates the matrix header and dispatches to the
   type-specific "MatCUSPARSESetFormat_C" method when one is composed;
   silently a no-op for types that do not provide it. */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
218 
219 /*@
220    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
221 
222    Collective on mat
223 
224    Input Parameters:
225 +  A - Matrix of type SEQAIJCUSPARSE
226 -  transgen - the boolean flag
227 
228    Level: intermediate
229 
230 .seealso: MATSEQAIJCUSPARSE
231 @*/
/* Set the transgen flag on a MATSEQAIJCUSPARSE matrix. Other matrix types are ignored.
   Factored matrices are rejected. When the flag is cleared, any cached explicit
   transpose is destroyed so a later transgen=PETSC_TRUE cannot observe stale data. */
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      match;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&match);CHKERRQ(ierr);
  if (match) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) {
      /* drop the cached transpose now to avoid logic errors should transgen be re-enabled */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
251 
/* Process the -mat_cusparse_* runtime options for a MATSEQAIJCUSPARSE matrix.
   Only non-factored matrices (MAT_FACTOR_NONE) consult the options. The CUDA-11+
   branch additionally selects cuSPARSE SpMV/SpMM/csr2csc algorithms; the sanity
   checks below guard against cuSPARSE renumbering its enums, since PetscOptionsEnum()
   derives values from array position, not from the cuSPARSE headers. */
252 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
253 {
254   PetscErrorCode           ierr;
255   MatCUSPARSEStorageFormat format;
256   PetscBool                flg;
257   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
258 
259   PetscFunctionBegin;
260   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
261   if (A->factortype == MAT_FACTOR_NONE) {
262     PetscBool transgen = cusparsestruct->transgen;
263 
264     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
265     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
266 
267     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
268                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
269     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
270 
271     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
272                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
273     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
274    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
275     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
276     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
277                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
278     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
279     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
280 
281     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
282     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
283                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
284     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
285 
286     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; /* default; slot 0 of MatCUSPARSECsr2CscAlgorithms[] is a placeholder since cuSPARSE has no enum value 0 */
287     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
288                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
289     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
290    #endif
291   }
292   ierr = PetscOptionsTail();CHKERRQ(ierr);
293   PetscFunctionReturn(0);
294 }
295 
/* Symbolic ILU: clear any stale GPU factor data, run the CPU AIJ symbolic phase,
   then point the numeric stage at the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
307 
/* Symbolic LU: clear any stale GPU factor data, run the CPU AIJ symbolic phase,
   then point the numeric stage at the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
319 
/* Symbolic ICC: clear any stale GPU factor data, run the CPU AIJ symbolic phase,
   then point the numeric stage at the CUSPARSE Cholesky implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
331 
/* Symbolic Cholesky: clear any stale GPU factor data, run the CPU AIJ symbolic phase,
   then point the numeric stage at the CUSPARSE Cholesky implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
343 
/* Extract the unit-diagonal lower triangular factor L from the host-side factored AIJ
   matrix and stage it on the GPU for cuSPARSE triangular solves. On the first call the
   full CSR structure (row offsets, column indices, values) is built and the cuSPARSE
   solve analysis is performed; on later calls (structure unchanged) only the values are
   re-uploaded. Host staging buffers use cudaMallocHost (pinned memory) for the
   host-to-device copies. Only runs when host data is newer than device data
   (offloadmask UNALLOCATED or CPU). */
344 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
345 {
346   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
347   PetscInt                          n = A->rmap->n;
348   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
349   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
350   cusparseStatus_t                  stat;
351   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
352   const MatScalar                   *aa = a->a,*v;
353   PetscInt                          *AiLo, *AjLo;
354   PetscScalar                       *AALo;
355   PetscInt                          i,nz, nzLower, offset, rowOffset;
356   PetscErrorCode                    ierr;
357   cudaError_t                       cerr;
358 
359   PetscFunctionBegin;
360   if (!n) PetscFunctionReturn(0);
361   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
362     try {
363       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): ai[1]-ai[0] entries of row 0 are assumed strictly-lower with ai[0]==0; row 0 of L is just the unit diagonal — confirm against the SeqAIJ factor layout */
364       nzLower=n+ai[n]-ai[1];
365       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
366       if (!loTriFactor) {
367 
368         /* Allocate Space for the lower triangular matrix */
369         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
370         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
371 
372         /* Fill the lower triangular matrix */
373         AiLo[0]  = (PetscInt) 0;
374         AiLo[n]  = nzLower;
375         AjLo[0]  = (PetscInt) 0;
376         AALo[0]  = (MatScalar) 1.0;
377         v        = aa;
378         vi       = aj;
379         offset   = 1;
380         rowOffset= 1;
381         for (i=1; i<n; i++) {
382           nz = ai[i+1] - ai[i];
383           /* additional 1 for the term on the diagonal */
384           AiLo[i]    = rowOffset;
385           rowOffset += nz+1;
386 
          /* copy the strictly-lower entries of row i, then append the unit diagonal entry */
387           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
388           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
389 
390           offset      += nz;
391           AjLo[offset] = (PetscInt) i;
392           AALo[offset] = (MatScalar) 1.0;
393           offset      += 1;
394 
395           v  += nz;
396           vi += nz;
397         }
398 
399         /* allocate space for the triangular factor information */
400         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
401         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
402         /* Create the matrix description */
403         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
404         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
405        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
406         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
407        #else
408         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
409        #endif
410         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
411         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
412 
413         /* set the operation */
414         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
415 
416         /* set the matrix */
417         loTriFactor->csrMat = new CsrMatrix;
418         loTriFactor->csrMat->num_rows = n;
419         loTriFactor->csrMat->num_cols = n;
420         loTriFactor->csrMat->num_entries = nzLower;
421 
422         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
423         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
424 
425         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
426         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
427 
428         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
429         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
430 
431         /* Create the solve analysis information */
432         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
433         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
434       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE requires the caller to size and provide the analysis/solve work buffer */
435         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
436                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
437                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
438                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
439                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
440         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
441       #endif
442 
443         /* perform the solve analysis */
444         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
445                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
446                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
447                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
448                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
449                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
450                                #endif
451                                 );CHKERRCUSPARSE(stat);
452         cerr = WaitForCUDA();CHKERRCUDA(cerr);
453         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
454 
455         /* assign the pointer */
456         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
457 
458         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
459         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
460         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
461       } else { /* update values only */
462         /* Fill the lower triangular matrix */
463         AALo[0]  = 1.0;
464         v        = aa;
465         vi       = aj;
466         offset   = 1;
467         for (i=1; i<n; i++) {
468           nz = ai[i+1] - ai[i];
469           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
470           offset      += nz;
471           AALo[offset] = 1.0;
472           offset      += 1;
473           v  += nz;
474         }
        /* structure already lives on the GPU; refresh only the numerical values */
475         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
476         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
477       }
478       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
479     } catch(char *ex) {
480       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
481     }
482   }
483   PetscFunctionReturn(0);
484 }
485 
/* Extract the upper triangular factor U from the host-side factored AIJ matrix and
   stage it on the GPU for cuSPARSE triangular solves. Rows are assembled back-to-front
   using a->diag (which in the factored SeqAIJ layout delimits the upper-triangular part
   of each row). On the first call the full CSR structure is built and the cuSPARSE
   solve analysis is performed; subsequent calls re-upload values only. Host staging
   buffers are pinned (cudaMallocHost). Only runs when host data is newer than device
   data (offloadmask UNALLOCATED or CPU). */
486 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
487 {
488   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
489   PetscInt                          n = A->rmap->n;
490   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
491   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
492   cusparseStatus_t                  stat;
493   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
494   const MatScalar                   *aa = a->a,*v;
495   PetscInt                          *AiUp, *AjUp;
496   PetscScalar                       *AAUp;
497   PetscInt                          i,nz, nzUpper, offset;
498   PetscErrorCode                    ierr;
499   cudaError_t                       cerr;
500 
501   PetscFunctionBegin;
502   if (!n) PetscFunctionReturn(0);
503   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
504     try {
505       /* next, figure out the number of nonzeros in the upper triangular matrix. */
506       nzUpper = adiag[0]-adiag[n];
507       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
508       if (!upTriFactor) {
509         /* Allocate Space for the upper triangular matrix */
510         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
511         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
512 
513         /* Fill the upper triangular matrix */
514         AiUp[0]=(PetscInt) 0;
515         AiUp[n]=nzUpper;
516         offset = nzUpper;
517         for (i=n-1; i>=0; i--) {
518           v  = aa + adiag[i+1] + 1;
519           vi = aj + adiag[i+1] + 1;
520 
521           /* number of elements NOT on the diagonal */
522           nz = adiag[i] - adiag[i+1]-1;
523 
524           /* decrement the offset */
525           offset -= (nz+1);
526 
527           /* first, set the diagonal elements */
528           AjUp[offset] = (PetscInt) i;
          /* NOTE(review): 1./v[nz] — this presumes the CPU factor stores the inverted diagonal, so the reciprocal recovers the true diagonal value; confirm against MatLUFactorNumeric_SeqAIJ */
529           AAUp[offset] = (MatScalar)1./v[nz];
530           AiUp[i]      = AiUp[i+1] - (nz+1);
531 
532           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
533           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
534         }
535 
536         /* allocate space for the triangular factor information */
537         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
538         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
539 
540         /* Create the matrix description */
541         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
542         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
543        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
544         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
545        #else
546         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
547        #endif
548         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
549         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
550 
551         /* set the operation */
552         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
553 
554         /* set the matrix */
555         upTriFactor->csrMat = new CsrMatrix;
556         upTriFactor->csrMat->num_rows = n;
557         upTriFactor->csrMat->num_cols = n;
558         upTriFactor->csrMat->num_entries = nzUpper;
559 
560         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
561         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
562 
563         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
564         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
565 
566         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
567         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
568 
569         /* Create the solve analysis information */
570         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
571         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
572       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE requires the caller to size and provide the analysis/solve work buffer */
573         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
574                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
575                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
576                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
577                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
578         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
579       #endif
580 
581         /* perform the solve analysis */
582         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
583                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
584                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
585                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
586                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
587                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
588                                #endif
589                                 );CHKERRCUSPARSE(stat);
590         cerr = WaitForCUDA();CHKERRCUDA(cerr);
591         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
592 
593         /* assign the pointer */
594         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
595 
596         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
597         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
598         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
599       } else {
600         /* Fill the upper triangular matrix */
601         offset = nzUpper;
602         for (i=n-1; i>=0; i--) {
603           v  = aa + adiag[i+1] + 1;
604 
605           /* number of elements NOT on the diagonal */
606           nz = adiag[i] - adiag[i+1]-1;
607 
608           /* decrement the offset */
609           offset -= (nz+1);
610 
611           /* first, set the diagonal elements */
612           AAUp[offset] = 1./v[nz];
613           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
614         }
        /* structure already lives on the GPU; refresh only the numerical values */
615         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
616         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
617       }
618       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
619     } catch(char *ex) {
620       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
621     }
622   }
623   PetscFunctionReturn(0);
624 }
625 
/*
  Builds the ILU lower/upper triangular factors of A on the GPU and caches,
  once, the row and column permutation index arrays needed by the permuted
  MatSolve paths when the orderings are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow    = a->row;
  IS                           iscol    = a->icol;
  PetscBool                    rowIdentity,colIdentity;
  PetscInt                     n        = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used by the two-stage triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation (applied before the lower solve) on the GPU */
  ierr = ISIdentity(isrow,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(r,r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* cache the column permutation (applied after the upper solve) on the GPU */
  ierr = ISIdentity(iscol,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(c,c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
669 
/*
  Builds the ICC (incomplete Cholesky) triangular factors on the GPU from the
  host factor in A.  The upper factor U is copied as-is (unit diagonal stored
  as 1/d); the "lower" factor reuses the SAME upper CSR structure and is
  solved with CUSPARSE_OPERATION_TRANSPOSE, with values scaled so that
  L = U^T D.  On first call the CSR index structure and the cusparse solve
  analysis are created; on later calls (same nonzero pattern) only the values
  are recomputed and re-uploaded.

  NOTE(review): A->data is cast to both Mat_SeqAIJ and Mat_SeqSBAIJ below;
  this relies on the ICC factor storing its upper triangle in SBAIJ-style
  i/j/a arrays with the diagonal entry last in each row (v[nz] below) --
  confirm against the SeqAIJ ICC factorization routines.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;  /* pinned host staging for row offsets / column indices */
  PetscScalar                       *AAUp;         /* pinned host staging for the upper-factor values */
  PetscScalar                       *AALo;         /* pinned host staging for the (transposed) lower-factor values */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date copy lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first factorization: build structure and values */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (stored as its reciprocal) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* strictly-upper entries: negate for U, additionally scale by the
               diagonal for the transposed lower solve */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description: the "lower" factor is stored as the
           SAME upper CSR pattern and is applied via a transposed solve */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix (structure shared with the upper factor) */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structure already on the GPU: recompute and re-upload values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
885 
/*
  Builds the ICC triangular factors of A on the GPU and caches, once, the
  row/column permutation index arrays used by the permuted MatSolve paths
  when the ordering is not the identity.

  Fix: guard the permutation upload with !rpermIndices (as the ILU analogue
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU already does) so that repeated
  numeric factorizations do not reallocate -- and leak -- the previously
  created device arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used by the two-stage triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nonzeros of L + U, counting the shared diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: cache the permutation and its inverse on the
     GPU the first time only (see the ILU code path) */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
923 
/*
  Numeric LU factorization for SEQAIJCUSPARSE: factor on the host, then pick
  the solve kernels (natural-ordering fast path when both permutations are
  the identity) and push the triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b     = (Mat_SeqAIJ*)B->data;
  IS             isrow  = b->row;
  IS             iscol  = b->col;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no cusparse mat-solve variants are provided in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
953 
/*
  Numeric Cholesky/ICC factorization for SEQAIJCUSPARSE: factor on the host,
  then pick the solve kernels (natural-ordering fast path when the ordering
  is the identity) and push the triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b  = (Mat_SeqAIJ*)B->data;
  IS             ip  = b->row;
  PetscBool      permIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&permIdentity);CHKERRQ(ierr);
  if (permIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no cusparse mat-solve variants are provided in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
982 
/*
  Builds explicit transposes (CSC form) of the lower and upper triangular
  factors on the GPU, and runs the cusparse solve analysis on them, so that
  MatSolveTranspose can use non-transposed triangular solves.  The transposed
  factors flip the fill mode of the originals and keep index base/diag type.

  Fix: the second MAT_CUSPARSEGenerateTranspose call after each csr2csc was
  PetscLogEventBegin instead of PetscLogEventEnd, so the event was begun
  twice and never ended, corrupting the PETSc -log_view event nesting.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor (fill mode flips) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor (fill mode flips) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1184 
/* Builds an explicit transpose of A on the GPU (stored in cusparsestruct->matTranspose) so
   that multiply-transpose operations can use a plain SpMV with the transposed data.
   Returns immediately when transpose generation was not requested (transgen unset) or when
   the transpose has already been built. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  /* the transpose keeps the index base of the original matrix descriptor */
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalar constants used by mult routines
     (pointer mode below is set to DEVICE accordingly) */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose has A's dimensions swapped and the same number of nonzeros */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* copy the host row offsets (a->i) to the GPU; the csr2csc conversion reads them */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA 11 requires a caller-allocated work buffer: query its size, then allocate */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    /* NOTE: argument order of the output index arrays differs between the CUDA<11 and
       CUDA>=11 APIs, hence the two #if branches below */
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
                           );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic-API descriptor for the transpose, needed by the CUDA-11 SpMV/SpMM paths */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    /* HYB/ELL path (pre-CUDA-11 only): round-trip HYB -> CSR -> CSC -> HYB */
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1343 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the (reordered) triangular factors.  The transposed factors are
   built lazily on first use.  Since A = L*U, A^T = U^T*L^T, so the upper factor is solved
   first, then the lower; the row permutation is applied to b up front and the column
   permutation to the result at the end. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b at rpermIndices into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: reads xarray, writes the work vector tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L: reads tempGPU, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1430 
/* Solve A^T x = b on the GPU for factors computed with natural ordering, so no
   row/column permutations are needed.  The transposed triangular factors are
   generated lazily on first use.  Since A = L*U, A^T = U^T*L^T: the upper factor
   transpose is applied first, then the lower.  The intermediate result lives in
   the factor work vector. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build the transposed factors on the fly if they do not exist yet */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* Map both vectors onto the GPU */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Solve with the upper factor transpose: reads b, writes work */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        b, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upT->solvePolicy, upT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Solve with the lower factor transpose: reads work, writes x */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), x
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loT->solvePolicy, loT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Hand the vectors back and account for the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1498 
/* Solve A x = b on the GPU using the cusparse triangular factors of a matrix factored
   with row/column reordering: apply the row permutation to b, solve L then U, and
   finally apply the column permutation to obtain x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b at rpermIndices into the work vector */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: reads tempGPU, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U: reads xarray, writes tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: gather tempGPU at cpermIndices into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1573 
/* Solve A x = b on the GPU for factors computed with natural ordering: no
   permutations are required, so we simply solve with the lower factor and then
   the upper factor.  The intermediate result lives in the factor work vector. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Map both vectors onto the GPU */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Solve with the lower factor: reads b, writes work */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lo->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lo->solvePolicy, lo->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Solve with the upper factor: reads work, writes x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        up->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,up->solvePolicy, up->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Hand the vectors back and account for the work */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1633 
/* Copy the matrix values from the GPU back to the host CSR array when the GPU copy
   is the only up-to-date one.  Only the numerical values (a->a) are transferred;
   the sparsity pattern is untouched.  Afterwards both copies are valid. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* host and device now hold identical values */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1654 
/* Hand out the host pointer to the nonzero values of A, syncing from the GPU first
   if necessary.  The caller may modify the values, so the CPU copy is marked as
   the authoritative one afterwards. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* make sure the host values are current before exposing them */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array         = aij->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1666 
1667 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1668 {
1669   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1670   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1671   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1672   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1673   PetscErrorCode               ierr;
1674   cusparseStatus_t             stat;
1675   cudaError_t                  err;
1676 
1677   PetscFunctionBegin;
1678   if (A->boundtocpu) PetscFunctionReturn(0);
1679   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1680     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1681       /* Copy values only */
1682       CsrMatrix *matrix,*matrixT;
1683       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1684 
1685       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1686       matrix->values->assign(a->a, a->a+a->nz);
1687       err  = WaitForCUDA();CHKERRCUDA(err);
1688       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1689       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1690 
1691       /* Update matT when it was built before */
1692       if (cusparsestruct->matTranspose) {
1693         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1694         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1695         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1696         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1697                             A->cmap->n, matrix->num_entries,
1698                             matrix->values->data().get(),
1699                             cusparsestruct->rowoffsets_gpu->data().get(),
1700                             matrix->column_indices->data().get(),
1701                             matrixT->values->data().get(),
1702                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1703                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1704                             CUSPARSE_ACTION_NUMERIC,indexBase,
1705                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1706                           #else
1707                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1708                             CUSPARSE_ACTION_NUMERIC, indexBase
1709                           #endif
1710                            );CHKERRCUSPARSE(stat);
1711         err  = WaitForCUDA();CHKERRCUDA(err);
1712         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1713       }
1714     } else {
1715       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1716       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1717       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1718       delete cusparsestruct->workVector;
1719       delete cusparsestruct->rowoffsets_gpu;
1720       try {
1721         if (a->compressedrow.use) {
1722           m    = a->compressedrow.nrows;
1723           ii   = a->compressedrow.i;
1724           ridx = a->compressedrow.rindex;
1725         } else {
1726           m    = A->rmap->n;
1727           ii   = a->i;
1728           ridx = NULL;
1729         }
1730         cusparsestruct->nrows = m;
1731 
1732         /* create cusparse matrix */
1733         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1734         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1735         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1736         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1737 
1738         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1739         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1740         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1741         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1742         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1743         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1744         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1745 
1746         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1747         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1748           /* set the matrix */
1749           CsrMatrix *mat= new CsrMatrix;
1750           mat->num_rows = m;
1751           mat->num_cols = A->cmap->n;
1752           mat->num_entries = a->nz;
1753           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1754           mat->row_offsets->assign(ii, ii + m+1);
1755 
1756           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1757           mat->column_indices->assign(a->j, a->j+a->nz);
1758 
1759           mat->values = new THRUSTARRAY(a->nz);
1760           mat->values->assign(a->a, a->a+a->nz);
1761 
1762           /* assign the pointer */
1763           matstruct->mat = mat;
1764          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1765           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1766             stat = cusparseCreateCsr(&matstruct->matDescr,
1767                                     mat->num_rows, mat->num_cols, mat->num_entries,
1768                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1769                                     mat->values->data().get(),
1770                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1771                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1772           }
1773          #endif
1774         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1775          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1776           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1777          #else
1778           CsrMatrix *mat= new CsrMatrix;
1779           mat->num_rows = m;
1780           mat->num_cols = A->cmap->n;
1781           mat->num_entries = a->nz;
1782           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1783           mat->row_offsets->assign(ii, ii + m+1);
1784 
1785           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1786           mat->column_indices->assign(a->j, a->j+a->nz);
1787 
1788           mat->values = new THRUSTARRAY(a->nz);
1789           mat->values->assign(a->a, a->a+a->nz);
1790 
1791           cusparseHybMat_t hybMat;
1792           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1793           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1794             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1795           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1796               matstruct->descr, mat->values->data().get(),
1797               mat->row_offsets->data().get(),
1798               mat->column_indices->data().get(),
1799               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1800           /* assign the pointer */
1801           matstruct->mat = hybMat;
1802 
1803           if (mat) {
1804             if (mat->values) delete (THRUSTARRAY*)mat->values;
1805             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1806             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1807             delete (CsrMatrix*)mat;
1808           }
1809          #endif
1810         }
1811 
1812         /* assign the compressed row indices */
1813         if (a->compressedrow.use) {
1814           cusparsestruct->workVector = new THRUSTARRAY(m);
1815           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1816           matstruct->cprowIndices->assign(ridx,ridx+m);
1817           tmp = m;
1818         } else {
1819           cusparsestruct->workVector = NULL;
1820           matstruct->cprowIndices    = NULL;
1821           tmp = 0;
1822         }
1823         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1824 
1825         /* assign the pointer */
1826         cusparsestruct->mat = matstruct;
1827       } catch(char *ex) {
1828         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1829       }
1830       err  = WaitForCUDA();CHKERRCUDA(err);
1831       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1832       cusparsestruct->nonzerostate = A->nonzerostate;
1833     }
1834     A->offloadmask = PETSC_OFFLOAD_BOTH;
1835   }
1836   PetscFunctionReturn(0);
1837 }
1838 
/* Thrust functor: accumulate the first tuple element into the second, i.e. y += x.
   Used with zip/permutation iterators to scatter-add a compressed work vector into
   the full result vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1848 
/* Thrust functor: copy the first tuple element into the second, i.e. y = x. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1858 
/* Thrust functor: copy the second tuple element into the first, i.e. x = y
   (the mirror image of VecCUDAEquals; used to gather entries into a work vector). */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1868 
/* Scratch data attached to C->product for products of a SeqAIJCUSPARSE matrix with a
   dense matrix (see MatProductSymbolic/Numeric_SeqAIJCUSPARSE_SeqDENSECUDA below). */
struct MatMatCusparse {
  PetscBool            cisdense;  /* caller's C was MATSEQDENSE (CPU); convert the result back after the GPU product */
  PetscScalar          *Bt;       /* device buffer holding B^T for ABt/RARt (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                  X;         /* intermediate dense product (A*B for PtAP, A*B^T for RARt) */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;     /* cached cusparse dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;     /* cached cusparse dense descriptor for C (or for X with PtAP/RARt) */
  size_t               spmmBufferSize; /* size of spmmBuffer as reported by cusparseSpMM_bufferSize() */
  void                 *spmmBuffer;    /* device workspace for cusparseSpMM() */
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#endif
};
1882 
/* Destructor for the MatMatCusparse scratch data stored in C->product->data:
   releases the B^T buffer, the cached cusparse descriptors and SpMM workspace
   (CUDA >= 11 only), the intermediate matrix X, and the structure itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;
  cudaError_t    ccer;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ccer = cudaFree(mm->Bt);CHKERRCUDA(ccer); /* cudaFree(NULL) is a no-op, so no guard needed */
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
  if (mm->matBDescr)  {stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);}
  if (mm->matCDescr)  {stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);}
  if (mm->spmmBuffer) {ccer = cudaFree(mm->spmmBuffer);CHKERRCUDA(ccer);}
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1901 
1902 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1903 
/* Numeric phase for C = op(A)*op(B) (and PtAP = B^T*A*B, RARt = B*A*B^T) with A of type
   SeqAIJCUSPARSE and B dense: runs the sparse*dense product on the GPU with cuSPARSE.
   The symbolic phase has already sized C and stored a MatMatCusparse scratch structure
   in C->product->data. For PtAP/RARt the SpMM result goes into the intermediate mmdata->X
   and is then multiplied against B (or B^T) with a dense-dense product. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the stored CSR (A itself or its explicit transpose) and the cusparse operation
     realizing op(A); m x n are the dimensions of the SpMM output (C, or X for PtAP/RARt) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* an explicit transpose is available/requested: use it with a non-transposed op */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt the SpMM writes into the intermediate X, not into C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate spmmBuffer if not initialized or the leading dimensions (LDAs) changed */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                              csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                              csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                              csrmat->values->data().get(),
                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                              CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated mmdata->Bt with cublas geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* for RARt/PtAP finish with the dense-dense product: C = B*X or C = B^T*X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* hand back CPU matrices if that is what the caller supplied */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2073 
/* Symbolic phase for products of a SeqAIJCUSPARSE matrix with a dense matrix:
   sets the sizes and (GPU dense) type of C, allocates the MatMatCusparse scratch
   data, and installs the numeric-phase callback. Only MAT_CUSPARSE_CSR storage
   is supported. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* m x n are the dimensions of the final product C for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2147 
2148 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2149 
/* MatProductSetFromOptions for SeqAIJCUSPARSE: handles products with a dense B.
   Falls back to the plain SeqAIJ/SeqDense implementation when A is bound to the CPU;
   otherwise installs the CUDA symbolic-phase callback for the supported product types. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  if (product->type == MATPRODUCT_AB   || product->type == MATPRODUCT_AtB  ||
      product->type == MATPRODUCT_ABt  || product->type == MATPRODUCT_PtAP ||
      product->type == MATPRODUCT_RARt) {
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
  }
  PetscFunctionReturn(0);
}
2175 
/* yy = A*xx: forward to the shared kernel with no add, no transpose, no Hermitian */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
2184 
/* zz = A*xx + yy: forward to the shared kernel with no transpose, no Hermitian */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
2193 
/* yy = A^H*xx: forward to the shared kernel with transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
2202 
/* zz = A^H*xx + yy: forward to the shared kernel with transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
2211 
/* yy = A^T*xx: forward to the shared kernel with transpose set, Hermitian unset */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
2220 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   yy == NULL means z = op(A) x; yy may alias zz (the result is then accumulated in place).
   Handles compressed-row storage (zero rows dropped): the product is computed into/from the
   shorter work vector and scattered through matstruct->cprowIndices. All MatMult* variants
   above forward here. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* SpMV vector lengths; set in the MAT_CUSPARSE_CSR branches below */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* no nonzeros at all: the product is zero, so z is just y (or 0 when there is no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* apply A with a (conjugate) transpose operation flag */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use the explicitly stored transpose of A with a non-transposed operation */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache cuSpMV[]; guard against enum layout changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: the SpMV already wrote the full-length result with beta handling,
         so only the yy != zz case still needs the explicit add */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2404 
/* Computes zz = A^T*xx + yy on the GPU by delegating to the shared mult-add kernel
   with the transpose flag raised and the Hermitian flag lowered. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2413 
/* Finishes assembly of a SeqAIJCUSPARSE matrix. Runs the host AIJ assembly and,
   when a device-side split-CSR matrix exists (values were assembled directly on
   the GPU), marks the GPU copy as the authoritative one. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;
  PetscFunctionBegin;
  /* Only unfactored matrices carry a Mat_SeqAIJCUSPARSE in A->spptr */
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  /* Nothing more to do for flush assemblies or CPU-bound matrices */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) {
    /* a device matrix exists, so the freshly assembled values live on the GPU */
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}
2430 
2431 /* --------------------------------------------------------------------------------*/
2432 /*@
2433    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2435    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2436    assembly performance the user should preallocate the matrix storage by setting
2437    the parameter nz (or the array nnz).  By setting these parameters accurately,
2438    performance during matrix assembly can be increased by more than a factor of 50.
2439 
2440    Collective
2441 
2442    Input Parameters:
2443 +  comm - MPI communicator, set to PETSC_COMM_SELF
2444 .  m - number of rows
2445 .  n - number of columns
2446 .  nz - number of nonzeros per row (same for all rows)
2447 -  nnz - array containing the number of nonzeros in the various rows
2448          (possibly different for each row) or NULL
2449 
2450    Output Parameter:
2451 .  A - the matrix
2452 
2453    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2455    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2456 
2457    Notes:
2458    If nnz is given then nz is ignored
2459 
2460    The AIJ format (also called the Yale sparse matrix format or
2461    compressed row storage), is fully compatible with standard Fortran 77
2462    storage.  That is, the stored row and column indices can begin at
2463    either one (as in Fortran) or zero.  See the users' manual for details.
2464 
2465    Specify the preallocated storage with either nz or nnz (not both).
2466    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2467    allocation.  For large problems you MUST preallocate memory or you
2468    will get TERRIBLE performance, see the users' manual chapter on matrices.
2469 
2470    By default, this format uses inodes (identical nodes) when possible, to
2471    improve numerical efficiency of matrix-vector products and solves. We
2472    search for consecutive rows with the same nonzero structure, thereby
2473    reusing matrix information to achieve increased efficiency.
2474 
2475    Level: intermediate
2476 
2477 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2478 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* Create an empty matrix, set its type to the CUSPARSE AIJ implementation,
     then preallocate with either the uniform nz or the per-row nnz[] counts. */
  err = MatCreate(comm,A);CHKERRQ(err);
  err = MatSetSizes(*A,m,n,m,n);CHKERRQ(err);
  err = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(err);
  err = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2490 
/* Destroys a SeqAIJCUSPARSE matrix: releases the GPU-side structures (CSR data or
   triangular factors), any device split-CSR matrix created for on-GPU assembly,
   the composed function slots, and finally the host AIJ data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device matrix pointer before spptr is freed so it can be released below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    /* factored matrices store triangular-factor data in spptr instead */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device struct back so the device arrays it points to can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* compressed-row case allocated an extra device row-index array */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* remove the composed methods installed by MatConvert/MatBindToCPU */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2524 
2525 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2526 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicates a SeqAIJCUSPARSE matrix by first duplicating the host AIJ data and
   then converting the copy in place back to the CUSPARSE type. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(err);
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2536 
/* Y = a*X + Y. When both operands are SeqAIJCUSPARSE with identical (or detected-
   identical) nonzero patterns and both have valid GPU CSR data, the axpy runs on
   the device via cuBLAS; otherwise it falls back to the host AIJ implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  PetscBool      flgx,flgy;

  PetscFunctionBegin;
  if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
  PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
  PetscValidHeaderSpecific(X,MAT_CLASSID,3);
  ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr);
  /* either operand not CUSPARSE: defer entirely to the host implementation */
  if (!flgx || !flgy) {
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE");
  /* caller said patterns differ; cheaply check whether they are in fact identical
     (same nz, same row pointers, same column indices) so the fast GPU path applies */
  if (str == DIFFERENT_NONZERO_PATTERN) {
    if (x->nz == y->nz) {
      PetscBool e;
      ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
      if (e) {
        ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
        if (e) {
          str = SAME_NONZERO_PATTERN;
        }
      }
    }
  }
  if (str != SAME_NONZERO_PATTERN) {
    /* patterns genuinely differ: host implementation handles the merge */
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  } else {
    Mat_SeqAIJCUSPARSE           *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr;
    Mat_SeqAIJCUSPARSE           *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr;
    if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) {
      /* GPU CSR data not built yet: do the axpy on the host, then refresh both GPU copies */
      if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
      }
      if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr);
      }
      ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
    } else {
      /* both CSR value arrays live on the device: one cublasXaxpy over the nz values */
      cublasHandle_t cublasv2handle;
      cublasStatus_t cberr;
      cudaError_t    err;
      PetscScalar    alpha = a;
      PetscBLASInt   one = 1, bnz = 1;
      CsrMatrix      *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat;
      CsrMatrix      *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat;
      PetscScalar    *aa_y, *aa_x;
      aa_y = matrix_y->values->data().get();
      aa_x = matrix_x->values->data().get();
      ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
      ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
      ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr);
      /* only the GPU copy of Y was updated, so the host copy is now stale */
      if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU;
      else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state");
      ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
2611 
/* Zeros every stored value of the matrix. Both the device CSR copies (of A and,
   if present, of its cached transpose) and the host value array are zeroed, so
   the offload mask ends up BOTH when device data existed, CPU otherwise. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      zeroedOnDevice = PETSC_FALSE;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    /* zero the device CSR values of A itself */
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        zeroedOnDevice = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    /* keep the cached transpose consistent too */
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* zero the host value array as well (a->i[nrows] == total number of stored values) */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = zeroedOnDevice ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
2643 
/* Switches the matrix's operation table between the host AIJ kernels (flg=PETSC_TRUE)
   and the GPU CUSPARSE kernels (flg=PETSC_FALSE), and installs or removes the
   corresponding composed methods. Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* binding to CPU: make sure the host copy is current before the GPU kernels are removed */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    /* remove the GPU-only composed methods */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
  } else {
    /* binding to GPU: install the CUSPARSE kernels and composed methods */
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a host-side optimization; only enable them when bound to CPU */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
2686 
/* Converts a SeqAIJ matrix into a SeqAIJCUSPARSE matrix. Depending on reuse this
   duplicates, copies into, or converts in place; allocates the GPU-side spptr
   (plain CSR struct or triangular-factor struct) the first time, and installs
   the CUSPARSE operation table via MatBindToCPU_SeqAIJCUSPARSE. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* MAT_INPLACE_MATRIX: *newmat is A itself */
  B = *newmat;

  /* vectors created from this matrix should live on the GPU */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* ordinary matrix: allocate the CSR-based GPU struct and a cusparse handle */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrix: allocate the triangular-factor struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operation table and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2733 
/* Type constructor registered for MATSEQAIJCUSPARSE: builds a SeqAIJ matrix,
   converts it in place to the CUSPARSE type, and processes type-specific options. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscObjectOptionsBegin declares PetscOptionsObject used on the next line */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2747 
2748 /*MC
2749    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2750 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2752    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2753    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2754 
2755    Options Database Keys:
2756 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2757 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2758 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2759 
2760   Level: beginner
2761 
2762 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2763 M*/
2764 
2765 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2766 
2767 
/* Registers the cuSPARSE solver package for each factorization type it supports
   (LU, Cholesky, ILU, ICC), all served by the same factor constructor. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType factors[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(factors)/sizeof(factors[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factors[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2779 
/* Frees the whole Mat_SeqAIJCUSPARSE structure: the mult structs for A and A^T,
   the thrust work/permutation arrays, the cusparse handle, the csr2csc buffer
   (CUDA 11+), and finally the struct itself. Safe to call with *cusparsestruct NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* deleting a NULL pointer is a no-op, so these need no guards */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->cooPerm_v;
    delete (*cusparsestruct)->cooPerm_w;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) is legal, so no guard is needed on the csr2csc buffer */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2803 
/* Releases the three thrust device arrays backing a CsrMatrix (values, column
   indices, row offsets), then the struct itself, and nulls the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
2816 
/* Frees one triangular-factor struct: its matrix descriptor, analysis info,
   CSR data, solve buffer, csr2csc buffer (CUDA 11+), and the struct itself.
   Safe to call with *trifactor NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2835 
/* Frees a mult struct: the underlying matrix (HYB pre-CUDA-11, CSR otherwise),
   the matrix descriptor, the compressed-row indices, the device scalar constants,
   and (CUDA 11+) the generic SpMV descriptors/buffers. Safe with *matstruct NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* propagate the error code; it was previously silently discarded */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants used as SpMV alpha/beta arguments */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic-API descriptors and buffers for the N/T/H SpMV variants */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
2879 
/* Empties a triangular-factors container without freeing the container itself:
   destroys the four factor structs and frees the permutation/work arrays,
   nulling each pointer so a subsequent Reset/Destroy cannot double-free. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *tf = *trifactors;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete tf->rpermIndices;
    tf->rpermIndices = NULL;
    delete tf->cpermIndices;
    tf->cpermIndices = NULL;
    delete tf->workVector;
    tf->workVector = NULL;
  }
  PetscFunctionReturn(0);
}
2899 
/* Fully destroys a triangular-factors container: resets its contents, destroys
   the cusparse handle, and frees the container. Safe with *trifactors NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2916 
/* Strict-weak lexicographic ordering on (row,col) index pairs, used when sorting
   COO entries with thrust::sort_by_key. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
2927 
/* Equality predicate on (row,col) index pairs, used by thrust::unique to
   collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
2937 
/* Binary op yielding 1 when the two values differ and 0 when they are equal. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
2946 
/* Binary op computing the logical OR of two values as 0/1. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
2955 
2956 #include <thrust/iterator/discard_iterator.h>
/* Sets the matrix values from a COO value array v[] laid out in the order given to
   MatSetPreallocationCOO. Uses the permutation (cooPerm) computed at preallocation
   time to scatter the values into the device CSR array; cooPerm_a, when present,
   maps sorted COO entries to their (deduplicated) CSR slots so repeated indices
   are summed (ADD_VALUES) via reduce_by_key. v==NULL is treated as all zeros. */
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  /* no COO preallocation was done (zero entries): just run the assembly */
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  n = cusp->cooPerm->size();
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  /* stage the user values on the device (or zeros when v is NULL) */
  if (v) {
    cusp->cooPerm_v->assign(v,v+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
  if (imode == ADD_VALUES) {
    if (cusp->cooPerm_a) {
      /* duplicate COO indices exist: sum runs of equal entries, then add to CSR values */
      if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      /* unique indices: add the permuted values straight into the CSR array */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence)
                              if we are inserting two different values into the same location */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    } else {
      /* unique indices: overwrite CSR values with the permuted input */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  /* values were written on the device only */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3021 
3022 #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR nonzero structure of a
  SeqAIJCUSPARSE matrix from n COO (i,j) index pairs, doing the sorting and
  duplicate detection on the GPU with Thrust.

  Input Parameters:
+ A     - the matrix (Mat_SeqAIJCUSPARSE in A->spptr, Mat_SeqAIJ in A->data)
. n     - number of COO entries supplied
. coo_i - host array of n row indices
- coo_j - host array of n column indices

  On exit the host CSR arrays a->i/a->j are filled (a->a is allocated but left
  uninitialized on the host; the GPU value array is zero-filled), A->nonzerostate
  is bumped, and the matrix is left unassembled so a subsequent MatSetValuesCOO
  (see the use of cooPerm/cooPerm_a earlier in this file) can insert the values.

  Work arrays cached in cusp:
    cooPerm   - permutation: cooPerm[k] is the position in the caller's original
                COO ordering of the k-th entry in (i,j)-sorted order
    cooPerm_a - for each sorted COO entry, the index of the CSR nonzero slot it
                maps to; kept only when duplicate (i,j) pairs exist, otherwise
                freed (identity mapping implied)
*/
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;  /* nzr counts rows that end up nonempty */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* If the number of COO entries changed since a previous preallocation, the cached
     device work arrays are stale: free them all so they get rebuilt at the new size */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_w;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_v = NULL;
    cusp->cooPerm_w = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    /* temporary device copies of the COO indices and the per-row entry counts */
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    /* view (d_i,d_j) as a single sequence of (row,col) pairs for sort/unique */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    /* sort the pairs by (row,col); dragging the identity sequence along as values
       yields cooPerm = sorted-position -> original-position permutation */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* keep sorted copies of the row and column indices; they are consumed below
       to build the sorted-entry -> unique-nonzero map when duplicates exist */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    /* compress duplicate (i,j) pairs in place; nekey marks the new end */
    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      /* identity mapping: no cooPerm_a needed (MatSetValuesCOO checks for NULL) */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* Build cooPerm_a[k] = index of the unique nonzero that sorted entry k belongs to:
         flag positions where the row or column index changes (IJDiff/IJSum are small
         functors defined earlier in this file; presumably diff-flag and logical-or --
         the first entry is forced to 0 since adjacent_difference copies element 0),
         then an inclusive prefix sum over the 0/1 flags gives the target slot index */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* d_i is now the sorted, deduplicated row indices (up to the <0> component of nekey);
       a vectorized upper_bound against rows 0..m-1 yields, for each row r, the number of
       nonzeros in rows <= r, i.e. exactly the CSR row offsets a->i[1..m] */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());

    /* replace the host CSR arrays with freshly sized ones */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE; /* i/j/a are separate allocations below */
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n]; /* total unique nonzeros = last row offset */
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    /* d_j holds the sorted, deduplicated column indices; only the first nz are valid */
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* per-row lengths from consecutive offsets; count rows with at least one nonzero */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
    }
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
  } else {
    /* no entries: fall back to an empty standard AIJ preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy garbage to the GPU */
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  /* push the (structure-only, values still garbage) CSR to the GPU ... */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  {
    /* ... then zero the GPU values so MatSetValuesCOO with ADD semantics starts clean,
       and drop any cached transpose, which no longer matches the new structure */
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }

  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3127