xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 2cbc15d9fc4a2cb47580507275a5895d7ad1e7ad)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Option-name tables consumed by PetscOptionsEnum(); the trailing entries are the enum type
   name, the option prefix, and a terminating 0, per the PetscOptionsEnum() convention. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  /* Note: entries are listed in 0-based value order of the cusparse enums above; the SpMM table
     interleaves COO_ALG4/CSR_ALG1 accordingly (values 4 and 5 are swapped w.r.t. name order). */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
/* ---------- Forward declarations: symbolic/numeric factorization ---------- */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* ---------- Forward declarations: triangular solves and matrix-vector products ---------- */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* ---------- Forward declarations: destruction helpers ----------
   NOTE: the two MatSeqAIJCUSPARSEMultStruct_Destroy declarations are C++ overloads
   (this is a .cu file compiled as C++); they take different struct types. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* COO assembly entry points shared with the MPI variant */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
84 
85 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
86 {
87   cusparseStatus_t   stat;
88   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
89 
90   PetscFunctionBegin;
91   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
92   cusparsestruct->stream = stream;
93   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
94   PetscFunctionReturn(0);
95 }
96 
97 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
98 {
99   cusparseStatus_t   stat;
100   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
101 
102   PetscFunctionBegin;
103   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
104   if (cusparsestruct->handle != handle) {
105     if (cusparsestruct->handle) {
106       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
107     }
108     cusparsestruct->handle = handle;
109   }
110   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
111   PetscFunctionReturn(0);
112 }
113 
114 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
115 {
116   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
117   PetscBool          flg;
118   PetscErrorCode     ierr;
119 
120   PetscFunctionBegin;
121   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
122   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
123   if (cusparsestruct->handle) cusparsestruct->handle = 0;
124   PetscFunctionReturn(0);
125 }
126 
/* Reports the solver package name for factored matrices created by this file;
   composed on factor matrices as the "MatFactorGetSolverType_C" method. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
133 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
146 
/* Factory routine that creates the (square, n x n) factor matrix B for LU/ILU/ILUDT or
   Cholesky/ICC factorization of a SEQAIJCUSPARSE matrix A, wiring in the symbolic
   factorization kernels defined in this file.
   NOTE(review): factortype is assigned before MatSetType(); this ordering appears
   deliberate (presumably the type constructor inspects it) -- preserve it. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;          /* set before MatSetType() -- see note above */
  (*B)->useordering = PETSC_TRUE;    /* factorization honors the user-supplied ordering */
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* factor matrices allocate their storage during factorization, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
172 
173 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
174 {
175   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
176 
177   PetscFunctionBegin;
178   switch (op) {
179   case MAT_CUSPARSE_MULT:
180     cusparsestruct->format = format;
181     break;
182   case MAT_CUSPARSE_ALL:
183     cusparsestruct->format = format;
184     break;
185   default:
186     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
187   }
188   PetscFunctionReturn(0);
189 }
190 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for matrix types that never composed MatCUSPARSESetFormat_C */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
217 
218 /*@
219    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose
220 
221    Collective on mat
222 
223    Input Parameters:
224 +  A - Matrix of type SEQAIJCUSPARSE
225 -  transgen - the boolean flag
226 
227    Level: intermediate
228 
229 .seealso: MATSEQAIJCUSPARSE
230 @*/
231 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
232 {
233   PetscErrorCode ierr;
234   PetscBool      flg;
235 
236   PetscFunctionBegin;
237   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
238   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
239   if (flg) {
240     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
241 
242     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
243     cusp->transgen = transgen;
244     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
245       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
246     }
247   }
248   PetscFunctionReturn(0);
249 }
250 
/* Processes -mat_cusparse_* command-line options for a SEQAIJCUSPARSE matrix: explicit
   transpose generation, storage format for SpMV and/or TriSolve, and (CUDA >= 11) the
   cuSPARSE algorithm choices for SpMV, SpMM, and CSR-to-CSC conversion.
   Options are only examined for non-factored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* -mat_cusparse_mult_storage_format affects SpMV only; -mat_cusparse_storage_format (below) affects SpMV and TriSolve */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency guard as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
294 
295 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
296 {
297   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
298   PetscErrorCode               ierr;
299 
300   PetscFunctionBegin;
301   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
302   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
303   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
304   PetscFunctionReturn(0);
305 }
306 
307 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
308 {
309   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
310   PetscErrorCode               ierr;
311 
312   PetscFunctionBegin;
313   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
314   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
315   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
316   PetscFunctionReturn(0);
317 }
318 
319 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
320 {
321   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
322   PetscErrorCode               ierr;
323 
324   PetscFunctionBegin;
325   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
326   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
327   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
328   PetscFunctionReturn(0);
329 }
330 
331 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
332 {
333   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
334   PetscErrorCode               ierr;
335 
336   PetscFunctionBegin;
337   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
338   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
339   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
340   PetscFunctionReturn(0);
341 }
342 
/* Builds (first call) or refreshes (later calls) the GPU copy of the unit-diagonal lower
   triangular factor L extracted from the host ILU factorization stored in A->data.
   First call: assembles L in CSR form in pinned host memory (adding the implicit unit
   diagonal), uploads it into a CsrMatrix, and runs the cuSPARSE triangular-solve analysis.
   Later calls: only the numerical values are rebuilt and re-uploaded; the sparsity pattern
   is assumed unchanged. Only runs when the matrix data lives on the CPU side. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): a->i here appears to hold the strictly-lower part of the factored storage; hence +n for the unit diagonal -- confirm against MatILUFactorSymbolic_SeqAIJ layout */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust assign() uploads are fast */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal entry */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: upload host CSR arrays to device */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer csrsv2 API needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AA_h (pinned values buffer) is kept for fast value-only refreshes */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix: same traversal as above, values only */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
487 
/* Builds (first call) or refreshes (later calls) the GPU copy of the upper triangular
   factor U from the host ILU factorization in A->data. The host AIJ factored format
   stores U rows backwards via a->diag, so rows are assembled from the last row upward,
   placing the inverted diagonal (1/v[nz]) first in each row. First call also creates the
   cuSPARSE descriptor and runs the triangular-solve analysis; later calls refresh values
   only. Only runs when the matrix data lives on the CPU side. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases with i in the factored storage, so this difference is the U entry count */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffers so the thrust assign() uploads are fast */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements: stored inverted in the factored format */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: upload host CSR arrays to device */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer csrsv2 API needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AA_h (pinned values buffer) is kept for fast value-only refreshes */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only; same backward traversal as the build path */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
631 
/*
   Builds both ILU triangular factors on the GPU and uploads the row/column
   permutation indices (when the orderings are not the identity) needed by the
   permuted MatSolve path.  On success the factor matrix is marked as resident
   on both host and device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS    = aij->row;
  IS                           colIS    = aij->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* assemble the lower and upper triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation, used by the lower triangular solve */
  ierr = ISIdentity(rowIS,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowIS,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx,ridx+nrows);
    ierr = ISRestoreIndices(rowIS,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation, used by the upper triangular solve */
  ierr = ISIdentity(colIS,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colIS,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx,cidx+nrows);
    ierr = ISRestoreIndices(colIS,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
675 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds the GPU-resident triangular
   factors used by the ICC (incomplete Cholesky) triangular solves.

   Only the upper triangular factor U is stored explicitly (CSR arrays AiUp/
   AjUp/AAUp built from the b->i/b->j/b->a arrays); the "lower" factor keeps
   the same sparsity pattern and is applied as U^T via
   CUSPARSE_OPERATION_TRANSPOSE, with its own scaled value array AALo.

   On the first call both factor structures, matrix descriptors, and cuSPARSE
   solve-analysis data are created; on subsequent calls (factors already
   allocated) only the numerical values are recomputed on the host and
   re-uploaded.  No-op when n == 0 or when the freshest data is already on the
   GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;  /* pinned host CSR row offsets / column indices of U */
  PetscScalar                       *AAUp;         /* pinned host values of U */
  PetscScalar                       *AALo;         /* pinned host values for the transposed (lower) solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ and Mat_SeqSBAIJ; this
     relies on the Cholesky factor exposing SBAIJ-compatible i/j/a fields --
     TODO confirm against the factorization path that produced A */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the freshest values live on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first build: allocate space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; the stored diagonal is the
             reciprocal, so the solve multiplies instead of dividing */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* negate the off-diagonal entries; the lower-solve values are
               additionally scaled by the reciprocal diagonal */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* the lower factor is not stored separately: it reuses the sparsity of
           U (hence UPPER fill mode) and is solved with the TRANSPOSE operation
           set just below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* factors already exist: refresh only the numerical values; the
           sparsity pattern (row offsets / column indices) is unchanged */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
891 
/*
   Builds the ICC triangular factors on the GPU and, when the ordering is not
   the identity, uploads both the permutation and its inverse for use by the
   permuted MatSolve path.  Marks the factor matrix as resident on both host
   and device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscBool                    permIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* both triangular solves share the strictly-triangular entries; the diagonal is counted once */
  factors->nnz = (aij->nz-nrows)*2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the permutation (rows) and its inverse (columns) when needed */
  ierr = ISIdentity(perm,&permIsIdentity);CHKERRQ(ierr);
  if (!permIsIdentity) {
    IS             inversePerm;
    const PetscInt *pidx,*ipidx;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&inversePerm);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&pidx);CHKERRQ(ierr);
    ierr = ISGetIndices(inversePerm,&ipidx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(pidx,pidx+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(ipidx,ipidx+nrows);
    ierr = ISRestoreIndices(inversePerm,&ipidx);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&pidx);CHKERRQ(ierr);
    ierr = ISDestroy(&inversePerm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
929 
/*
   Numeric LU factorization: runs the CPU SeqAIJ factorization, then selects
   the MatSolve implementation (the natural-ordering variants avoid the
   permutation step) and copies the triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIsIdentity,colIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself happens on the CPU */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(fact->row,&rowIsIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colIsIdentity);CHKERRQ(ierr);
  if (rowIsIdentity && colIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
959 
/*
   Numeric Cholesky factorization: runs the CPU SeqAIJ factorization, then
   selects the MatSolve implementation (natural-ordering variants skip the
   permutation) and copies the ICC triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      permIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself happens on the CPU */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(fact->row,&permIsIdentity);CHKERRQ(ierr);
  B->ops->solve          = permIsIdentity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                          : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = permIsIdentity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                          : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
988 
/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds explicit transposes (via
   csr2csc, i.e. CSC storage reinterpreted as CSR) of the lower and upper
   triangular factors and runs the cuSPARSE solve analysis on them, so that
   transposed solves can use NON_TRANSPOSE triangular kernels.  Results are
   stored in cusparseTriFactors->{lo,up}TriFactorPtrTranspose.

   Bug fix: after each csr2csc conversion the code called
   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,...) a second time instead
   of PetscLogEventEnd, so the event begun before the conversion was never
   ended, unbalancing PETSc's event-logging stack (-log_view).
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor:
     transposition flips the fill mode, everything else carries over */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11 csr2cscEx2 needs an explicit workspace; its size/handle are kept
     on the source factor so it can be reused */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1190 
/* Build the explicit transpose of A on the GPU (as a separate cuSPARSE matrix)
   for use by transpose matrix-vector products.

   No-op unless transpose generation was requested (cusparsestruct->transgen)
   and the transpose has not already been built (cusparsestruct->matTranspose).
   For CSR storage the transpose is obtained with a csr2csc conversion; for
   ELL/HYB (pre CUDA-11 only) the matrix is round-tripped HYB->CSR->CSC->HYB.
   The result is stored in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Nothing to do if transpose generation is disabled or the transpose already exists */
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  /* the transpose inherits the index base of the original matrix */
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalar constants for SpMV calls
     (pointer mode is set to DEVICE below) */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* transpose has swapped dimensions; nnz is unchanged */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* 32-bit copy of the host row offsets a->i, needed as csr2csc input.
       NOTE(review): allocated unconditionally here; presumably any previous
       rowoffsets_gpu was freed on the rebuild path (see MatSeqAIJCUSPARSECopyToGPU) —
       verify to rule out a leak */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA 11 requires an explicit user-provided workspace: query its size, then allocate */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            /* pre-CUDA-11 csr2csc takes (cscRowInd, cscColPtr) in this order */
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
                           );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic-API descriptor for the transpose, needed by cusparseSpMV/SpMM */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    /* HYB/ELL path: go through temporary CSR matrices since cuSPARSE has no direct HYB transpose */
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1349 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the cached transposed triangular factors, with row/column
   permutations applied on the GPU via thrust.

   Pipeline: x = P_r b  ->  solve U^T (into temp)  ->  solve L^T (into x)
             ->  x = P_c x (through temp, since the permuted copy cannot be in place).
   The transposed factors are built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* re-read the pointers populated by the analysis call above */
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U (the factors are applied in reverse order for the transpose solve) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* ~2*nnz flops for two triangular solves, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1436 
/* Solve A^T x = b with the transposed triangular factors when the factorization
   used the natural ordering: no row/column permutations are needed, so b feeds
   the U^T solve directly and the L^T solve writes straight into x.
   The transposed factors are built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* re-read the pointers populated by the analysis call above */
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U (factors applied in reverse order for the transpose solve): temp = U^-T b */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L: x = L^-T temp */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* ~2*nnz flops for two triangular solves, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1504 
/* Solve A x = b using the cached L/U triangular factors, applying row and
   column permutations on the GPU via thrust.

   Pipeline: temp = P_r b  ->  solve L (into x)  ->  solve U (into temp)
             ->  x = P_c temp. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: temp = b(rperm).
     The copy length is determined by the rpermIndices iterator pair, so the
     same bGPU base is used for both permutation iterators. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x = temp(cperm) */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* ~2*nnz flops for two triangular solves, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1579 
/* Solve A x = b with the L/U triangular factors when the factorization used
   the natural ordering: no permutations are needed, so b feeds the L solve
   directly and the U solve writes straight into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: temp = L^-1 b */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U: x = U^-1 temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* ~2*nnz flops for two triangular solves, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1639 
/* Mirror the matrix values from the GPU back to the host CSR arrays.

   Only acts when the up-to-date copy lives exclusively on the GPU
   (offloadmask == PETSC_OFFLOAD_GPU); afterwards both copies are in sync
   and the mask is set to PETSC_OFFLOAD_BOTH.  Only the values are copied:
   the sparsity pattern (i/j arrays) is assumed unchanged on the device. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* Host copy already current (CPU or BOTH) or nothing allocated: nothing to do */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

  CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, matrix->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
1660 
/* Hand out the host pointer to the matrix values, syncing from the GPU first.

   Because the caller may modify the returned array, the device copy is marked
   stale by setting offloadmask to PETSC_OFFLOAD_CPU. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* Ensure the host values are current before exposing them */
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  /* The caller can write through this pointer, so only the CPU copy is valid now */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1672 
1673 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1674 {
1675   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1676   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1677   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1678   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1679   PetscErrorCode               ierr;
1680   cusparseStatus_t             stat;
1681   cudaError_t                  err;
1682 
1683   PetscFunctionBegin;
1684   if (A->boundtocpu) PetscFunctionReturn(0);
1685   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1686     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1687       /* Copy values only */
1688       CsrMatrix *matrix,*matrixT;
1689       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1690 
1691       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1692       matrix->values->assign(a->a, a->a+a->nz);
1693       err  = WaitForCUDA();CHKERRCUDA(err);
1694       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1695       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1696 
1697       /* Update matT when it was built before */
1698       if (cusparsestruct->matTranspose) {
1699         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1700         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1701         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1702         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1703                             A->cmap->n, matrix->num_entries,
1704                             matrix->values->data().get(),
1705                             cusparsestruct->rowoffsets_gpu->data().get(),
1706                             matrix->column_indices->data().get(),
1707                             matrixT->values->data().get(),
1708                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1709                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1710                             CUSPARSE_ACTION_NUMERIC,indexBase,
1711                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1712                           #else
1713                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1714                             CUSPARSE_ACTION_NUMERIC, indexBase
1715                           #endif
1716                            );CHKERRCUSPARSE(stat);
1717         err  = WaitForCUDA();CHKERRCUDA(err);
1718         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1719       }
1720     } else {
1721       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1722       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1723       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1724       delete cusparsestruct->workVector;
1725       delete cusparsestruct->rowoffsets_gpu;
1726       try {
1727         if (a->compressedrow.use) {
1728           m    = a->compressedrow.nrows;
1729           ii   = a->compressedrow.i;
1730           ridx = a->compressedrow.rindex;
1731         } else {
1732           m    = A->rmap->n;
1733           ii   = a->i;
1734           ridx = NULL;
1735         }
1736         cusparsestruct->nrows = m;
1737 
1738         /* create cusparse matrix */
1739         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1740         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1741         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1742         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1743 
1744         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1745         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1746         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1747         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1748         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1749         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1750         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1751 
1752         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1753         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1754           /* set the matrix */
1755           CsrMatrix *mat= new CsrMatrix;
1756           mat->num_rows = m;
1757           mat->num_cols = A->cmap->n;
1758           mat->num_entries = a->nz;
1759           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1760           mat->row_offsets->assign(ii, ii + m+1);
1761 
1762           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1763           mat->column_indices->assign(a->j, a->j+a->nz);
1764 
1765           mat->values = new THRUSTARRAY(a->nz);
1766           mat->values->assign(a->a, a->a+a->nz);
1767 
1768           /* assign the pointer */
1769           matstruct->mat = mat;
1770          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1771           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1772             stat = cusparseCreateCsr(&matstruct->matDescr,
1773                                     mat->num_rows, mat->num_cols, mat->num_entries,
1774                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1775                                     mat->values->data().get(),
1776                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1777                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1778           }
1779          #endif
1780         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1781          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1782           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1783          #else
1784           CsrMatrix *mat= new CsrMatrix;
1785           mat->num_rows = m;
1786           mat->num_cols = A->cmap->n;
1787           mat->num_entries = a->nz;
1788           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1789           mat->row_offsets->assign(ii, ii + m+1);
1790 
1791           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1792           mat->column_indices->assign(a->j, a->j+a->nz);
1793 
1794           mat->values = new THRUSTARRAY(a->nz);
1795           mat->values->assign(a->a, a->a+a->nz);
1796 
1797           cusparseHybMat_t hybMat;
1798           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1799           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1800             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1801           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1802               matstruct->descr, mat->values->data().get(),
1803               mat->row_offsets->data().get(),
1804               mat->column_indices->data().get(),
1805               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1806           /* assign the pointer */
1807           matstruct->mat = hybMat;
1808 
1809           if (mat) {
1810             if (mat->values) delete (THRUSTARRAY*)mat->values;
1811             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1812             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1813             delete (CsrMatrix*)mat;
1814           }
1815          #endif
1816         }
1817 
1818         /* assign the compressed row indices */
1819         if (a->compressedrow.use) {
1820           cusparsestruct->workVector = new THRUSTARRAY(m);
1821           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1822           matstruct->cprowIndices->assign(ridx,ridx+m);
1823           tmp = m;
1824         } else {
1825           cusparsestruct->workVector = NULL;
1826           matstruct->cprowIndices    = NULL;
1827           tmp = 0;
1828         }
1829         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1830 
1831         /* assign the pointer */
1832         cusparsestruct->mat = matstruct;
1833       } catch(char *ex) {
1834         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1835       }
1836       err  = WaitForCUDA();CHKERRCUDA(err);
1837       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1838       cusparsestruct->nonzerostate = A->nonzerostate;
1839     }
1840     A->offloadmask = PETSC_OFFLOAD_BOTH;
1841   }
1842   PetscFunctionReturn(0);
1843 }
1844 
/* Thrust functor used with zip iterators: accumulate tuple element 0 into tuple
   element 1 (dest += src). Used to scatter-add the compressed-row work vector
   into the full result vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1854 
/* Thrust functor used with zip iterators: copy tuple element 0 into tuple element 1 (dest = src). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1864 
/* Thrust functor used with zip iterators: copy tuple element 1 into tuple element 0
   (the reverse direction of VecCUDAEquals). Used to gather entries of x into the
   compressed-row work vector. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1874 
/* Workspace attached to C->product->data for SeqAIJCUSPARSE * SeqDenseCUDA products,
   created in MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA and freed by
   MatDestroy_MatMatCusparse. */
struct MatMatCusparse {
  PetscBool            cisdense;       /* C was MATSEQDENSE (CPU) on entry; convert it back after computing on the GPU */
  PetscScalar          *Bt;            /* CUDA < 11 only: device buffer for B^T, since csrmm cannot transpose B */
  Mat                  X;              /* intermediate dense result used by the PtAP/RARt paths */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;      /* cuSPARSE dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;      /* cuSPARSE dense descriptor for C (or for X in PtAP/RARt) */
  size_t               spmmBufferSize; /* size in bytes of the cusparseSpMM workspace */
  void                 *spmmBuffer;    /* cusparseSpMM workspace (device memory) */
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#endif
};
1888 
/* Destructor callback for MatMatCusparse product data: releases the B^T device
   buffer, the cuSPARSE dense descriptors and SpMM workspace (CUDA >= 11), the
   intermediate matrix X, and finally the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;
  PetscErrorCode ierr;
  cudaError_t    cuerr;

  PetscFunctionBegin;
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr); /* cudaFree(NULL) is a no-op */
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t cstat;
  if (mm->matBDescr)  {cstat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(cstat);}
  if (mm->matCDescr)  {cstat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(cstat);}
  if (mm->spmmBuffer) {cuerr = cudaFree(mm->spmmBuffer);CHKERRCUDA(cuerr);}
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1907 
1908 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1909 
/* Numeric phase of C = op(A) op(B) for A SeqAIJCUSPARSE and B dense (AB, AtB, ABt,
   PtAP, RARt). The sparse-dense product is done with cuSPARSE SpMM (csrmm pre
   CUDA-11); for PtAP/RARt the partial result is accumulated in mmdata->X and a
   final dense-dense product against B produces C. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* Pick op(A), the CSR data to use (A or its explicitly stored transpose), and the
     m x n shape of the sparse-dense product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* prefer the explicitly generated transpose over the transposed-operation path */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-dense product into the intermediate X, the others directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    /* a changed leading dimension invalidates the cached dense descriptor: destroy and rebuild */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                              csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                              csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                              csrmat->values->data().get(),
                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                              CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* explicitly transpose B into the buffer Bt allocated in the symbolic phase */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* RARt/PtAP: finish with a dense-dense product of B and X; the boolean flags differ
     between the two (transpose handling inside the Private helper — see its definition) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the types the caller passed in if we converted above */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2079 
/* Symbolic phase for C = op(A) op(B) with A SeqAIJCUSPARSE, B dense: sets the sizes
   and type of C and attaches the MatMatCusparse workspace used by the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          isdense,isaijcusp;
  PetscErrorCode     ierr;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isaijcusp);CHKERRQ(ierr);
  if (!isaijcusp) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the result for each supported product type */
  if (product->type == MATPRODUCT_AB) {
    m = A->rmap->n;
    n = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    m = A->cmap->n;
    n = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    m = A->rmap->n;
    n = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    m = B->cmap->n;
    n = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    m = B->rmap->n;
    n = B->rmap->n;
  } else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* workspace handed to the numeric phase through C->product->data */
  ierr = PetscNew(&mm);CHKERRQ(ierr);
  mm->cisdense = isdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mm->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mm->X);CHKERRQ(ierr);
    ierr = MatSetType(mm->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mm->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2153 
2154 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2155 
2156 /* handles dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  /* A bound to CPU: defer to the host (SeqAIJ * SeqDense) implementation */
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* install the GPU symbolic routine for the supported product types; leave C untouched otherwise */
  if (product->type == MATPRODUCT_AB  || product->type == MATPRODUCT_AtB  ||
      product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_PtAP ||
      product->type == MATPRODUCT_RARt) {
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
  }
  PetscFunctionReturn(0);
}
2181 
/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* no vector to add (NULL), no transpose, no Hermitian */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2190 
/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* add yy, no transpose, no Hermitian */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2199 
/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* no vector to add (NULL), transpose = true, Hermitian = true */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2208 
/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* add yy, transpose = true, Hermitian = true */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2217 
/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* no vector to add (NULL), transpose = true, Hermitian = false */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2226 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared kernel behind MatMult/MatMultAdd/MatMultTranspose(Add)/MatMultHermitianTranspose(Add).
   yy may be NULL (no add) and yy == zz is allowed (in-place add). When A stores compressed
   rows (all-zero rows dropped), a work vector plus thrust scatter/gather loops map between
   the compressed and full-length representations. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y for the cuSPARSE dense vector descriptors */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  /* no nonzero rows: op(A) x is zero, so the result is just y (or 0) */
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Hermitian, or no explicit transpose available: use a transposed operation on A itself;
       otherwise multiply with the explicitly stored transpose (generated on demand) */
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cuSpMV cache below, so guard against enum changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: the SpMV already computed zz = A^T x + beta*yy when yy == zz;
         otherwise add yy explicitly */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2410 
/* MatMultTransposeAdd: zz = A^T*xx + yy, delegated to the shared SpMV kernel.
   NOTE(review): the two trailing flags presumably select transpose=PETSC_TRUE,
   hermitian=PETSC_FALSE -- confirm against MatMultAddKernel_SeqAIJCUSPARSE's
   signature (defined earlier in this file). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2419 
/*
   Assembly hook for SEQAIJCUSPARSE.  If a split-CSR device matrix exists
   (i.e. the matrix may have been assembled directly on the GPU), pull its
   bookkeeping (nonzero state, seq flag) back to the host and use it to decide
   whether the host or the device copy is the fresh one.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL, h_mat;
  PetscBool                   is_seq = PETSC_TRUE;
  PetscInt                    nnz_state = A->nonzerostate; /* default: host state wins */
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  if (d_mat) {
    cudaError_t err;
    ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
    /* fetch the device-side header struct (not the matrix data) to the host */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    nnz_state = h_mat.nonzerostate;
    is_seq = h_mat.seq;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  } else if (nnz_state > A->nonzerostate) {
    /* the device copy is newer than the host copy: mark GPU as the valid side */
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}
2447 
2448 /* --------------------------------------------------------------------------------*/
2449 /*@
2450    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2452    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2453    assembly performance the user should preallocate the matrix storage by setting
2454    the parameter nz (or the array nnz).  By setting these parameters accurately,
2455    performance during matrix assembly can be increased by more than a factor of 50.
2456 
2457    Collective
2458 
2459    Input Parameters:
2460 +  comm - MPI communicator, set to PETSC_COMM_SELF
2461 .  m - number of rows
2462 .  n - number of columns
2463 .  nz - number of nonzeros per row (same for all rows)
2464 -  nnz - array containing the number of nonzeros in the various rows
2465          (possibly different for each row) or NULL
2466 
2467    Output Parameter:
2468 .  A - the matrix
2469 
2470    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2472    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2473 
2474    Notes:
2475    If nnz is given then nz is ignored
2476 
2477    The AIJ format (also called the Yale sparse matrix format or
2478    compressed row storage), is fully compatible with standard Fortran 77
2479    storage.  That is, the stored row and column indices can begin at
2480    either one (as in Fortran) or zero.  See the users' manual for details.
2481 
2482    Specify the preallocated storage with either nz or nnz (not both).
2483    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2484    allocation.  For large problems you MUST preallocate memory or you
2485    will get TERRIBLE performance, see the users' manual chapter on matrices.
2486 
2487    By default, this format uses inodes (identical nodes) when possible, to
2488    improve numerical efficiency of matrix-vector products and solves. We
2489    search for consecutive rows with the same nonzero structure, thereby
2490    reusing matrix information to achieve increased efficiency.
2491 
2492    Level: intermediate
2493 
2494 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2495 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* per the docstring above, a non-NULL nnz[] takes precedence over nz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2507 
/*
   Destroy hook for SEQAIJCUSPARSE: frees the GPU-side structs (mult struct or
   triangular factors), any split-CSR device matrix, detaches the composed
   type-specific methods, then delegates to the host AIJ destroy.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    if (A->spptr) { /* guard: spptr may be NULL if the type conversion failed before allocating it */
      d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
      ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; /* detach before freeing the wrapper */
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device-side header to the host so we can free what it points at */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (h_mat.seq) {
      if (a->compressedrow.use) {
        err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); /* row index array only allocated for compressed rows */
      }
      err = cudaFree(d_mat);CHKERRCUDA(err);
    }
  }
  /* remove the composed functions installed by MatConvert/MatBindToCPU */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2543 
2544 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2545 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate via the host AIJ path, then convert the copy in place back to
   SEQAIJCUSPARSE so the new matrix gets its own GPU structures. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2555 
/*
   Bind (flg=PETSC_TRUE) or unbind (flg=PETSC_FALSE) the matrix to the CPU:
   swaps the mult/multadd/... function table between the host AIJ kernels and
   the CUSPARSE kernels and composes/removes the GPU-only methods.
   Factored matrices are left untouched.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is current before handing control to the CPU kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    /* remove the GPU-only composed methods */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
  } else {
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    /* install the GPU-only composed methods */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inode optimization enabled only while running on the CPU */
  PetscFunctionReturn(0);
}
2594 
/*
   Zero all stored values: the GPU CSR copy (and its transpose if built), the
   optional split-CSR device matrix, and finally the host arrays.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE; /* were both host and GPU CSR values zeroed? */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    d_mat = spptr->deviceMat;
  }
  if (d_mat) {
    Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
    PetscInt     n = A->rmap->n, nnz = a->i[n];
    cudaError_t  err;
    PetscScalar  *vals;
    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    /* read the device-side values pointer out of the split-CSR header, then
       zero the nnz scalars it addresses (memset with 0 is valid for floats) */
    err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;

  PetscFunctionReturn(0);
}
2633 
/*
   Convert a SEQAIJ matrix to SEQAIJCUSPARSE.  Depending on reuse, the host
   data is duplicated (MAT_INITIAL_MATRIX), copied (MAT_REUSE_MATRIX), or left
   alone (MAT_INPLACE_MATRIX).  Then the CUSPARSE bookkeeping struct is
   attached and the GPU method table installed.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX nothing was copied: B is A itself */

  /* vectors created for this matrix should default to the CUDA type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: allocate the mult/COO bookkeeping struct */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrix: allocate the triangular-factors struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  /* unbinding installs the GPU mult kernels and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2681 
/*
   Registered constructor for MATSEQAIJCUSPARSE: build a host SEQAIJ matrix,
   convert it in place to the CUSPARSE type, then process type-specific
   command-line options.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* ensure a CUDA device is usable */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscObjectOptionsBegin declares the PetscOptionsObject used below */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2695 
2696 /*MC
2697    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2698 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2700    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2701    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2702 
2703    Options Database Keys:
2704 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2705 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2706 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2707 
2708   Level: beginner
2709 
2710 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2711 M*/
2712 
2713 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2714 
2715 
/*
   Register the CUSPARSE solver package for every factorization kind it
   supports; each entry is backed by the same factory routine.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType kinds[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  int                 k;

  PetscFunctionBegin;
  for (k = 0; k < (int)(sizeof(kinds)/sizeof(kinds[0])); k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,kinds[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2727 
/*
   Free the Mat_SeqAIJCUSPARSE wrapper: both mult structs (plain and
   transpose), the thrust work/permutation arrays, the cusparse handle and the
   csr2csc scratch buffer.  Safe to call with *cusparsestruct == NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector; /* deleting NULL is a no-op */
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->cooPerm_v;
    delete (*cusparsestruct)->cooPerm_w;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* guard for consistency with MatSeqAIJCUSPARSEMultStruct_Destroy (trifactor
       variant); cudaFree(NULL) is legal but the check keeps the paths uniform */
    if ((*cusparsestruct)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2751 
/* Release the three thrust device arrays owned by a CsrMatrix, then the
   struct itself; the caller's pointer is reset to NULL.  NULL-safe. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
2764 
/*
   Free one triangular-factor struct: matrix descriptor, analysis info, CSR
   storage, and the solve/csr2csc/host scratch buffers.  NULL-safe.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* pinned host memory */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2784 
/*
   Free one mult struct: the matrix storage (CSR or HYB depending on format),
   the descriptor, the compressed-row index array, the device-resident scalar
   constants, and (CUDA >= 11) the SpMV descriptors/buffers.  NULL-safe.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* previously the return code was silently dropped; check it like
           every other destroy in this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices; /* deleting NULL is a no-op */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV setup per operation (see cuSpMV[] usage earlier in file) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
2828 
/*
   Reset a triangular-factors container without freeing the container itself:
   destroy the four solve structs (lower/upper, plain/transpose), release the
   permutation arrays and work vector, and NULL the freed members so a later
   Reset/Destroy is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete fs->rpermIndices;
    fs->rpermIndices = NULL;
    delete fs->cpermIndices;
    fs->cpermIndices = NULL;
    delete fs->workVector;
    fs->workVector = NULL;
  }
  PetscFunctionReturn(0);
}
2848 
/*
   Fully destroy a triangular-factors container: reset its contents, destroy
   the cusparse handle, free the struct.  NULL-safe.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* split assignment out of the condition: same behavior, no -Wparentheses
       ambiguity about whether '==' was intended */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2865 
/* Strict-weak ordering on (i,j) pairs: lexicographic, row major. Used to sort
   COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
2876 
/* Equality on (i,j) pairs: true iff both components match. Used to collapse
   duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
2886 
/* Binary op for adjacent_difference: 1 where consecutive values differ, else 0. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
2895 
/* Logical OR of two flags, expressed as 0/1. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
2904 
2905 #include <thrust/iterator/discard_iterator.h>
/*
   Insert/add the user values v (ordered as the original COO (i,j) list) into
   the GPU CSR matrix, using the permutation built in
   MatSetPreallocationCOO_SeqAIJCUSPARSE:
     cooPerm   - maps sorted-COO slots back to positions in v
     cooPerm_a - present only when duplicate (i,j) entries exist; segment keys
                 for reducing duplicates
     cooPerm_v - device copy of v;  cooPerm_w - scratch for reduced sums
*/
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info: just run a plain (no-op value) assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  n = cusp->cooPerm->size();
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (v) {
    cusp->cooPerm_v->assign(v,v+n); /* host -> device copy of the user's values */
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
  if (imode == ADD_VALUES) {
    if (cusp->cooPerm_a) {
      /* duplicates exist: sum values sharing a segment key, then add the
         reduced sums into the CSR values */
      if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: add each permuted value straight into its CSR slot */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence)
                              if we are inserting two different values into the same location */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    } else {
      /* unique entries: overwrite each CSR slot with its permuted value */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2970 
2971 #include <thrust/binary_search.h>
2972 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
2973 {
2974   PetscErrorCode     ierr;
2975   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2976   CsrMatrix          *matrix;
2977   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2978   PetscInt           cooPerm_n, nzr = 0;
2979   cudaError_t        cerr;
2980 
2981   PetscFunctionBegin;
2982   ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
2983   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
2984   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
2985   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
2986   if (n != cooPerm_n) {
2987     delete cusp->cooPerm;
2988     delete cusp->cooPerm_v;
2989     delete cusp->cooPerm_w;
2990     delete cusp->cooPerm_a;
2991     cusp->cooPerm = NULL;
2992     cusp->cooPerm_v = NULL;
2993     cusp->cooPerm_w = NULL;
2994     cusp->cooPerm_a = NULL;
2995   }
2996   if (n) {
2997     THRUSTINTARRAY d_i(n);
2998     THRUSTINTARRAY d_j(n);
2999     THRUSTINTARRAY ii(A->rmap->n);
3000 
3001     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
3002     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
3003 
3004     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
3005     d_i.assign(coo_i,coo_i+n);
3006     d_j.assign(coo_j,coo_j+n);
3007     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3008     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
3009 
3010     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3011     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
3012     *cusp->cooPerm_a = d_i;
3013     THRUSTINTARRAY w = d_j;
3014 
3015     auto nekey = thrust::unique(fkey, ekey, IJEqual());
3016     if (nekey == ekey) { /* all entries are unique */
3017       delete cusp->cooPerm_a;
3018       cusp->cooPerm_a = NULL;
3019     } else { /* I couldn't come up with a more elegant algorithm */
3020       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
3021       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
3022       (*cusp->cooPerm_a)[0] = 0;
3023       w[0] = 0;
3024       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
3025       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
3026     }
3027     thrust::counting_iterator<PetscInt> search_begin(0);
3028     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
3029                         search_begin, search_begin + A->rmap->n,
3030                         ii.begin());
3031 
3032     ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
3033     a->singlemalloc = PETSC_FALSE;
3034     a->free_a       = PETSC_TRUE;
3035     a->free_ij      = PETSC_TRUE;
3036     ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
3037     a->i[0] = 0;
3038     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3039     a->nz = a->maxnz = a->i[A->rmap->n];
3040     ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
3041     ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
3042     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3043     if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
3044     if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
3045     for (PetscInt i = 0; i < A->rmap->n; i++) {
3046       const PetscInt nnzr = a->i[i+1] - a->i[i];
3047       nzr += (PetscInt)!!(nnzr);
3048       a->ilen[i] = a->imax[i] = nnzr;
3049     }
3050     A->preallocated = PETSC_TRUE;
3051     ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
3052   } else {
3053     ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
3054   }
3055   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3056   ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
3057 
3058   /* We want to allocate the CUSPARSE struct for matvec now.
3059      The code is so convoluted now that I prefer to copy garbage to the GPU */
3060   ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
3061   A->offloadmask = PETSC_OFFLOAD_CPU;
3062   A->nonzerostate++;
3063   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3064   {
3065     matrix = (CsrMatrix*)cusp->mat->mat;
3066     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3067     thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3068     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
3069   }
3070 
3071   A->offloadmask = PETSC_OFFLOAD_CPU;
3072   A->assembled = PETSC_FALSE;
3073   A->was_assembled = PETSC_FALSE;
3074   PetscFunctionReturn(0);
3075 }
3076