xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 61a622f347a73eb274681fd3fc19f99d2ac97e45)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
/* Storage-format option names; the trailing entries (enum type name, option prefix, NULL) follow the array layout required by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
45       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47   } cusparseCsr2CscAlg_t;
48   */
  /* Option-name tables mirroring cusparseSpMVAlg_t / cusparseSpMMAlg_t / cusparseCsr2CscAlg_t in 0-based value order
     (see the enum copies in the comment above); PetscOptionsEnum() maps a chosen name to its position in the array */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
76 
77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
86 
87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
89 
90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
/*
   MatCUSPARSESetStream - Records the given CUDA stream in the matrix' GPU data
   structure and tells the cuSPARSE library to run its kernels on that stream.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE structure
-  stream - the CUDA stream to use for subsequent cuSPARSE calls on A
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
103 
/*
   MatCUSPARSESetHandle - Installs a user-supplied cuSPARSE handle on the matrix,
   destroying any previously owned handle, and switches the handle to device
   pointer mode (scalars such as alpha/beta are read from GPU memory).

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE structure
-  handle - the cuSPARSE library handle to use
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the handle we currently own before adopting the new one */
    if (cusp->handle) {cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);}
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
120 
/*
   MatCUSPARSEClearHandle - Forgets (without destroying) the cuSPARSE handle
   stored on the matrix; used when the handle is owned elsewhere. A no-op for
   matrices that are not of type MATSEQAIJCUSPARSE or have no GPU structure.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (iscusparse && cusp && cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Reports MATSOLVERCUSPARSE as the solver package that produced this factored matrix;
   composed on factor matrices as the "MatFactorGetSolverType_C" method */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the (square) factor matrix B
   for a SEQAIJCUSPARSE matrix A and wires up the symbolic-factorization
   entry points and preferred orderings for the requested factor type.
   Unsupported factor types raise PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  /* the factor is square with the (local) row size of A */
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    /* nested dissection for full LU; natural ordering for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
184 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation backing
   MatCUSPARSESetFormat(). For sequential matrices the MULT and ALL operations
   both select the single stored format; anything else is unsupported.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: a seq matrix has one format for everything */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
202 
203 /*@
204    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
205    operation. Only the MatMult operation can use different GPU storage formats
206    for MPIAIJCUSPARSE matrices.
207    Not Collective
208 
209    Input Parameters:
210 +  A - Matrix of type SEQAIJCUSPARSE
211 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
212 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
213 
214    Output Parameter:
215 
216    Level: intermediate
217 
218 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
219 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; PetscTryMethod() is a no-op
     for matrix types that do not compose "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
229 
/*
   MatSetOption_SeqAIJCUSPARSE - Handles MAT_FORM_EXPLICIT_TRANSPOSE locally
   (invalidating any cached GPU transpose when the option is turned off) and
   forwards every other option to the CPU SeqAIJ implementation.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
247 
248 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
249 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization: syncs A back
   to the host, runs the CPU kernel, then selects the solve kernels (the
   natural-ordering variants skip row/column permutations) and pushes the
   triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no MatMatSolve support in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
280 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes runtime options for a
   SEQAIJCUSPARSE matrix: the GPU storage format for SpMV (and for TriSolve via
   -mat_cusparse_storage_format), and, when built against CUDA >= 11, the
   cuSPARSE SpMV/SpMM/csr2csc algorithm selections. Factored matrices
   (factortype != MAT_FACTOR_NONE) take none of these options.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* the spmv/spmm/csr2csc options write directly into cusparsestruct; the sanity checks below
       catch a cuSPARSE header whose enum values no longer match the positions in our name arrays */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
316 
/* Symbolic ILU: drop any stale GPU triangular factors, run the CPU symbolic
   phase, then route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
328 
/* Symbolic LU: drop any stale GPU triangular factors, run the CPU symbolic
   phase, then route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
340 
/* Symbolic ICC: drop any stale GPU triangular factors, run the CPU symbolic
   phase, then route the numeric phase to the CUSPARSE Cholesky kernel. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
352 
/* Symbolic Cholesky: drop any stale GPU triangular factors, run the CPU
   symbolic phase, then route the numeric phase to the CUSPARSE kernel. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
364 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds (first call) or refreshes
   (subsequent calls) the unit-diagonal lower triangular factor L on the GPU in
   CSR form, then runs the cuSPARSE triangular-solve analysis on it.
   Assumes A->data holds the combined ILU/LU factors in Mat_SeqAIJ layout, with
   a->i/a->j covering the strictly-lower part -- TODO confirm against the
   SeqAIJ factored-matrix storage convention.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;    /* host staging buffers for L's row offsets / column indices */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* rebuild only when the host copy is authoritative */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1 (row 0 contributes only its diagonal) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host memory so the later thrust assign()s can transfer efficiently */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;       /* row 0 holds only its unit diagonal */
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* assign() copies the pinned host staging arrays to device vectors */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit workspace for csrsv2 analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep AALo alive (as AA_h) for value-only updates on later calls; the index staging arrays are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        /* sparsity pattern is unchanged: rewrite only the values, same traversal as the build branch */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      /* NOTE(review): assumes failures are thrown as char*; thrust/cuSPARSE wrappers
         typically throw std::exception subclasses, which this handler would not catch -- confirm */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
509 
510 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
511 {
512   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
513   PetscInt                          n = A->rmap->n;
514   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
515   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
516   cusparseStatus_t                  stat;
517   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
518   const MatScalar                   *aa = a->a,*v;
519   PetscInt                          *AiUp, *AjUp;
520   PetscInt                          i,nz, nzUpper, offset;
521   PetscErrorCode                    ierr;
522   cudaError_t                       cerr;
523 
524   PetscFunctionBegin;
525   if (!n) PetscFunctionReturn(0);
526   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
527     try {
528       /* next, figure out the number of nonzeros in the upper triangular matrix. */
529       nzUpper = adiag[0]-adiag[n];
530       if (!upTriFactor) {
531         PetscScalar *AAUp;
532 
533         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
534 
535         /* Allocate Space for the upper triangular matrix */
536         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
537         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
538 
539         /* Fill the upper triangular matrix */
540         AiUp[0]=(PetscInt) 0;
541         AiUp[n]=nzUpper;
542         offset = nzUpper;
543         for (i=n-1; i>=0; i--) {
544           v  = aa + adiag[i+1] + 1;
545           vi = aj + adiag[i+1] + 1;
546 
547           /* number of elements NOT on the diagonal */
548           nz = adiag[i] - adiag[i+1]-1;
549 
550           /* decrement the offset */
551           offset -= (nz+1);
552 
553           /* first, set the diagonal elements */
554           AjUp[offset] = (PetscInt) i;
555           AAUp[offset] = (MatScalar)1./v[nz];
556           AiUp[i]      = AiUp[i+1] - (nz+1);
557 
558           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
559           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
560         }
561 
562         /* allocate space for the triangular factor information */
563         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
564         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
565 
566         /* Create the matrix description */
567         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
568         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
569        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
570         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
571        #else
572         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
573        #endif
574         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
575         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
576 
577         /* set the operation */
578         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
579 
580         /* set the matrix */
581         upTriFactor->csrMat = new CsrMatrix;
582         upTriFactor->csrMat->num_rows = n;
583         upTriFactor->csrMat->num_cols = n;
584         upTriFactor->csrMat->num_entries = nzUpper;
585 
586         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
587         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
588 
589         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
590         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
591 
592         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
593         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
594 
595         /* Create the solve analysis information */
596         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
597         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
598       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
599         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
600                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
601                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
602                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
603                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
604         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
605       #endif
606 
607         /* perform the solve analysis */
608         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
609                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
610                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
611                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
612                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
613                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
614                                #endif
615 );CHKERRCUSPARSE(stat);
616         cerr = WaitForCUDA();CHKERRCUDA(cerr);
617         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
618 
619         /* assign the pointer */
620         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
621         upTriFactor->AA_h = AAUp;
622         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
623         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
624         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
625       } else {
626         if (!upTriFactor->AA_h) {
627           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
628         }
629         /* Fill the upper triangular matrix */
630         offset = nzUpper;
631         for (i=n-1; i>=0; i--) {
632           v  = aa + adiag[i+1] + 1;
633 
634           /* number of elements NOT on the diagonal */
635           nz = adiag[i] - adiag[i+1]-1;
636 
637           /* decrement the offset */
638           offset -= (nz+1);
639 
640           /* first, set the diagonal elements */
641           upTriFactor->AA_h[offset] = 1./v[nz];
642           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
643         }
644         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
645         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
646       }
647     } catch(char *ex) {
648       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
649     }
650   }
651   PetscFunctionReturn(0);
652 }
653 
/*
   Push the ILU(0) triangular factors of A to the GPU and cache, on the
   device, the row/column permutations needed by the permuted MatSolve path.
   Safe to call repeatedly: the work vector and permutation arrays are only
   allocated the first time they are needed.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow    = a->row;
  IS                           iscol    = a->icol;
  PetscBool                    rowIdent,colIdent;
  PetscInt                     n        = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* copy both triangular factors to the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector shared by the two triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device, but only when it is non-trivial */
  ierr = ISIdentity(isrow,&rowIdent);CHKERRQ(ierr);
  if (!rowIdent && !factors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(r,r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* likewise for the column permutation */
  ierr = ISIdentity(iscol,&colIdent);CHKERRQ(ierr);
  if (!colIdent && !factors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(c,c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
697 
/*
   Build (first call) or refresh (subsequent calls) the GPU copies of the
   ICC triangular factors of A.

   The host factor is read through SBAIJ-style arrays (b->i/b->j/b->a);
   the diagonal entry of row i is read as v[nz], i.e. the LAST stored
   entry of the row (see the "exclude diag[i]" computation below).

   Only the upper triangle is stored on the GPU.  The "lower" factor
   reuses the same sparsity (AiUp/AjUp) with scaled values (AALo) and is
   solved with CUSPARSE_OPERATION_TRANSPOSE, which is why its descriptor
   deliberately says CUSPARSE_FILL_MODE_UPPER.

   First call: allocates both factor structures, uploads structure+values,
   and runs the cusparse solve analysis.  Later calls (numeric refresh):
   only the value arrays are recomputed and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;   /* same A->data viewed through the SBAIJ layout */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the value arrays of both factors */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* First call: allocate space for the upper triangular matrix structure */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix: row i becomes [diag | off-diagonals] */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: both factors store 1/d_i there */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negated for the upper factor, additionally
               scaled by 1/d_i for the (transpose-solved) lower factor */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix: copy structure and values to the device */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the (transpose-solved) lower triangular factor */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* UPPER is intentional: this factor is stored upper-triangular and
           applied via the transpose operation below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same structure as the upper factor, different values */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        /* both factors uploaded: structure counted twice, values once each */
        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* Numeric refresh: recompute only the value arrays (structure is unchanged) */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
913 
/*
   Upload the ICC factors of A to the GPU and cache the factorization
   permutation (and its inverse) on the device for the permuted solve path.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip       = a->row;
  PetscBool                    identity;
  PetscInt                     n        = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  /* the strictly triangular part is counted for both factors, the diagonal once */
  factors->nnz = (a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* for a non-identity ordering, keep the permutation and its inverse on the device */
  ierr = ISIdentity(ip,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(rip,rip+n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(irip,irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
951 
/*
   Numeric Cholesky/ICC factorization for SEQAIJCUSPARSE: runs the host
   (SeqAIJ) numeric factorization, selects the appropriate GPU solve
   kernels, then uploads the factors and performs the solve analysis.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization runs on the host, so pull A down first */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* choose solve kernels; natural-ordering variants avoid vector permutation */
  ierr = ISIdentity(ip,&natural);CHKERRQ(ierr);
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;
  if (!natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* upload the triangular factors and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
981 
/*
   Build, on the GPU, the transposes (CSC forms) of the cached lower and
   upper triangular factors and run the cusparse solve analysis on them,
   so MatSolveTranspose can be performed without transposing on the fly.
   Fill mode is flipped relative to each source factor since transposition
   swaps upper and lower triangles.

   Fix: the closing log calls for MAT_CUSPARSEGenerateTranspose were
   PetscLogEventBegin instead of PetscLogEventEnd (twice), leaving the
   event begun two extra times and never ended.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     transposing swaps the triangle, so the fill mode is flipped */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC;
     the csr2csc work buffer is kept on the source (non-transpose) struct */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was PetscLogEventBegin: unbalanced event */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor (fill mode flipped) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was PetscLogEventBegin: unbalanced event */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1183 
/* Unary functor converting a PetscScalar into a PetscInt by truncating its
   real part. Used with thrust::transform to turn the scalar-valued output of
   a csr2csc pass into an integer permutation array (see the csr2csc_i
   construction in MatSeqAIJCUSPARSEFormExplicitTransposeForMult). */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar val)
  {
    const PetscReal re = PetscRealPart(val);
    return static_cast<PetscInt>(re);
  }
};
1192 
/* MatSeqAIJCUSPARSEFormExplicitTransposeForMult - build (or refresh) an explicit
   device-side transpose of A so MatMultTranspose-type operations can run a plain
   SpMV on A^T.

   Flow:
     - early return when the feature is disabled (A->form_explicit_transpose),
       the matrix has zero rows or columns, or the cached transpose is current
       (A->transupdated);
     - on first use, allocate the transpose Mat_SeqAIJCUSPARSEMultStruct:
       descriptor, device-resident alpha/beta constants, and (for CSR) the
       transposed CSR arrays;
     - for CSR, additionally cache csr2csc_i, the permutation mapping A's values
       array onto A^T's values array, so later refreshes are a single
       permutation-gather (thrust::copy at the bottom) instead of a full csr2csc.

   Input:  A - a SEQAIJCUSPARSE matrix (values assumed current on the GPU after
               MatSeqAIJCUSPARSECopyToGPU below).
   Output: cusparsestruct->matTranspose populated/updated; A->transupdated set. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is still valid */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* ELL/HYB cannot be incrementally refreshed: throw away any stale transpose */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalar constants required by cuSPARSE
       when the pointer mode is device */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* A^T is cmap->n x rmap->n with the same number of nonzeros as A */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* upload A's (uncompressed) row offsets; needed as csr2csc input below */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
      /* NOTE(review): unlike the identical assign further down, this path does not
         call PetscLogCpuToGpu() for the transfer - confirm whether that is intended */

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* pre-CUDA-11 path: HYB has no direct transpose, so round-trip
         HYB -> CSR -> CSC(=transpose) -> HYB through two temporaries */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT keeps rmap x cmap dimensions rather than the swapped
         cmap x rmap of a transpose, and tempT->row_offsets is sized rmap->n+1;
         this is only consistent for square matrices - verify for rectangular A */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* One-time construction of the value permutation: run csr2csc on a scalar
         array holding 0,1,2,... so matrixT->values receives each source position
         in its transposed slot, then truncate back to integers (csr2csc_i). */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11+ csr2cscEx2 requires an explicit workspace buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* refresh A^T's values from A's current values via the cached permutation
       (matrixT->values[k] = matrix->values[csr2csc_i[k]]) */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1401 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* MatSolveTranspose_SeqAIJCUSPARSE - GPU triangular solve of A^T x = b using the
   cached transposed factor structures, with row/column permutations applied
   (non-natural ordering path).

   Call order (reversed relative to MatSolve: U before L, since transposing
   A = L*U gives A^T = U^T * L^T):
     1. gather b through rpermIndices into x,
     2. solve with the transposed upper factor (x -> tempGPU),
     3. solve with the transposed lower factor (tempGPU -> x),
     4. gather x through cpermIndices into tempGPU, then copy back to x
        (two steps because the permutation cannot be applied in place).
   NOTE(review): the permutation convention above is inferred from the call
   sequence; confirm against the factorization setup code. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (lazy: only on
     the first transpose solve after factorization) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the (transposed) upper factor: reads x, writes tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve with the (transposed) lower factor: reads tempGPU, writes x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus one division-only row per factor */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1486 
/* MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - GPU triangular solve of
   A^T x = b when the factorization used natural ordering, so no row/column
   permutations are needed. Solves with the transposed upper factor first
   (b -> tempGPU), then the transposed lower factor (tempGPU -> x); order is
   reversed relative to the non-transposed solve since A^T = U^T * L^T. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (lazy: only on
     the first transpose solve after factorization) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the (transposed) upper factor: reads b, writes tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve with the (transposed) lower factor: reads tempGPU, writes x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus one division-only row per factor */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1552 
/* MatSolve_SeqAIJCUSPARSE - GPU triangular solve of A x = b using the cached
   L/U factor structures, with row/column permutations applied (non-natural
   ordering path). Standard forward/backward substitution:
     1. gather b through rpermIndices into the work vector,
     2. solve L (work -> x),
     3. solve U (x -> work),
     4. gather the work vector through cpermIndices into x.
   NOTE(review): the permutation convention above is inferred from the call
   sequence; confirm against the factorization setup code. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: temp[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: reads tempGPU, writes x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: reads x, writes tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x[i] = temp[cperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus one division-only row per factor */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1625 
/* MatSolve_SeqAIJCUSPARSE_NaturalOrdering - GPU triangular solve of A x = b
   when the factorization used natural ordering: no permutations, just
   forward substitution with L (b -> tempGPU) followed by backward
   substitution with U (tempGPU -> x). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: reads b, writes tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U: reads tempGPU, writes x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus one division-only row per factor */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1683 
/* MatSeqAIJCUSPARSECopyFromGPU - download the CSR values array of the device
   matrix into the host-side Mat_SeqAIJ storage.

   Only the values are transferred (a->nz scalars); row offsets and column
   indices are not touched. On success the offload mask becomes
   PETSC_OFFLOAD_BOTH. No-op unless the GPU currently holds the authoritative
   copy (PETSC_OFFLOAD_GPU). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1704 
/* MatSeqAIJGetArray_SeqAIJCUSPARSE - hand out the host values array for
   read/write access. Pulls the values down from the GPU first, then marks the
   host copy as the authoritative one (PETSC_OFFLOAD_CPU), since the caller may
   modify the array. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU; /* device copy may become stale after caller writes */
  *array = aij->a;
  PetscFunctionReturn(0);
}
1716 
/* Copy (or rebuild) the device representation of the matrix from the host
   CSR arrays.

   Two paths:
   - if the nonzero pattern is unchanged (same nonzerostate) and the storage
     format is CSR, only the values are re-uploaded and the cached transpose
     has just its values invalidated;
   - otherwise the full device structure (matrix descriptor, device-resident
     scalar constants, CSR/ELL/HYB storage, compressed-row index list) is
     destroyed and rebuilt from the host arrays.
   On success the offload mask becomes PETSC_OFFLOAD_BOTH, unless the host has
   no values array (a->a == NULL), in which case only the pattern was
   uploaded and the mask is left as is. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed: a cached transpose now holds stale values (its pattern is still valid) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern (or storage format) changed: discard all old device structures */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed row storage only the nonempty rows are represented;
           ridx maps the compressed row numbers back to global row numbers */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values yet: upload the pattern only and do not mark PETSC_OFFLOAD_BOTH below */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident copies of the constants 1 and 0, required because the
           handle is put in CUSPARSE_POINTER_MODE_DEVICE just below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary device CSR image, convert it to ELL/HYB, then
             discard the temporary: only the cusparseHybMat_t is kept */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* log all host-to-device traffic of this rebuild: row offsets + column
           indices (int), compressed row indices (PetscInt), 3 constants + values */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1873 
/* Accumulation functor for thrust::for_each over zipped iterators:
   given a tuple (src,dst), performs dst += src.  Usable on host and device. */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1883 
/* Copy functor for thrust::for_each over zipped iterators:
   given a tuple (src,dst), performs dst = src.  Usable on host and device. */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
1893 
/* Reverse copy functor for thrust::for_each over zipped iterators:
   given a tuple (dst,src), performs dst = src — i.e. the assignment runs in
   the opposite direction of VecCUDAEquals.  Usable on host and device. */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__
  void operator()(T t) { thrust::get<0>(t) = thrust::get<1>(t); }
};
1903 
/* Scratch data attached to C->product->data for matrix-matrix products that
   involve a MATSEQAIJCUSPARSE matrix; freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;     /* caller requested C as MATSEQDENSE: convert it back from MATSEQDENSECUDA after the numeric phase */
  PetscScalar           *Bt;          /* device buffer for an explicit B^T (pre CUDA-11 csrmm cannot transpose B) */
  Mat                   X;            /* intermediate dense product used by the PtAP/RARt paths */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;        /* flop count logged during the numeric phase */
  CsrMatrix             *Bcsr;        /* replacement CSR image of B when B is kept in compressed row storage */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;  /* sparse descriptor for Bcsr (generic API) */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;    /* dense descriptor for B used by cusparseSpMM */
  cusparseDnMatDescr_t  matCDescr;    /* dense descriptor for C (or the intermediate X) used by cusparseSpMM */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* current size of mmBuffer */
  void                  *mmBuffer;    /* work buffer for cusparseSpMM / cusparseSpGEMM_compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* SpGEMM descriptor passed to cusparseSpGEMM_compute/copy */
#endif
};
1923 
/* Destructor for the MatMatCusparse scratch data stored in C->product->data.
   Releases the cusparse descriptors and work buffers (CUDA >= 11 only), the
   B^T buffer, the CSR copy of B, the intermediate matrix X, and finally the
   structure itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cuerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t cstat;
 #endif

  PetscFunctionBegin;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors and work buffers only exist with the CUDA-11 generic API */
  if (mm->matSpBDescr) { cstat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(cstat); }
  if (mm->matBDescr)   { cstat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(cstat); }
  if (mm->matCDescr)   { cstat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(cstat); }
  if (mm->spgemmDesc)  { cstat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(cstat); }
  if (mm->mmBuffer)    { cuerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cuerr); }
  if (mm->mmBuffer2)   { cuerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cuerr); }
 #endif
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr); /* cudaFree(NULL) is a no-op */
  delete mm->Bcsr;
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1948 
1949 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1950 
/* Numeric phase for products C = op(A)*op(B) with A MATSEQAIJCUSPARSE and B
   dense.  The sparse-times-dense multiply runs on the GPU via cusparseSpMM
   (CUDA >= 11) or csrmm (older toolkits).  For PtAP/RARt the GPU multiply
   produces the intermediate mmdata->X, and the remaining dense-dense multiply
   is delegated to MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* choose op(A) and the m x n dimensions of the GPU-side product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either multiply by A with the transpose flag, or by the cached explicit transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt the GPU multiply writes into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds the current one */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt (allocated in the symbolic phase) */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt by multiplying B against the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo any type conversions performed above so the caller sees the types it passed in */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2123 
/* Symbolic phase for products of a MATSEQAIJCUSPARSE matrix A with a dense
   matrix B: sets the sizes and (CUDA) type of C, allocates the MatMatCusparse
   scratch data (the B^T buffer needed before CUDA-11 and the intermediate
   dense matrix X for PtAP/RARt), and installs the numeric-phase callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the result C for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2197 
/* Numeric phase for sparse-sparse products C = A*B (AtB/ABt are mapped to the
   cached explicit transposes) with A, B, C all MATSEQAIJCUSPARSE.  The
   sparsity pattern of C was fixed by the symbolic phase; here only the values
   are (re)computed on the GPU, after which C is marked assembled and
   GPU-resident. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  /* empty product: no numerical work, just finish the assembly bookkeeping */
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands allow using the non-transposed matrix in place of its transpose */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* the freshly computed values live only on the GPU */
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2312 
/* Symbolic phase of C = op(A)*op(B) for two MATSEQAIJCUSPARSE matrices (AB, AtB, ABt).

   Determines the sparsity pattern of C on the GPU (via cusparseSpGEMM on CUDA >= 11,
   or cusparseXcsrgemmNnz + csrgemm on older toolkits), allocates C's CSR storage on
   both device and host, and copies the row offsets / column indices back to the host
   so that C looks like a regular (not yet assembled) SeqAIJ matrix to the CPU code.
   The numeric phase is installed as C->ops->productnumeric at the end.

   Notes grounded in the code below:
   - AtB/ABt are implemented by multiplying with an explicitly formed transpose
     (MatSeqAIJCUSPARSEFormExplicitTransposeForMult), never with a transposed op.
   - If A uses compressed rows, C inherits A's compressed-row structure; if B uses
     compressed rows, a temporary full-row-offset wrapper around B's CSR is built.
   - Buffers created here (mmdata->mmBuffer, mmBuffer2, spgemmDesc, Bcsr, ...) are
     stashed in the product data and released by MatDestroy_MatMatCusparse. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* only CSR-format SEQAIJCUSPARSE operands are supported */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* for symmetric operands the transpose product reduces to the plain product */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* select operand mult structs and result dimensions (m x n, inner dim k) */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C inherits A's nonzero-row index set; the GPU product only produces the compressed rows */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used by the SpMV/SpGEMM calls (pointer mode DEVICE) */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: zero row offsets, empty index/value arrays, and skip the GPU SpGEMM entirely */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR wrapper that shares B's column indices/values but uses
       full (uncompressed) row offsets, uploaded from the host once and cached */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* 2 flops per (A entry, matching B-row entry) pair */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11: generic SpGEMM API. Sequence: workEstimation (size query + fill),
     compute (size query + actual compute), SpMatGetSize, then copy into C's arrays. */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* legacy (pre-11.0) csrgemm path: nnz count with host pointer mode, then full
     numeric gemm (symbolic-only is not available in this API) */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU pattern into the host SeqAIJ structures so C behaves like a
     preallocated (unassembled) SeqAIJ matrix from here on */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not present in the compressed set get the running offset (empty rows) */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row statistics from the (now full) host row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2636 
2637 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2638 
2639 /* handles sparse or dense B */
/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      denseB = PETSC_FALSE,cuspB = PETSC_FALSE,cuspC = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* Classify the operands: is B dense? is B a GPU sparse matrix usable on device
     (i.e. neither operand is bound to the CPU)? For triple products, is C on GPU? */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&denseB);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&cuspB);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    cuspC = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&cuspC);CHKERRQ(ierr);
    }
  }
  if (denseB) {
    /* sparse(A) x dense(B) */
    if (product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (product->type == MATPRODUCT_AB  || product->type == MATPRODUCT_AtB ||
               product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_PtAP ||
               product->type == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) { /* A must stay on host: defer to the CPU AIJxDense path */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    }
    /* any other product type: leave the symbolic op unset, as before */
  } else if (cuspB && cuspC) {
    /* sparse(A) x sparse(B), all operands available on the GPU */
    if (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (product->type == MATPRODUCT_PtAP || product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2697 
/* yy = A*xx: plain multiply, implemented by the shared add-kernel with no y term,
   no transpose, no conjugation */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2706 
/* zz = A*xx + yy: multiply-add via the shared kernel (no transpose, no conjugation) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2715 
/* yy = A^H * xx: conjugate-transpose multiply (trans and herm flags both set) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2724 
/* zz = A^H * xx + yy: conjugate-transpose multiply-add via the shared kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2733 
/* yy = A^T * xx: transpose multiply (no conjugation) via the shared kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2742 
/* y[idx[i]] += x[i] for i in [0,n): scatter-add a compressed work vector into the
   full-length result. Launched 1-D with one thread per entry ((n+255)/256 blocks of
   256 threads at the call site). idx entries are assumed distinct within a launch
   (compressed-row indices), so no atomics are needed. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* widen before multiplying: blockIdx.x*blockDim.x is unsigned 32-bit arithmetic and
     can wrap for very large n when PetscInt is 64-bit; cast makes the product safe */
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2748 
2749 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2750 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2751 {
2752   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2753   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2754   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2755   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2756   PetscErrorCode               ierr;
2757   cusparseStatus_t             stat;
2758   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2759   PetscBool                    compressed;
2760 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2761   PetscInt                     nx,ny;
2762 #endif
2763 
2764   PetscFunctionBegin;
2765   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
2766   if (!a->nonzerorowcnt) {
2767     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2768     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2769     PetscFunctionReturn(0);
2770   }
2771   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2772   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2773   if (!trans) {
2774     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2775     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2776   } else {
2777     if (herm || !A->form_explicit_transpose) {
2778       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2779       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2780     } else {
2781       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2782       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2783     }
2784   }
2785   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2786   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2787 
2788   try {
2789     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2790     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2791     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2792 
2793     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2794     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2795       /* z = A x + beta y.
2796          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2797          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2798       */
2799       xptr = xarray;
2800       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2801       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2802      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2803       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2804           allocated to accommodate different uses. So we get the length info directly from mat.
2805        */
2806       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2807         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2808         nx = mat->num_cols;
2809         ny = mat->num_rows;
2810       }
2811      #endif
2812     } else {
2813       /* z = A^T x + beta y
2814          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2815          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2816        */
2817       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2818       dptr = zarray;
2819       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2820       if (compressed) { /* Scatter x to work vector */
2821         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2822         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2823                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2824                          VecCUDAEqualsReverse());
2825       }
2826      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2827       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2828         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2829         nx = mat->num_rows;
2830         ny = mat->num_cols;
2831       }
2832      #endif
2833     }
2834 
2835     /* csr_spmv does y = alpha op(A) x + beta y */
2836     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2837      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2838       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2839       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2840         cudaError_t cerr;
2841         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2842         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2843         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2844                                 matstruct->matDescr,
2845                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2846                                 matstruct->cuSpMV[opA].vecYDescr,
2847                                 cusparse_scalartype,
2848                                 cusparsestruct->spmvAlg,
2849                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2850         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2851 
2852         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2853       } else {
2854         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2855         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2856         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2857       }
2858 
2859       stat = cusparseSpMV(cusparsestruct->handle, opA,
2860                                matstruct->alpha_one,
2861                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2862                                matstruct->cuSpMV[opA].vecXDescr,
2863                                beta,
2864                                matstruct->cuSpMV[opA].vecYDescr,
2865                                cusparse_scalartype,
2866                                cusparsestruct->spmvAlg,
2867                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2868      #else
2869       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2870       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2871                                mat->num_rows, mat->num_cols,
2872                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2873                                mat->values->data().get(), mat->row_offsets->data().get(),
2874                                mat->column_indices->data().get(), xptr, beta,
2875                                dptr);CHKERRCUSPARSE(stat);
2876      #endif
2877     } else {
2878       if (cusparsestruct->nrows) {
2879        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2880         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2881        #else
2882         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2883         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2884                                  matstruct->alpha_one, matstruct->descr, hybMat,
2885                                  xptr, beta,
2886                                  dptr);CHKERRCUSPARSE(stat);
2887        #endif
2888       }
2889     }
2890     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2891 
2892     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2893       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2894         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2895           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2896         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2897           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2898         }
2899       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2900         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2901       }
2902 
2903       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2904       if (compressed) {
2905         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous, but failed. thrust::async::for_each() returns an event (internally registered),
           and in the destructor of the scope it calls cudaStreamSynchronize() on this stream. One would have to store all the events
           to prevent that. So I just added a ScatterAdd kernel instead.
         */
2910        #if 0
2911         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2912         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2913                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2914                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2915                          VecCUDAPlusEquals());
2916        #else
2917         PetscInt n = matstruct->cprowIndices->size();
2918         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2919        #endif
2920         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2921       }
2922     } else {
2923       if (yy && yy != zz) {
2924         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2925       }
2926     }
2927     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2928     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2929     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2930   } catch(char *ex) {
2931     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2932   }
2933   if (yy) {
2934     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2935   } else {
2936     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2937   }
2938   PetscFunctionReturn(0);
2939 }
2940 
/* zz = A^T*xx + yy on the GPU, dispatched through the shared mult-add kernel.
   The two flags select the transposed, non-Hermitian product (presumably (trans,herm);
   see MatMultAddKernel_SeqAIJCUSPARSE for the exact flag semantics). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2949 
/* Finish assembly of a SeqAIJCUSPARSE matrix.  Delegates to the host SeqAIJ
   assembly (which does very little when assembly happened on the GPU), then,
   for final assembly of a non-factored, GPU-bound matrix that has a device-side
   split-CSR descriptor, marks the up-to-date copy as residing on the GPU. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* does very little if assembled on GPU */
  /* nothing more to do for a flush-assembly or a CPU-bound matrix */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && d_mat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2966 
2967 /* --------------------------------------------------------------------------------*/
2968 /*@
2969    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2971    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2972    assembly performance the user should preallocate the matrix storage by setting
2973    the parameter nz (or the array nnz).  By setting these parameters accurately,
2974    performance during matrix assembly can be increased by more than a factor of 50.
2975 
2976    Collective
2977 
2978    Input Parameters:
2979 +  comm - MPI communicator, set to PETSC_COMM_SELF
2980 .  m - number of rows
2981 .  n - number of columns
2982 .  nz - number of nonzeros per row (same for all rows)
2983 -  nnz - array containing the number of nonzeros in the various rows
2984          (possibly different for each row) or NULL
2985 
2986    Output Parameter:
2987 .  A - the matrix
2988 
2989    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2991    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2992 
2993    Notes:
2994    If nnz is given then nz is ignored
2995 
2996    The AIJ format (also called the Yale sparse matrix format or
2997    compressed row storage), is fully compatible with standard Fortran 77
2998    storage.  That is, the stored row and column indices can begin at
2999    either one (as in Fortran) or zero.  See the users' manual for details.
3000 
3001    Specify the preallocated storage with either nz or nnz (not both).
3002    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3003    allocation.  For large problems you MUST preallocate memory or you
3004    will get TERRIBLE performance, see the users' manual chapter on matrices.
3005 
3006    By default, this format uses inodes (identical nodes) when possible, to
3007    improve numerical efficiency of matrix-vector products and solves. We
3008    search for consecutive rows with the same nonzero structure, thereby
3009    reusing matrix information to achieve increased efficiency.
3010 
3011    Level: intermediate
3012 
3013 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3014 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate on the host side; data is copied to the GPU lazily on first use */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3026 
/* Destroy a SeqAIJCUSPARSE matrix: release the GPU-side structures (mult structs
   or triangular factors), free the device split-CSR descriptor if one was created,
   remove all composed methods, and finally destroy the base SeqAIJ matrix. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* save the device descriptor pointer before destroying the spptr that records it */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    /* copy the descriptor back to the host so its embedded device pointers can be freed */
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* NOTE(review): diag.i is freed only on the compressed-row path — presumably it is
         allocated separately only in that case; confirm against the descriptor setup code */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* clear all composed methods before falling through to the base SeqAIJ destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3062 
3063 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3064 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by copying the host SeqAIJ representation, then converting the copy
   in place to AIJCUSPARSE so it picks up the GPU operations and structures. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3074 
/* Y = a*X + Y on the GPU.  Strategy depends on the nonzero-pattern relationship:
   - different implementations on X and Y            -> fall back to the host SeqAIJ AXPY
   - identical patterns (given or detected)          -> a single cublas axpy on the value arrays
   - X's pattern a subset of Y's                     -> cusparse csrgeam (sparse matrix add)
   - otherwise                                       -> host SeqAIJ AXPY (pattern changes) */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed CPU/GPU pair: the result pattern may change, so drop Y's cached transpose */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: equal nnz plus identical row offsets
     and column indices means the patterns are in fact the same */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via csrgeam, writing the result back into Y's own arrays */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a,b live on the host for this call */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace query + allocation */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the library default pointer mode */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical pattern: the nonzero values align, so a dense axpy on the value arrays suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* pattern of Y may change: invalidate the cached transpose and do the add on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3172 
/* Y = a*Y on the GPU: scale the CSR value array in place with a single cublas scal. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *varray;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   ione = 1, n = 1;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&n);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&varray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,n,&a,varray,ione);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&varray);CHKERRQ(ierr);
  /* scaling changes the values, so any cached diagonal/factorization data is stale */
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3194 
/* Zero all stored values of the matrix.  Zeros the GPU CSR copies (matrix and,
   if present, its cached transpose) as well as the host array; the offload mask
   records whether both copies are valid afterwards. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        both = PETSC_TRUE; /* device copy is zeroed too, so both copies stay valid */
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* zero the host values directly instead of going through MatZeroEntries_SeqAIJ */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = both ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3226 
/* Bind the matrix to the CPU (flg=PETSC_TRUE) or to the GPU (flg=PETSC_FALSE):
   swaps the function table between the SeqAIJ and SeqAIJCUSPARSE implementations
   and (un)registers the matching composed methods.  The two branches must stay
   in one-to-one correspondence.  Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* pull the values back to the host so the CPU kernels operate on current data */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* remove GPU-specific composed methods while CPU-bound */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* register the GPU implementations of the composed methods */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inodes are a CPU-side optimization; disable them on the GPU */
  PetscFunctionReturn(0);
}
3277 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE.  For MAT_INITIAL_MATRIX/MAT_REUSE_MATRIX the
   data is duplicated/copied first; for MAT_INPLACE_MATRIX the input matrix is upgraded in
   place.  Creates the cusparse handle and default algorithm settings on first conversion,
   installs the GPU function table, and composes the type-specific methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created to match this matrix should live on the GPU too */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* plain matrix: allocate the mult-side GPU struct with default settings */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrix: allocate the triangular-factors GPU struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the remaining GPU ops and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3331 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix,
   then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3341 
3342 /*MC
3343    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3344 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3346    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3347    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3348 
3349    Options Database Keys:
3350 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3351 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3352 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3353 
3354   Level: beginner
3355 
3356 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3357 M*/
3358 
3359 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3360 
/* Register the cusparse solver packages: the banded LU solver for plain SEQAIJ
   matrices, and the standard cusparse factorizations (LU, Cholesky, ILU, ICC)
   for SEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3374 
/* Free everything owned by a Mat_SeqAIJCUSPARSE struct: the GPU mult structs
   (matrix and cached transpose), the thrust work arrays, the cusparse handle,
   and finally the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* thrust vectors: delete handles NULL safely */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); /* zeroes *cusparsestruct */
  }
  PetscFunctionReturn(0);
}
3394 
/* Delete the three thrust device vectors backing a CsrMatrix, then the struct
   itself, and null the caller's pointer.  Safe to call on a NULL entry. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3407 
/* Free one triangular-factor struct: its cusparse descriptor and analysis info,
   the CSR matrix, and the various solve/conversion buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* pinned host buffer */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3427 
/* Free one mult struct: the stored matrix (CSR or, pre-CUDA-11, HYB), the matrix
   descriptor, the compressed-row index vector, the device-resident scalar
   constants, and (CUDA >= 11) the generic SpMV descriptors and buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code; it was previously discarded */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident 1/0 constants used with CUSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3471 
/* Release all factorization data held in a TriFactors struct (the four triangular
   factors, permutation index vectors, work vector, and band-solver arrays) but keep
   the struct and its cusparse handle alive so it can be refilled by a refactorization. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* null the deleted pointers so a later Reset/Destroy does not double-free */
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3494 
/* Fully destroy a TriFactors struct: reset its contents, destroy the cusparse
   handle, and free the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* explicit assignment instead of the former assignment-inside-if, which
       triggers -Wparentheses warnings and obscures intent */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3511 
/* Lexicographic "less than" on (i,j) index pairs, used to sort COO entries by row then column. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
3522 
/* Equality predicate for (i,j) index pairs: true iff both components match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3532 
/* Binary op mapping a pair of indices to 0 when equal and 1 when different;
   used with adjacent_difference to flag where a sorted index sequence changes. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3541 
/* Logical-OR of two integers, returned as a 0/1 PetscInt; combines the
   row-change and column-change flags produced with IJDiff. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3550 
3551 #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - fill (or add to) the matrix values on the GPU,
  using the (i,j) pattern previously given to MatSetPreallocationCOO_SeqAIJCUSPARSE().

  A     - the matrix; must already carry the CUSPARSE struct and CSR storage
  v     - one value per COO entry passed at preallocation, in the caller's original
          order; may be host or device memory (detected with isCudaMem).
          v == NULL with INSERT_VALUES zeroes the stored values; with ADD_VALUES it
          leaves them untouched.
  imode - INSERT_VALUES or ADD_VALUES

  cusp->cooPerm maps sorted-nonzero position -> position in v; cusp->cooPerm_a,
  when non-NULL, maps each sorted COO entry to the unique nonzero it belongs to,
  so duplicated (i,j) entries are summed with reduce_by_key.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device staging buffer used only when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information cached: just run a standard assembly and bail out */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values: zero the matrix for INSERT_VALUES, no-op for ADD_VALUES */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: copy them to the device before permuting/reducing */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: sum the permuted values per unique nonzero into a
         workspace, then add the workspace to the current CSR values */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* no duplicates: add each permuted value directly onto its CSR slot */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* no duplicates: overwrite each CSR slot with its permuted value */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  /* data now lives (only) on the device */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3628 
/* Mark the cached explicit transpose of A as out of date; when destroy is
   PETSC_TRUE also free the transpose struct and the cached csr2csc permutation,
   forcing a full rebuild the next time the transpose is needed. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct) PetscFunctionReturn(0); /* nothing cached, nothing to invalidate */
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
    delete cusparsestruct->csr2csc_i;
    cusparsestruct->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3645 
3646 #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - build the CSR nonzero structure of A
  from n (coo_i[k], coo_j[k]) pairs, doing the sorting/uniquing on the GPU.

  Caches on the Mat_SeqAIJCUSPARSE struct:
  + cooPerm   - permutation from the caller's COO order to sorted CSR order
  - cooPerm_a - for each sorted entry, the index of the unique nonzero it maps to
                (only kept when duplicate (i,j) pairs exist; NULL otherwise)
  These are consumed later by MatSetValuesCOO_SeqAIJCUSPARSE().
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* cached permutation has the wrong length: throw it away and reallocate below */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);          /* device copies of the COO row/col indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);  /* per-row upper bounds, becomes a->i[1..m] */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    /* zip (i,j) into a single key sequence so both arrays sort together */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* cooPerm starts as the identity; sort_by_key drags it along so it records
       where each original COO entry lands in sorted order */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;         /* keep sorted rows (and cols, in w) before unique() compacts them */
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag positions where row or column changes, then prefix-sum the flags:
         cooPerm_a[k] becomes the index of the unique nonzero entry k belongs to */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* binary-search the sorted row indices to get, per row, the offset one past
       its last entry — i.e. the CSR row pointers a->i[1..m] */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host-side CSR arrays of the Mat_SeqAIJ from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* per-row lengths, nonzero-row count, and the maximum row length */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* no entries: fall back to an empty standard preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* nonzero pattern changed: any cached transpose is now invalid */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3745 
/* Return a read-only device pointer to the CSR values of A, copying the matrix
   to the GPU first if needed. Pair with MatSeqAIJCUSPARSERestoreArrayRead(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing out its values */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
3764 
/* Companion to MatSeqAIJCUSPARSEGetArrayRead(): invalidates the caller's
   pointer. No object-state bump since read access cannot change the values. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* prevent accidental use after restore */
  PetscFunctionReturn(0);
}
3774 
/* Return a read-write device pointer to the CSR values of A. Syncs the device
   copy first; since the caller may modify the values, the offload mask is set
   to GPU and the cached transpose is marked stale (structure kept). Pair with
   MatSeqAIJCUSPARSERestoreArray(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* bring the current values to the device before exposing them */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;                             /* device copy becomes the authoritative one */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); /* values may change; pattern does not */
  PetscFunctionReturn(0);
}
3795 
/* Companion to MatSeqAIJCUSPARSEGetArray(): bumps the object state (the caller
   may have changed the values) and invalidates the caller's pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* prevent accidental use after restore */
  PetscFunctionReturn(0);
}
3808 
/* Return a write-only device pointer to the CSR values of A. Unlike the
   read/read-write variants this does NOT copy the matrix to the GPU first —
   the caller is expected to overwrite every value. The offload mask is set to
   GPU and the cached transpose marked stale. Pair with
   MatSeqAIJCUSPARSERestoreArrayWrite(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;                             /* device copy becomes the authoritative one */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); /* values may change; pattern does not */
  PetscFunctionReturn(0);
}
3828 
/* Companion to MatSeqAIJCUSPARSEGetArrayWrite(): bumps the object state (the
   caller has written new values) and invalidates the caller's pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* prevent accidental use after restore */
  PetscFunctionReturn(0);
}
3841 
/* Comparator for (row, col, value, flag) 4-tuples ordering lexicographically
   by (row, col); the value and flag components take no part in the comparison. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;       /* rows differ: row order decides */
    return t1.get<1>() < t2.get<1>();   /* same row: column order decides */
  }
};
3852 
/* Unary functor adding a fixed integer offset to its argument; used to shift
   column indices and row offsets when two matrices are concatenated. */
struct Shift
{
  int _shift; /* offset applied by operator() */

  Shift(int offset) : _shift(offset) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
3864 
/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
3866 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3867 {
3868   PetscErrorCode               ierr;
3869   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3870   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3871   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3872   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3873   PetscInt                     Annz,Bnnz;
3874   cusparseStatus_t             stat;
3875   PetscInt                     i,m,n,zero = 0;
3876   cudaError_t                  cerr;
3877 
3878   PetscFunctionBegin;
3879   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3880   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3881   PetscValidPointer(C,4);
3882   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3883   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3884   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3885   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3886   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3887   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3888   if (reuse == MAT_INITIAL_MATRIX) {
3889     m     = A->rmap->n;
3890     n     = A->cmap->n + B->cmap->n;
3891     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3892     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3893     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3894     c     = (Mat_SeqAIJ*)(*C)->data;
3895     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3896     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3897     Ccsr  = new CsrMatrix;
3898     Cmat->cprowIndices      = NULL;
3899     c->compressedrow.use    = PETSC_FALSE;
3900     c->compressedrow.nrows  = 0;
3901     c->compressedrow.i      = NULL;
3902     c->compressedrow.rindex = NULL;
3903     Ccusp->workVector       = NULL;
3904     Ccusp->nrows    = m;
3905     Ccusp->mat      = Cmat;
3906     Ccusp->mat->mat = Ccsr;
3907     Ccsr->num_rows  = m;
3908     Ccsr->num_cols  = n;
3909     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3910     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3911     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3912     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3913     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3914     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3915     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3916     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3917     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3918     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3919     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3920     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
3921     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
3922     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3923     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3924 
3925     Acsr = (CsrMatrix*)Acusp->mat->mat;
3926     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3927     Annz = (PetscInt)Acsr->column_indices->size();
3928     Bnnz = (PetscInt)Bcsr->column_indices->size();
3929     c->nz = Annz + Bnnz;
3930     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3931     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3932     Ccsr->values = new THRUSTARRAY(c->nz);
3933     Ccsr->num_entries = c->nz;
3934     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3935     if (c->nz) {
3936       auto Acoo = new THRUSTINTARRAY32(Annz);
3937       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3938       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3939       THRUSTINTARRAY32 *Aroff,*Broff;
3940 
3941       if (a->compressedrow.use) { /* need full row offset */
3942         if (!Acusp->rowoffsets_gpu) {
3943           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3944           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3945           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3946         }
3947         Aroff = Acusp->rowoffsets_gpu;
3948       } else Aroff = Acsr->row_offsets;
3949       if (b->compressedrow.use) { /* need full row offset */
3950         if (!Bcusp->rowoffsets_gpu) {
3951           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3952           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3953           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3954         }
3955         Broff = Bcusp->rowoffsets_gpu;
3956       } else Broff = Bcsr->row_offsets;
3957       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3958       stat = cusparseXcsr2coo(Acusp->handle,
3959                               Aroff->data().get(),
3960                               Annz,
3961                               m,
3962                               Acoo->data().get(),
3963                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3964       stat = cusparseXcsr2coo(Bcusp->handle,
3965                               Broff->data().get(),
3966                               Bnnz,
3967                               m,
3968                               Bcoo->data().get(),
3969                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3970       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3971       auto Aperm = thrust::make_constant_iterator(1);
3972       auto Bperm = thrust::make_constant_iterator(0);
3973 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3974       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3975       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
3976 #else
3977       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
3978       auto Bcib = Bcsr->column_indices->begin();
3979       auto Bcie = Bcsr->column_indices->end();
3980       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
3981 #endif
3982       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
3983       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
3984       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
3985       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
3986       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
3987       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
3988       auto p1 = Ccusp->cooPerm->begin();
3989       auto p2 = Ccusp->cooPerm->begin();
3990       thrust::advance(p2,Annz);
3991       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
3992 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
3993       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
3994 #endif
3995       auto cci = thrust::make_counting_iterator(zero);
3996       auto cce = thrust::make_counting_iterator(c->nz);
3997 #if 0 //Errors on SUMMIT cuda 11.1.0
3998       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
3999 #else
4000       auto pred = thrust::identity<int>();
4001       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4002       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4003 #endif
4004       stat = cusparseXcoo2csr(Ccusp->handle,
4005                               Ccoo->data().get(),
4006                               c->nz,
4007                               m,
4008                               Ccsr->row_offsets->data().get(),
4009                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4010       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4011       delete wPerm;
4012       delete Acoo;
4013       delete Bcoo;
4014       delete Ccoo;
4015 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4016       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4017                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4018                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4019                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4020 #endif
4021       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4022         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4023         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4024         CsrMatrix *CcsrT = new CsrMatrix;
4025         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4026         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4027 
4028         (*C)->form_explicit_transpose = PETSC_TRUE;
4029         (*C)->transupdated = PETSC_TRUE;
4030         Ccusp->rowoffsets_gpu = NULL;
4031         CmatT->cprowIndices = NULL;
4032         CmatT->mat = CcsrT;
4033         CcsrT->num_rows = n;
4034         CcsrT->num_cols = m;
4035         CcsrT->num_entries = c->nz;
4036 
4037         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4038         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4039         CcsrT->values = new THRUSTARRAY(c->nz);
4040 
4041         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4042         auto rT = CcsrT->row_offsets->begin();
4043         if (AT) {
4044           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4045           thrust::advance(rT,-1);
4046         }
4047         if (BT) {
4048           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4049           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4050           thrust::copy(titb,tite,rT);
4051         }
4052         auto cT = CcsrT->column_indices->begin();
4053         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4054         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4055         auto vT = CcsrT->values->begin();
4056         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4057         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4058         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4059 
4060         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4061         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4062         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4063         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4064         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4065         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4066         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4067         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4068         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4069 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4070         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4071                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4072                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4073                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4074 #endif
4075         Ccusp->matTranspose = CmatT;
4076       }
4077     }
4078 
4079     c->singlemalloc = PETSC_FALSE;
4080     c->free_a       = PETSC_TRUE;
4081     c->free_ij      = PETSC_TRUE;
4082     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4083     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4084     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4085       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4086       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4087       ii   = *Ccsr->row_offsets;
4088       jj   = *Ccsr->column_indices;
4089       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4090       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4091     } else {
4092       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4093       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4094     }
4095     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4096     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4097     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4098     c->maxnz = c->nz;
4099     c->nonzerorowcnt = 0;
4100     c->rmax = 0;
4101     for (i = 0; i < m; i++) {
4102       const PetscInt nn = c->i[i+1] - c->i[i];
4103       c->ilen[i] = c->imax[i] = nn;
4104       c->nonzerorowcnt += (PetscInt)!!nn;
4105       c->rmax = PetscMax(c->rmax,nn);
4106     }
4107     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4108     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4109     (*C)->nonzerostate++;
4110     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4111     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4112     Ccusp->nonzerostate = (*C)->nonzerostate;
4113     (*C)->preallocated  = PETSC_TRUE;
4114   } else {
4115     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4116     c = (Mat_SeqAIJ*)(*C)->data;
4117     if (c->nz) {
4118       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4119       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4120       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4121       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4122       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4123       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4124       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4125       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4126       Acsr = (CsrMatrix*)Acusp->mat->mat;
4127       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4128       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4129       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4130       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4131       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4132       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4133       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4134       auto pmid = Ccusp->cooPerm->begin();
4135       thrust::advance(pmid,Acsr->num_entries);
4136       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4137       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4138                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4139       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4140                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4141       thrust::for_each(zibait,zieait,VecCUDAEquals());
4142       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4143                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4144       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4145                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4146       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4147       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4148       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4149         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4150         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4151         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4152         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4153         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4154         auto vT = CcsrT->values->begin();
4155         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4156         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4157         (*C)->transupdated = PETSC_TRUE;
4158       }
4159       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4160     }
4161   }
4162   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4163   (*C)->assembled     = PETSC_TRUE;
4164   (*C)->was_assembled = PETSC_FALSE;
4165   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4166   PetscFunctionReturn(0);
4167 }
4168 
/*
  Gathers entries of A's nonzero value array (aij->a on the GPU) into v:
  v[k] = a[idx[k]] for k = 0..n-1 when idx is provided, otherwise the first
  n values are copied contiguously. v may point to either host or device
  memory; the destination kind is detected with isCudaMem() and the copy /
  logging paths are chosen accordingly.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* true when v is device memory; no host staging needed then */
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the index set so the gather can run on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL; /* device staging buffer, only used when v is host memory */
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather on device: dv[k] = dav[widx[k]] via a permutation iterator */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = cudaSuccess;
    if (w) { /* v is on the host: copy the gathered values down */
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    }
    delete w; /* free before the error check so an early return cannot leak the staging buffer */
    CHKERRCUDA(cerr);
  } else {
    /* no index set: contiguous copy of the first n values to host or device v */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the result moved device->host, so log GpuToCpu (not CpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4208